From 5c97b1ee9a475ac7e0ca850729479e3b1f794580 Mon Sep 17 00:00:00 2001 From: SisMaker <1713699517@qq.com> Date: Sun, 25 Apr 2021 19:43:05 +0800 Subject: [PATCH] =?UTF-8?q?ft:=20=E4=BB=A3=E7=A0=81=E8=B0=83=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 ++++---- src/eAcs.erl | 33 +++++++++++++++++---------------- src/genAcs.erl | 8 ++++---- src/test/test.txt | 7 +++---- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 579aaa8..3135e7d 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@ eAcs ===== - 基于ac算法实现的快速高效的erl版本敏感词匹配,检查,过滤代码 + 基于ac算法实现的快速高效的敏感词匹配,检查,过滤功能 Build ----- - $ rebar3 escriptize -> genAcs + $ rebar3 escriptize -> genAcs $ rebar3 compile Uses ----- 创建 acsTree.erl - 脚本生成:./genAcs SWord.txtFile OutputDir 或者函数嗲用 - 函数调用: genAcs:main([SWord.txtFile, OutputDir]) + 脚本生成:./genAcs SWordFile OutputDir + 函数调用: genAcs:main([SWordFile, OutputDir]) 匹配 检查 过滤 敏感词 eAcs:matchSw/1 %% 返回匹配的敏感词列表 eAcs:isHasSw/1 %% 检查是否包含敏感词 diff --git a/src/eAcs.erl b/src/eAcs.erl index d166b76..a212e86 100644 --- a/src/eAcs.erl +++ b/src/eAcs.erl @@ -4,6 +4,7 @@ matchSw/1 %% 返回匹配的敏感词列表 , isHasSw/1 %% 检查是否包含敏感词 , replaceSw/1 %% 替换敏感词 + , strSize/2 %% 获取utf8字符串的长度 ]). %% state 0 is the root node @@ -17,10 +18,10 @@ matchSw(BinStr) -> doMatch(<<>>, _, _Index, MatchList) -> MatchList; doMatch(<>, State, Index, MatchList) -> - {NewState, NewMatchList} = matchInner(Word, State, Index, MatchList), + {NewState, NewMatchList} = matchWord(Word, State, Index, MatchList), doMatch(Tail, NewState, Index + 1, NewMatchList). -matchInner(Word, State, Index, MatchList) -> +matchWord(Word, State, Index, MatchList) -> Node = acsTree:goto(State), case Node of undefined -> @@ -29,7 +30,7 @@ matchInner(Word, State, Index, MatchList) -> {State, MatchList}; _ -> {NextState, _} = acsTree:failOut(State), - matchInner(Word, NextState, Index, MatchList) + matchWord(Word, NextState, Index, MatchList) end; _ -> case Node of @@ -42,7 +43,7 @@ matchInner(Word, State, Index, MatchList) -> {State, MatchList}; _ -> {NextState, _} = acsTree:failOut(State), - matchInner(Word, NextState, Index, MatchList) + matchWord(Word, NextState, Index, MatchList) end end end. @@ -55,7 +56,7 @@ getOutput(State, Index, MatchList) -> undefined -> getOutput(FailState, Index, MatchList); _ -> - NewMatchList = [{Index - patternSize(Pattern, 0) + 1, Index, Pattern} | MatchList], + NewMatchList = [{Index - Pattern + 1, Pattern} | MatchList], getOutput(FailState, Index, NewMatchList) end. @@ -68,14 +69,14 @@ isHasSw(BinStr) -> doMatch(<<>>, _) -> false; doMatch(<>, State) -> - case matchInner(Word, State) of + case matchWord(Word, State) of true -> true; NewState -> doMatch(Tail, NewState) end. -matchInner(Word, State) -> +matchWord(Word, State) -> Node = acsTree:goto(State), case Node of undefined -> @@ -84,7 +85,7 @@ matchInner(Word, State) -> State; _ -> {NextState, _} = acsTree:failOut(State), - matchInner(Word, NextState) + matchWord(Word, NextState) end; _ -> case Node of @@ -101,7 +102,7 @@ matchInner(Word, State) -> State; _ -> {NextState, _} = acsTree:failOut(State), - matchInner(Word, NextState) + matchWord(Word, NextState) end end end. @@ -110,11 +111,11 @@ getOutput(0) -> false; getOutput(State) -> {FailState, Pattern} = acsTree:failOut(State), - case Pattern == undefined orelse FailState == 0 of - true -> - false; + case Pattern of + undefined -> + getOutput(FailState); _ -> - getOutput(FailState) + true end. %% *************************************** matchSw end *************************************************************** %% *************************************** replaceSw start ************************************************************* @@ -122,7 +123,7 @@ replaceSw(_BinStr) -> ok. %% *************************************** replaceSw end ************************************************************* -patternSize(<<>>, Cnt) -> +strSize(<<>>, Cnt) -> Cnt; -patternSize(<<_Word/utf8, Left/binary>>, Cnt) -> - patternSize(Left, Cnt + 1). \ No newline at end of file +strSize(<<_Word/utf8, Left/binary>>, Cnt) -> + strSize(Left, Cnt + 1). \ No newline at end of file diff --git a/src/genAcs.erl b/src/genAcs.erl index c8ec0ce..f65d5d2 100644 --- a/src/genAcs.erl +++ b/src/genAcs.erl @@ -23,7 +23,7 @@ dealEveryLine(IoDevice, Goto, Output, MaxState) -> case BinStr =/= <<>> of true -> {NewGoto, NewState, NewMaxState} = addGoto(BinStr, Goto, 0, MaxState), - NewOutput = Output#{NewState => BinStr}, + NewOutput = Output#{NewState => eAcs:strSize(BinStr, 0)}, dealEveryLine(IoDevice, NewGoto, NewOutput, NewMaxState); _ -> dealEveryLine(IoDevice, Goto, Output, MaxState) @@ -146,15 +146,15 @@ doGenGoto([{K, V} | SortKvs], StrAcc) -> end. genFailOut([], _Fail, _Output, StrAcc) -> - < {0, undefined}.\n\n">>; + < {0, undefined}.">>; genFailOut([State], Fail, Output, StrAcc) -> FailState = maps:get(State, Fail, 0), Pattern = maps:get(State, Output, undefined), case FailState /= 0 orelse Pattern /= undefined of true -> - < ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailState, Pattern}])))/binary, ";\nfailOut(_) -> {0, undefined}.\n\n">>; + < ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailState, Pattern}])))/binary, ";\nfailOut(_) -> {0, undefined}.">>; _ -> - < {0, undefined}.\n\n">> + < {0, undefined}.">> end; genFailOut([State | SortStates], Fail, Output, StrAcc) -> FailState = maps:get(State, Fail, 0), diff --git a/src/test/test.txt b/src/test/test.txt index 86da781..7d2beae 100644 --- a/src/test/test.txt +++ b/src/test/test.txt @@ -1,7 +1,6 @@ -去你妈的 - - -你妈 +abcd +cd +c