diff --git a/README.md b/README.md index 8d5792d..bf97f72 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,22 @@ eAcs ===== + 基于ac算法实现的快速高效的敏感词匹配,检查,过滤功能, 另外特殊字符不参与敏感词匹配,检查和替换, 替换是会按照原位置保留 Build ----- + $ rebar3 escriptize -> genAcs $ rebar3 compile +Notice +----- + + 编译acsTree.erl 时不要加debug_info 选项 减少编译后的大小和加载后内存占用 + Uses ----- + 敏感词预处理 去除特殊字符和去掉重复的敏感词 (SWordFile 和 OutputDirFile) 可以同名 脚本生成:./genAcs -f/-F SWordFile OutputDirFile 函数调用: genAcs:main(["-f"/"-F", SWordFile, OutputDirFile]) diff --git a/src/eAcs.erl b/src/eAcs.erl index b61bf83..eb4172b 100644 --- a/src/eAcs.erl +++ b/src/eAcs.erl @@ -13,134 +13,123 @@ %% *************************************** matchSw start *************************************************************** -spec matchSw(BinStr :: binary()) -> [{StartIndex :: integer(), EndIndex :: integer(), Pattern :: binary()}]. matchSw(BinStr) -> - doMatchMs(BinStr, 0, _Index = 1, _MatchList = []). + doMatchMs(BinStr, _PtrInx = 0, _Index = 1, _MatchList = []). doMatchMs(<<>>, _, _Index, MatchList) -> MatchList; -doMatchMs(<>, State, Index, MatchList) -> +doMatchMs(<>, PtrInx, Index, MatchList) -> case acsSpw:getSpw(Word) of true -> - doMatchMs(Tail, State, Index, MatchList); + doMatchMs(Tail, PtrInx, Index, MatchList); _ -> - {NewState, NewMatchList} = matchWordMs(Word, State, Index, MatchList), - doMatchMs(Tail, NewState, Index + 1, NewMatchList) + {NewPtrInx, NewMatchList} = matchWordMs(Word, PtrInx, Index, MatchList), + doMatchMs(Tail, NewPtrInx, Index + 1, NewMatchList) end. -matchWordMs(Word, State, Index, MatchList) -> - case acsTree:goto(State) of - undefined -> - case State of +matchWordMs(Word, PtrInx, Index, MatchList) -> + case acsTree:goto(PtrInx) of + {Word, NextPtrInx} -> + NewMatchList = getOutputMs(NextPtrInx, Index, MatchList), + {NextPtrInx, NewMatchList}; + #{Word := NextPtrInx} -> + NewMatchList = getOutputMs(NextPtrInx, Index, MatchList), + {NextPtrInx, NewMatchList}; + _ -> + case PtrInx of 0 -> - {State, MatchList}; - _ -> - {NextState, _} = acsTree:failOut(State), - matchWordMs(Word, NextState, Index, MatchList) - end; - Node -> - case Node of - {Word, NextState} -> - NewMatchList = getOutputMs(NextState, Index, MatchList), - {NextState, NewMatchList}; - #{Word := NextState} -> - NewMatchList = getOutputMs(NextState, Index, MatchList), - {NextState, NewMatchList}; + {PtrInx, MatchList}; _ -> - case State of - 0 -> - {State, MatchList}; - _ -> - {NextState, _} = acsTree:failOut(State), - matchWordMs(Word, NextState, Index, MatchList) + case acsTree:failOut(PtrInx) of + {NextPtrInx, _} -> + matchWordMs(Word, NextPtrInx, Index, MatchList); + NextPtrInx -> + matchWordMs(Word, NextPtrInx, Index, MatchList) end end end. getOutputMs(0, _Index, MatchList) -> MatchList; -getOutputMs(State, Index, MatchList) -> - {FailState, Pattern} = acsTree:failOut(State), - case Pattern of - undefined -> - getOutputMs(FailState, Index, MatchList); - _ -> +getOutputMs(PtrInx, Index, MatchList) -> + case acsTree:failOut(PtrInx) of + 0 -> + MatchList; + {FailPtrInx, Pattern} -> NewMatchList = [{Index - Pattern + 1, Pattern} | MatchList], - getOutputMs(FailState, Index, NewMatchList) + getOutputMs(FailPtrInx, Index, NewMatchList); + FailPtrInx -> + getOutputMs(FailPtrInx, Index, MatchList) end. %% *************************************** matchSw end *************************************************************** %% *************************************** isHasSw start *************************************************************** -spec isHasSw(BinStr :: binary()) -> boolean(). isHasSw(BinStr) -> - doMatchIs(BinStr, 0). + doMatchIs(BinStr, _PtrInx = 0). doMatchIs(<<>>, _) -> false; -doMatchIs(<>, State) -> +doMatchIs(<>, PtrInx) -> case acsSpw:getSpw(Word) of true -> - doMatchIs(Tail, State); + doMatchIs(Tail, PtrInx); _ -> - case matchWordIs(Word, State) of + case matchWordIs(Word, PtrInx) of true -> true; - NewState -> - doMatchIs(Tail, NewState) + NewPtrInx -> + doMatchIs(Tail, NewPtrInx) end end. -matchWordIs(Word, State) -> - case acsTree:goto(State) of - undefined -> - case State of - 0 -> - State; +matchWordIs(Word, PtrInx) -> + case acsTree:goto(PtrInx) of + {Word, NextPtrInx} -> + case getOutputIs(NextPtrInx) of + false -> + NextPtrInx; _ -> - {NextState, _} = acsTree:failOut(State), - matchWordIs(Word, NextState) + true end; - Node -> - case Node of - {Word, NextState} -> - case getOutputIs(NextState) of - false -> - NextState; - _ -> - true - end; - #{Word := NextState} -> - case getOutputIs(NextState) of - false -> - NextState; - _ -> - true - end; + #{Word := NextPtrInx} -> + case getOutputIs(NextPtrInx) of + false -> + NextPtrInx; _ -> - case State of - 0 -> - State; - _ -> - {NextState, _} = acsTree:failOut(State), - matchWordIs(Word, NextState) + true + end; + _ -> + case PtrInx of + 0 -> + PtrInx; + _ -> + case acsTree:failOut(PtrInx) of + {NextPtrInx, _} -> + matchWordIs(Word, NextPtrInx); + NextPtrInx -> + matchWordIs(Word, NextPtrInx) end end end. getOutputIs(0) -> false; -getOutputIs(State) -> - {FailState, Pattern} = acsTree:failOut(State), - case Pattern of - undefined -> - getOutputIs(FailState); - _ -> - true +getOutputIs(PtrInx) -> + case acsTree:failOut(PtrInx) of + 0 -> + false; + {FailPtrInx, _Pattern} -> + true; + FailPtrInx -> + getOutputIs(FailPtrInx) end. + %% *************************************** matchSw end *************************************************************** %% *************************************** replaceSw start ************************************************************* -spec replaceSw(BinStr :: binary()) -> ReBinStr :: binary(). replaceSw(BinStr) -> TotalSize = byte_size(BinStr), - case doMatchRs(BinStr, TotalSize - 1, _Index = 1, _State = 0, _MatchList = []) of + case doMatchRs(BinStr, TotalSize - 1, _Index = 1, _PtrInx = 0, _MatchList = []) of [] -> BinStr; MatchBIMWs -> @@ -150,7 +139,7 @@ replaceSw(BinStr) -> -spec isHasRpSw(BinStr :: binary()) -> {IsHasSw :: boolean(), ReBinStr :: binary()}. isHasRpSw(BinStr) -> TotalSize = byte_size(BinStr), - case doMatchRs(BinStr, TotalSize - 1, _Index = 1, _State = 0, _MatchList = []) of + case doMatchRs(BinStr, TotalSize - 1, _Index = 1, _PtrInx = 0, _MatchList = []) of [] -> {false, BinStr}; MatchBIMWs -> @@ -253,49 +242,42 @@ dealMatchList([{_OldByteIndex, OldMatchWordCnt, OldWordIndex} | LeftMatchList] = dealMatchList(LeftMatchList, CurByteIndex, MatchWordCnt, CurWordIndex) end. -doMatchRs(<<>>, _TotalSize, _CurIndex, _State, MatchList) -> +doMatchRs(<<>>, _TotalSize, _CurIndex, _PtrInx, MatchList) -> MatchList; -doMatchRs(<>, TotalSize, CurIndex, State, MatchList) -> +doMatchRs(<>, TotalSize, CurIndex, PtrInx, MatchList) -> case acsSpw:getSpw(Word) of true -> - doMatchRs(Tail, TotalSize, CurIndex, State, MatchList); + doMatchRs(Tail, TotalSize, CurIndex, PtrInx, MatchList); _ -> - {NewState, MatchCnt} = matchWordRs(Word, State, 0), + {NewPtrInx, MatchCnt} = matchWordRs(Word, PtrInx, 0), case MatchCnt of 0 -> - doMatchRs(Tail, TotalSize, CurIndex + 1, NewState, MatchList); + doMatchRs(Tail, TotalSize, CurIndex + 1, NewPtrInx, MatchList); _ -> LeftSize = byte_size(Tail), NewMatchList = dealMatchList(MatchList, TotalSize - LeftSize, MatchCnt, CurIndex), - doMatchRs(Tail, TotalSize, CurIndex + 1, NewState, NewMatchList) + doMatchRs(Tail, TotalSize, CurIndex + 1, NewPtrInx, NewMatchList) end end. -matchWordRs(Word, State, MatchCnt) -> - case acsTree:goto(State) of - undefined -> - case State of +matchWordRs(Word, PtrInx, MatchCnt) -> + case acsTree:goto(PtrInx) of + {Word, NextPtrInx} -> + NewMatchCnt = getOutputRs(NextPtrInx, MatchCnt), + {NextPtrInx, NewMatchCnt}; + #{Word := NextPtrInx} -> + NewMatchCnt = getOutputRs(NextPtrInx, MatchCnt), + {NextPtrInx, NewMatchCnt}; + _ -> + case PtrInx of 0 -> - {State, MatchCnt}; - _ -> - {NextState, _} = acsTree:failOut(State), - matchWordRs(Word, NextState, MatchCnt) - end; - Node -> - case Node of - {Word, NextState} -> - NewMatchCnt = getOutputRs(NextState, MatchCnt), - {NextState, NewMatchCnt}; - #{Word := NextState} -> - NewMatchCnt = getOutputRs(NextState, MatchCnt), - {NextState, NewMatchCnt}; + {PtrInx, MatchCnt}; _ -> - case State of - 0 -> - {State, MatchCnt}; - _ -> - {NextState, _} = acsTree:failOut(State), - matchWordRs(Word, NextState, MatchCnt) + case acsTree:failOut(PtrInx) of + {NextPtrInx, _} -> + matchWordRs(Word, NextPtrInx, MatchCnt); + NextPtrInx -> + matchWordRs(Word, NextPtrInx, MatchCnt) end end end. @@ -303,19 +285,22 @@ matchWordRs(Word, State, MatchCnt) -> %% 获取当前字符最大匹配数 getOutputRs(0, MatchCnt) -> MatchCnt; -getOutputRs(State, MatchCnt) -> - {FailState, Pattern} = acsTree:failOut(State), - case Pattern of - undefined -> - getOutputRs(FailState, MatchCnt); - _ -> +getOutputRs(PtrInx, MatchCnt) -> + case acsTree:failOut(PtrInx) of + 0 -> + MatchCnt; + {FailPtrInx, Pattern} -> case Pattern > MatchCnt of true -> - getOutputRs(FailState, Pattern); + getOutputRs(FailPtrInx, Pattern); _ -> - getOutputRs(FailState, MatchCnt) - end + getOutputRs(FailPtrInx, MatchCnt) + end; + FailPtrInx -> + getOutputRs(FailPtrInx, MatchCnt) + end. + % *************************************** replaceSw end ************************************************************* strSize(<<>>, Cnt) -> diff --git a/src/genAcs.erl b/src/genAcs.erl index 85fea6b..7af83c6 100644 --- a/src/genAcs.erl +++ b/src/genAcs.erl @@ -12,7 +12,7 @@ main(Args) -> [SWFile, WriteDir] -> case file:open(SWFile, [read, raw, binary, {read_ahead, 65536}, {encoding, utf8}]) of {ok, IoDevice} -> - {Goto, Output} = dealEverySW(IoDevice, _Goto = #{0 => #{}}, _Output = #{}, _State = 0), + {Goto, Output} = dealEverySW(IoDevice, _Goto = #{0 => #{}}, _Output = #{}, _PtrInx = 0), file:close(IoDevice), Fail = genFail(Goto), genSpw(WriteDir), @@ -69,17 +69,23 @@ writeFilter(CurLine, Line, FilterFile, LineMap) -> writeFilter(CurLine + 1, Line, FilterFile, LineMap) end. -dealEverySW(IoDevice, Goto, Output, MaxState) -> +dealEverySW(IoDevice, Goto, Output, MaxPtrInx) -> case file:read_line(IoDevice) of {ok, DataStr} -> - BinStr = binary:part(DataStr, 0, byte_size(DataStr) - 1), + BinStr = + case binary:last(DataStr) of + 10 -> + binary:part(DataStr, 0, byte_size(DataStr) - 1); + _ -> + DataStr + end, case BinStr =/= <<>> of true -> - {NewGoto, NewState, NewMaxState} = addGoto(BinStr, Goto, 0, MaxState), - NewOutput = Output#{NewState => eAcs:strSize(BinStr, 0)}, - dealEverySW(IoDevice, NewGoto, NewOutput, NewMaxState); + {NewGoto, EndPtrInx, NewMaxPtrInx} = addGoto(BinStr, Goto, 0, MaxPtrInx), + NewOutput = Output#{EndPtrInx => eAcs:strSize(BinStr, 0)}, + dealEverySW(IoDevice, NewGoto, NewOutput, NewMaxPtrInx); _ -> - dealEverySW(IoDevice, Goto, Output, MaxState) + dealEverySW(IoDevice, Goto, Output, MaxPtrInx) end; eof -> {Goto, Output}; @@ -90,54 +96,54 @@ dealEverySW(IoDevice, Goto, Output, MaxState) -> %% 从字符串列表构建ac搜索树 genTree(BinStrList) -> %% 先构造 goto and output table - {Goto, Output} = genGotoOutput(BinStrList, _Goto = #{0 => #{}}, _Output = #{}, _State = 0), + {Goto, Output} = genGotoOutput(BinStrList, _Goto = #{0 => #{}}, _Output = #{}, _PtrInx = 0), %% 然后构造 fail table Fail = genFail(Goto), {Goto, Fail, Output}. %% 构造 goto and output table -genGotoOutput([BinStr | Tail], Goto, Output, MaxState) -> +genGotoOutput([BinStr | Tail], Goto, Output, MaxPtrInx) -> case BinStr =/= <<>> of true -> - {NewGoto, NewState, NewMaxState} = addGoto(BinStr, Goto, 0, MaxState), - NewOutput = Output#{NewState => BinStr}, - genGotoOutput(Tail, NewGoto, NewOutput, NewMaxState); + {NewGoto, EndPtrInx, NewMaxPtrInx} = addGoto(BinStr, Goto, 0, MaxPtrInx), + NewOutput = Output#{EndPtrInx => BinStr}, + genGotoOutput(Tail, NewGoto, NewOutput, NewMaxPtrInx); _ -> - genGotoOutput(Tail, Goto, Output, MaxState) + genGotoOutput(Tail, Goto, Output, MaxPtrInx) end; -genGotoOutput([], Goto, Output, _MaxState) -> +genGotoOutput([], Goto, Output, _MaxPtrInx) -> {Goto, Output}. %% 添加Goto 匹配状态转移项 -addGoto(<>, Goto, State, MaxState) -> - #{State := Node} = Goto, +addGoto(<>, Goto, PtrInx, MaxPtrInx) -> + #{PtrInx := Node} = Goto, case Node of - #{Word := NextState} -> - addGoto(Tail, Goto, NextState, MaxState); + #{Word := NextPtrInx} -> + addGoto(Tail, Goto, NextPtrInx, MaxPtrInx); _ -> - NewMaxState = MaxState + 1, - NewNode = Node#{Word => NewMaxState}, - addGoto(Tail, Goto#{NewMaxState => #{}, State => NewNode}, NewMaxState, NewMaxState) + NewMaxPtrInx = MaxPtrInx + 1, + NewNode = Node#{Word => NewMaxPtrInx}, + addGoto(Tail, Goto#{NewMaxPtrInx => #{}, PtrInx => NewNode}, NewMaxPtrInx, NewMaxPtrInx) end; -addGoto(<<>>, Goto, State, MaxState) -> - {Goto, State, MaxState}. +addGoto(<<>>, Goto, PtrInx, MaxPtrInx) -> + {Goto, PtrInx, MaxPtrInx}. %% 添加匹配Fail状态转移项 genFail(#{0 := Node} = Goto) -> genFail(maps:values(Node), Goto, _Fail = #{}). %% 基于bfs搜索构造 Fail -genFail([State | Tail], Goto, Fail) -> - #{State := Node} = Goto, +genFail([PtrInx | Tail], Goto, Fail) -> + #{PtrInx := Node} = Goto, %% 获取父节点的失败节点 - FailState = maps:get(State, Fail, 0), + FatherFailPtrInx = maps:get(PtrInx, Fail, 0), %% 子节点 - Kvs = maps:to_list(Node), + ChildKvs = maps:to_list(Node), %% 为子节点查找失败节点 - NewFail = addFail(Kvs, FailState, Goto, Fail), + NewFail = addFail(ChildKvs, FatherFailPtrInx, Goto, Fail), %% 子节点入队列 NewQueue = Tail ++ maps:values(Node), @@ -146,29 +152,29 @@ genFail([], _Goto, Fail) -> Fail. %% 为节点构造失败指针 -%% @param FailState 是当前节点的失败指针 -addFail([{Word, State} | Tail], FailState, Goto, Fail) -> - NewFail = findFailNode(Word, State, FailState, Goto, Fail), - addFail(Tail, FailState, Goto, NewFail); -addFail([], _FailState, _Goto, Fail) -> +%% @param FatherFailPtrInx 是当前节点的失败指针 +addFail([{Word, PtrInx} | Tail], FatherFailPtrInx, Goto, Fail) -> + NewFail = findFailNode(Word, PtrInx, FatherFailPtrInx, Goto, Fail), + addFail(Tail, FatherFailPtrInx, Goto, NewFail); +addFail([], _FatherFailPtrInx, _Goto, Fail) -> Fail. %% 为某个儿子节点构造失败指针 -findFailNode(Word, State, FailState, Goto, Fail) -> - #{FailState := Node} = Goto, +findFailNode(Word, PtrInx, FatherFailPtrInx, Goto, Fail) -> + #{FatherFailPtrInx := Node} = Goto, case Node of - #{Word := TheFailState} -> + #{Word := MyFailPtrInx} -> %% 找到最近的失败节点的儿子节点拥有当前儿子节点的值,查找成功 - Fail#{State => TheFailState}; + Fail#{PtrInx => MyFailPtrInx}; _ -> - case FailState =:= 0 of + case FatherFailPtrInx =:= 0 of true -> %% 找不到,而且已经到了根节点,查找失败 Fail; _ -> %% 找不到但是还没到根节点,继续往上找 - NewFailState = maps:get(FailState, Fail, 0), - findFailNode(Word, State, NewFailState, Goto, Fail) + NewFatherFailPtrInx = maps:get(FatherFailPtrInx, Fail, 0), + findFailNode(Word, PtrInx, NewFatherFailPtrInx, Goto, Fail) end end. @@ -206,25 +212,36 @@ doGenGoto([{K, V} | SortKvs], StrAcc) -> end. genFailOut([], _Fail, _Output, StrAcc) -> - < {0, undefined}.">>; -genFailOut([State], Fail, Output, StrAcc) -> - FailState = maps:get(State, Fail, 0), - Pattern = maps:get(State, Output, undefined), - case FailState /= 0 orelse Pattern /= undefined of + < 0.">>; +genFailOut([PtrInx], Fail, Output, StrAcc) -> + FailPtrInx = maps:get(PtrInx, Fail, 0), + Pattern = maps:get(PtrInx, Output, undefined), + case FailPtrInx /= 0 orelse Pattern /= undefined of true -> - < ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailState, Pattern}])))/binary, ";\nfailOut(_) -> {0, undefined}.">>; + case Pattern of + undefined -> + < ", (iolist_to_binary(io_lib:format(<<"~w">>, [FailPtrInx])))/binary, ";\nfailOut(_) -> 0.">>; + _ -> + < ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailPtrInx, Pattern}])))/binary, ";\nfailOut(_) -> 0.">> + end; _ -> - < {0, undefined}.">> + < 0.">> end; -genFailOut([State | SortStates], Fail, Output, StrAcc) -> - FailState = maps:get(State, Fail, 0), - Pattern = maps:get(State, Output, undefined), - case FailState /= 0 orelse Pattern /= undefined of +genFailOut([PtrInx | SortPtrInxes], Fail, Output, StrAcc) -> + FailPtrInx = maps:get(PtrInx, Fail, 0), + Pattern = maps:get(PtrInx, Output, undefined), + case FailPtrInx /= 0 orelse Pattern /= undefined of true -> - NewStrAcc = < ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailState, Pattern}])))/binary, ";\n">>, - genFailOut(SortStates, Fail, Output, NewStrAcc); + NewStrAcc = + case Pattern of + undefined -> + < ", (iolist_to_binary(io_lib:format(<<"~w">>, [FailPtrInx])))/binary, ";\n">>; + _ -> + < ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailPtrInx, Pattern}])))/binary, ";\n">> + end, + genFailOut(SortPtrInxes, Fail, Output, NewStrAcc); _ -> - genFailOut(SortStates, Fail, Output, StrAcc) + genFailOut(SortPtrInxes, Fail, Output, StrAcc) end. -spec load(Module :: atom(), Export :: [{Fun :: atom(), Arity :: pos_integer()}], Str :: string()) -> {module, Module :: atom()} | {error, _}.