Explorar el Código

ft: 代码优化

master
SisMaker hace 3 años
padre
commit
034f6be48f
Se han modificado 3 ficheros con 182 adiciones y 172 borrados
  1. +8
    -0
      README.md
  2. +103
    -118
      src/eAcs.erl
  3. +71
    -54
      src/genAcs.erl

+ 8
- 0
README.md Ver fichero

@ -1,14 +1,22 @@
eAcs
=====
基于ac算法实现的快速高效的敏感词匹配,检查,过滤功能, 另外特殊字符不参与敏感词匹配,检查和替换, 替换时会按照原位置保留
Build
-----
$ rebar3 escriptize -> genAcs
$ rebar3 compile
Notice
-----
编译acsTree.erl 时不要加debug_info 选项 减少编译后的大小和加载后内存占用
Uses
-----
敏感词预处理 去除特殊字符和去掉重复的敏感词 (SWordFile 和 OutputDirFile) 可以同名
脚本生成:./genAcs -f/-F SWordFile OutputDirFile
函数调用: genAcs:main(["-f"/"-F", SWordFile, OutputDirFile])

+ 103
- 118
src/eAcs.erl Ver fichero

@ -13,134 +13,123 @@
%% *************************************** matchSw start ***************************************************************
-spec matchSw(BinStr :: binary()) -> [{StartIndex :: integer(), EndIndex :: integer(), Pattern :: binary()}].
matchSw(BinStr) ->
doMatchMs(BinStr, 0, _Index = 1, _MatchList = []).
doMatchMs(BinStr, _PtrInx = 0, _Index = 1, _MatchList = []).
doMatchMs(<<>>, _, _Index, MatchList) ->
MatchList;
doMatchMs(<<Word/utf8, Tail/binary>>, State, Index, MatchList) ->
doMatchMs(<<Word/utf8, Tail/binary>>, PtrInx, Index, MatchList) ->
case acsSpw:getSpw(Word) of
true ->
doMatchMs(Tail, State, Index, MatchList);
doMatchMs(Tail, PtrInx, Index, MatchList);
_ ->
{NewState, NewMatchList} = matchWordMs(Word, State, Index, MatchList),
doMatchMs(Tail, NewState, Index + 1, NewMatchList)
{NewPtrInx, NewMatchList} = matchWordMs(Word, PtrInx, Index, MatchList),
doMatchMs(Tail, NewPtrInx, Index + 1, NewMatchList)
end.
matchWordMs(Word, State, Index, MatchList) ->
case acsTree:goto(State) of
undefined ->
case State of
matchWordMs(Word, PtrInx, Index, MatchList) ->
case acsTree:goto(PtrInx) of
{Word, NextPtrInx} ->
NewMatchList = getOutputMs(NextPtrInx, Index, MatchList),
{NextPtrInx, NewMatchList};
#{Word := NextPtrInx} ->
NewMatchList = getOutputMs(NextPtrInx, Index, MatchList),
{NextPtrInx, NewMatchList};
_ ->
case PtrInx of
0 ->
{State, MatchList};
_ ->
{NextState, _} = acsTree:failOut(State),
matchWordMs(Word, NextState, Index, MatchList)
end;
Node ->
case Node of
{Word, NextState} ->
NewMatchList = getOutputMs(NextState, Index, MatchList),
{NextState, NewMatchList};
#{Word := NextState} ->
NewMatchList = getOutputMs(NextState, Index, MatchList),
{NextState, NewMatchList};
{PtrInx, MatchList};
_ ->
case State of
0 ->
{State, MatchList};
_ ->
{NextState, _} = acsTree:failOut(State),
matchWordMs(Word, NextState, Index, MatchList)
case acsTree:failOut(PtrInx) of
{NextPtrInx, _} ->
matchWordMs(Word, NextPtrInx, Index, MatchList);
NextPtrInx ->
matchWordMs(Word, NextPtrInx, Index, MatchList)
end
end
end.
getOutputMs(0, _Index, MatchList) ->
MatchList;
getOutputMs(State, Index, MatchList) ->
{FailState, Pattern} = acsTree:failOut(State),
case Pattern of
undefined ->
getOutputMs(FailState, Index, MatchList);
_ ->
getOutputMs(PtrInx, Index, MatchList) ->
case acsTree:failOut(PtrInx) of
0 ->
MatchList;
{FailPtrInx, Pattern} ->
NewMatchList = [{Index - Pattern + 1, Pattern} | MatchList],
getOutputMs(FailState, Index, NewMatchList)
getOutputMs(FailPtrInx, Index, NewMatchList);
FailPtrInx ->
getOutputMs(FailPtrInx, Index, MatchList)
end.
%% *************************************** matchSw end ***************************************************************
%% *************************************** isHasSw start ***************************************************************
-spec isHasSw(BinStr :: binary()) -> boolean().
isHasSw(BinStr) ->
doMatchIs(BinStr, 0).
doMatchIs(BinStr, _PtrInx = 0).
doMatchIs(<<>>, _) ->
false;
doMatchIs(<<Word/utf8, Tail/binary>>, State) ->
doMatchIs(<<Word/utf8, Tail/binary>>, PtrInx) ->
case acsSpw:getSpw(Word) of
true ->
doMatchIs(Tail, State);
doMatchIs(Tail, PtrInx);
_ ->
case matchWordIs(Word, State) of
case matchWordIs(Word, PtrInx) of
true ->
true;
NewState ->
doMatchIs(Tail, NewState)
NewPtrInx ->
doMatchIs(Tail, NewPtrInx)
end
end.
matchWordIs(Word, State) ->
case acsTree:goto(State) of
undefined ->
case State of
0 ->
State;
matchWordIs(Word, PtrInx) ->
case acsTree:goto(PtrInx) of
{Word, NextPtrInx} ->
case getOutputIs(NextPtrInx) of
false ->
NextPtrInx;
_ ->
{NextState, _} = acsTree:failOut(State),
matchWordIs(Word, NextState)
true
end;
Node ->
case Node of
{Word, NextState} ->
case getOutputIs(NextState) of
false ->
NextState;
_ ->
true
end;
#{Word := NextState} ->
case getOutputIs(NextState) of
false ->
NextState;
_ ->
true
end;
#{Word := NextPtrInx} ->
case getOutputIs(NextPtrInx) of
false ->
NextPtrInx;
_ ->
case State of
0 ->
State;
_ ->
{NextState, _} = acsTree:failOut(State),
matchWordIs(Word, NextState)
true
end;
_ ->
case PtrInx of
0 ->
PtrInx;
_ ->
case acsTree:failOut(PtrInx) of
{NextPtrInx, _} ->
matchWordIs(Word, NextPtrInx);
NextPtrInx ->
matchWordIs(Word, NextPtrInx)
end
end
end.
getOutputIs(0) ->
false;
getOutputIs(State) ->
{FailState, Pattern} = acsTree:failOut(State),
case Pattern of
undefined ->
getOutputIs(FailState);
_ ->
true
getOutputIs(PtrInx) ->
case acsTree:failOut(PtrInx) of
0 ->
false;
{FailPtrInx, _Pattern} ->
true;
FailPtrInx ->
getOutputIs(FailPtrInx)
end.
%% *************************************** isHasSw end ***************************************************************
%% *************************************** replaceSw start *************************************************************
-spec replaceSw(BinStr :: binary()) -> ReBinStr :: binary().
replaceSw(BinStr) ->
TotalSize = byte_size(BinStr),
case doMatchRs(BinStr, TotalSize - 1, _Index = 1, _State = 0, _MatchList = []) of
case doMatchRs(BinStr, TotalSize - 1, _Index = 1, _PtrInx = 0, _MatchList = []) of
[] ->
BinStr;
MatchBIMWs ->
@ -150,7 +139,7 @@ replaceSw(BinStr) ->
-spec isHasRpSw(BinStr :: binary()) -> {IsHasSw :: boolean(), ReBinStr :: binary()}.
isHasRpSw(BinStr) ->
TotalSize = byte_size(BinStr),
case doMatchRs(BinStr, TotalSize - 1, _Index = 1, _State = 0, _MatchList = []) of
case doMatchRs(BinStr, TotalSize - 1, _Index = 1, _PtrInx = 0, _MatchList = []) of
[] ->
{false, BinStr};
MatchBIMWs ->
@ -253,49 +242,42 @@ dealMatchList([{_OldByteIndex, OldMatchWordCnt, OldWordIndex} | LeftMatchList] =
dealMatchList(LeftMatchList, CurByteIndex, MatchWordCnt, CurWordIndex)
end.
doMatchRs(<<>>, _TotalSize, _CurIndex, _State, MatchList) ->
doMatchRs(<<>>, _TotalSize, _CurIndex, _PtrInx, MatchList) ->
MatchList;
doMatchRs(<<Word/utf8, Tail/binary>>, TotalSize, CurIndex, State, MatchList) ->
doMatchRs(<<Word/utf8, Tail/binary>>, TotalSize, CurIndex, PtrInx, MatchList) ->
case acsSpw:getSpw(Word) of
true ->
doMatchRs(Tail, TotalSize, CurIndex, State, MatchList);
doMatchRs(Tail, TotalSize, CurIndex, PtrInx, MatchList);
_ ->
{NewState, MatchCnt} = matchWordRs(Word, State, 0),
{NewPtrInx, MatchCnt} = matchWordRs(Word, PtrInx, 0),
case MatchCnt of
0 ->
doMatchRs(Tail, TotalSize, CurIndex + 1, NewState, MatchList);
doMatchRs(Tail, TotalSize, CurIndex + 1, NewPtrInx, MatchList);
_ ->
LeftSize = byte_size(Tail),
NewMatchList = dealMatchList(MatchList, TotalSize - LeftSize, MatchCnt, CurIndex),
doMatchRs(Tail, TotalSize, CurIndex + 1, NewState, NewMatchList)
doMatchRs(Tail, TotalSize, CurIndex + 1, NewPtrInx, NewMatchList)
end
end.
matchWordRs(Word, State, MatchCnt) ->
case acsTree:goto(State) of
undefined ->
case State of
matchWordRs(Word, PtrInx, MatchCnt) ->
case acsTree:goto(PtrInx) of
{Word, NextPtrInx} ->
NewMatchCnt = getOutputRs(NextPtrInx, MatchCnt),
{NextPtrInx, NewMatchCnt};
#{Word := NextPtrInx} ->
NewMatchCnt = getOutputRs(NextPtrInx, MatchCnt),
{NextPtrInx, NewMatchCnt};
_ ->
case PtrInx of
0 ->
{State, MatchCnt};
_ ->
{NextState, _} = acsTree:failOut(State),
matchWordRs(Word, NextState, MatchCnt)
end;
Node ->
case Node of
{Word, NextState} ->
NewMatchCnt = getOutputRs(NextState, MatchCnt),
{NextState, NewMatchCnt};
#{Word := NextState} ->
NewMatchCnt = getOutputRs(NextState, MatchCnt),
{NextState, NewMatchCnt};
{PtrInx, MatchCnt};
_ ->
case State of
0 ->
{State, MatchCnt};
_ ->
{NextState, _} = acsTree:failOut(State),
matchWordRs(Word, NextState, MatchCnt)
case acsTree:failOut(PtrInx) of
{NextPtrInx, _} ->
matchWordRs(Word, NextPtrInx, MatchCnt);
NextPtrInx ->
matchWordRs(Word, NextPtrInx, MatchCnt)
end
end
end.
@ -303,19 +285,22 @@ matchWordRs(Word, State, MatchCnt) ->
%%
getOutputRs(0, MatchCnt) ->
MatchCnt;
getOutputRs(State, MatchCnt) ->
{FailState, Pattern} = acsTree:failOut(State),
case Pattern of
undefined ->
getOutputRs(FailState, MatchCnt);
_ ->
getOutputRs(PtrInx, MatchCnt) ->
case acsTree:failOut(PtrInx) of
0 ->
MatchCnt;
{FailPtrInx, Pattern} ->
case Pattern > MatchCnt of
true ->
getOutputRs(FailState, Pattern);
getOutputRs(FailPtrInx, Pattern);
_ ->
getOutputRs(FailState, MatchCnt)
end
getOutputRs(FailPtrInx, MatchCnt)
end;
FailPtrInx ->
getOutputRs(FailPtrInx, MatchCnt)
end.
% *************************************** replaceSw end *************************************************************
strSize(<<>>, Cnt) ->

+ 71
- 54
src/genAcs.erl Ver fichero

@ -12,7 +12,7 @@ main(Args) ->
[SWFile, WriteDir] ->
case file:open(SWFile, [read, raw, binary, {read_ahead, 65536}, {encoding, utf8}]) of
{ok, IoDevice} ->
{Goto, Output} = dealEverySW(IoDevice, _Goto = #{0 => #{}}, _Output = #{}, _State = 0),
{Goto, Output} = dealEverySW(IoDevice, _Goto = #{0 => #{}}, _Output = #{}, _PtrInx = 0),
file:close(IoDevice),
Fail = genFail(Goto),
genSpw(WriteDir),
@ -69,17 +69,23 @@ writeFilter(CurLine, Line, FilterFile, LineMap) ->
writeFilter(CurLine + 1, Line, FilterFile, LineMap)
end.
dealEverySW(IoDevice, Goto, Output, MaxState) ->
dealEverySW(IoDevice, Goto, Output, MaxPtrInx) ->
case file:read_line(IoDevice) of
{ok, DataStr} ->
BinStr = binary:part(DataStr, 0, byte_size(DataStr) - 1),
BinStr =
case binary:last(DataStr) of
10 ->
binary:part(DataStr, 0, byte_size(DataStr) - 1);
_ ->
DataStr
end,
case BinStr =/= <<>> of
true ->
{NewGoto, NewState, NewMaxState} = addGoto(BinStr, Goto, 0, MaxState),
NewOutput = Output#{NewState => eAcs:strSize(BinStr, 0)},
dealEverySW(IoDevice, NewGoto, NewOutput, NewMaxState);
{NewGoto, EndPtrInx, NewMaxPtrInx} = addGoto(BinStr, Goto, 0, MaxPtrInx),
NewOutput = Output#{EndPtrInx => eAcs:strSize(BinStr, 0)},
dealEverySW(IoDevice, NewGoto, NewOutput, NewMaxPtrInx);
_ ->
dealEverySW(IoDevice, Goto, Output, MaxState)
dealEverySW(IoDevice, Goto, Output, MaxPtrInx)
end;
eof ->
{Goto, Output};
@ -90,54 +96,54 @@ dealEverySW(IoDevice, Goto, Output, MaxState) ->
%% ac搜索树
genTree(BinStrList) ->
%% goto and output table
{Goto, Output} = genGotoOutput(BinStrList, _Goto = #{0 => #{}}, _Output = #{}, _State = 0),
{Goto, Output} = genGotoOutput(BinStrList, _Goto = #{0 => #{}}, _Output = #{}, _PtrInx = 0),
%% fail table
Fail = genFail(Goto),
{Goto, Fail, Output}.
%% goto and output table
genGotoOutput([BinStr | Tail], Goto, Output, MaxState) ->
genGotoOutput([BinStr | Tail], Goto, Output, MaxPtrInx) ->
case BinStr =/= <<>> of
true ->
{NewGoto, NewState, NewMaxState} = addGoto(BinStr, Goto, 0, MaxState),
NewOutput = Output#{NewState => BinStr},
genGotoOutput(Tail, NewGoto, NewOutput, NewMaxState);
{NewGoto, EndPtrInx, NewMaxPtrInx} = addGoto(BinStr, Goto, 0, MaxPtrInx),
NewOutput = Output#{EndPtrInx => BinStr},
genGotoOutput(Tail, NewGoto, NewOutput, NewMaxPtrInx);
_ ->
genGotoOutput(Tail, Goto, Output, MaxState)
genGotoOutput(Tail, Goto, Output, MaxPtrInx)
end;
genGotoOutput([], Goto, Output, _MaxState) ->
genGotoOutput([], Goto, Output, _MaxPtrInx) ->
{Goto, Output}.
%% Goto
addGoto(<<Word/utf8, Tail/binary>>, Goto, State, MaxState) ->
#{State := Node} = Goto,
addGoto(<<Word/utf8, Tail/binary>>, Goto, PtrInx, MaxPtrInx) ->
#{PtrInx := Node} = Goto,
case Node of
#{Word := NextState} ->
addGoto(Tail, Goto, NextState, MaxState);
#{Word := NextPtrInx} ->
addGoto(Tail, Goto, NextPtrInx, MaxPtrInx);
_ ->
NewMaxState = MaxState + 1,
NewNode = Node#{Word => NewMaxState},
addGoto(Tail, Goto#{NewMaxState => #{}, State => NewNode}, NewMaxState, NewMaxState)
NewMaxPtrInx = MaxPtrInx + 1,
NewNode = Node#{Word => NewMaxPtrInx},
addGoto(Tail, Goto#{NewMaxPtrInx => #{}, PtrInx => NewNode}, NewMaxPtrInx, NewMaxPtrInx)
end;
addGoto(<<>>, Goto, State, MaxState) ->
{Goto, State, MaxState}.
addGoto(<<>>, Goto, PtrInx, MaxPtrInx) ->
{Goto, PtrInx, MaxPtrInx}.
%% Build the Fail (failure-transition) table for the Goto table.
%% The Goto table must contain the root entry under key 0; the root's direct
%% children seed the work list handed to genFail/3, together with an empty
%% Fail map as the initial accumulator.
genFail(#{0 := RootNode} = GotoTable) ->
    genFail(maps:values(RootNode), GotoTable, #{}).
%% bfs搜索构造 Fail
genFail([State | Tail], Goto, Fail) ->
#{State := Node} = Goto,
genFail([PtrInx | Tail], Goto, Fail) ->
#{PtrInx := Node} = Goto,
%%
FailState = maps:get(State, Fail, 0),
FatherFailPtrInx = maps:get(PtrInx, Fail, 0),
%%
Kvs = maps:to_list(Node),
ChildKvs = maps:to_list(Node),
%%
NewFail = addFail(Kvs, FailState, Goto, Fail),
NewFail = addFail(ChildKvs, FatherFailPtrInx, Goto, Fail),
%%
NewQueue = Tail ++ maps:values(Node),
@ -146,29 +152,29 @@ genFail([], _Goto, Fail) ->
Fail.
%%
%% @param FailState
addFail([{Word, State} | Tail], FailState, Goto, Fail) ->
NewFail = findFailNode(Word, State, FailState, Goto, Fail),
addFail(Tail, FailState, Goto, NewFail);
addFail([], _FailState, _Goto, Fail) ->
%% @param FatherFailPtrInx
addFail([{Word, PtrInx} | Tail], FatherFailPtrInx, Goto, Fail) ->
NewFail = findFailNode(Word, PtrInx, FatherFailPtrInx, Goto, Fail),
addFail(Tail, FatherFailPtrInx, Goto, NewFail);
addFail([], _FatherFailPtrInx, _Goto, Fail) ->
Fail.
%%
findFailNode(Word, State, FailState, Goto, Fail) ->
#{FailState := Node} = Goto,
findFailNode(Word, PtrInx, FatherFailPtrInx, Goto, Fail) ->
#{FatherFailPtrInx := Node} = Goto,
case Node of
#{Word := TheFailState} ->
#{Word := MyFailPtrInx} ->
%%
Fail#{State => TheFailState};
Fail#{PtrInx => MyFailPtrInx};
_ ->
case FailState =:= 0 of
case FatherFailPtrInx =:= 0 of
true ->
%%
Fail;
_ ->
%%
NewFailState = maps:get(FailState, Fail, 0),
findFailNode(Word, State, NewFailState, Goto, Fail)
NewFatherFailPtrInx = maps:get(FatherFailPtrInx, Fail, 0),
findFailNode(Word, PtrInx, NewFatherFailPtrInx, Goto, Fail)
end
end.
@ -206,25 +212,36 @@ doGenGoto([{K, V} | SortKvs], StrAcc) ->
end.
genFailOut([], _Fail, _Output, StrAcc) ->
<<StrAcc/binary, "\nfailOut(_) -> {0, undefined}.">>;
genFailOut([State], Fail, Output, StrAcc) ->
FailState = maps:get(State, Fail, 0),
Pattern = maps:get(State, Output, undefined),
case FailState /= 0 orelse Pattern /= undefined of
<<StrAcc/binary, "\nfailOut(_) -> 0.">>;
genFailOut([PtrInx], Fail, Output, StrAcc) ->
FailPtrInx = maps:get(PtrInx, Fail, 0),
Pattern = maps:get(PtrInx, Output, undefined),
case FailPtrInx /= 0 orelse Pattern /= undefined of
true ->
<<StrAcc/binary, "failOut(", (integer_to_binary(State))/binary, ") -> ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailState, Pattern}])))/binary, ";\nfailOut(_) -> {0, undefined}.">>;
case Pattern of
undefined ->
<<StrAcc/binary, "failOut(", (integer_to_binary(PtrInx))/binary, ") -> ", (iolist_to_binary(io_lib:format(<<"~w">>, [FailPtrInx])))/binary, ";\nfailOut(_) -> 0.">>;
_ ->
<<StrAcc/binary, "failOut(", (integer_to_binary(PtrInx))/binary, ") -> ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailPtrInx, Pattern}])))/binary, ";\nfailOut(_) -> 0.">>
end;
_ ->
<<StrAcc/binary, ";\nfailOut(_) -> {0, undefined}.">>
<<StrAcc/binary, ";\nfailOut(_) -> 0.">>
end;
genFailOut([State | SortStates], Fail, Output, StrAcc) ->
FailState = maps:get(State, Fail, 0),
Pattern = maps:get(State, Output, undefined),
case FailState /= 0 orelse Pattern /= undefined of
genFailOut([PtrInx | SortPtrInxes], Fail, Output, StrAcc) ->
FailPtrInx = maps:get(PtrInx, Fail, 0),
Pattern = maps:get(PtrInx, Output, undefined),
case FailPtrInx /= 0 orelse Pattern /= undefined of
true ->
NewStrAcc = <<StrAcc/binary, "failOut(", (integer_to_binary(State))/binary, ") -> ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailState, Pattern}])))/binary, ";\n">>,
genFailOut(SortStates, Fail, Output, NewStrAcc);
NewStrAcc =
case Pattern of
undefined ->
<<StrAcc/binary, "failOut(", (integer_to_binary(PtrInx))/binary, ") -> ", (iolist_to_binary(io_lib:format(<<"~w">>, [FailPtrInx])))/binary, ";\n">>;
_ ->
<<StrAcc/binary, "failOut(", (integer_to_binary(PtrInx))/binary, ") -> ", (iolist_to_binary(io_lib:format(<<"~w">>, [{FailPtrInx, Pattern}])))/binary, ";\n">>
end,
genFailOut(SortPtrInxes, Fail, Output, NewStrAcc);
_ ->
genFailOut(SortStates, Fail, Output, StrAcc)
genFailOut(SortPtrInxes, Fail, Output, StrAcc)
end.
-spec load(Module :: atom(), Export :: [{Fun :: atom(), Arity :: pos_integer()}], Str :: string()) -> {module, Module :: atom()} | {error, _}.

Cargando…
Cancelar
Guardar