Przeglądaj źródła

ft: replaceSw函数优化 敏感词文本特殊字符预处理and去掉重复的敏感词

master
SisMaker 4 lat temu
rodzic
commit
5cad3efd81
6 zmienionych plików z 17030 dodań i 23 usunięć
  1. +4
    -1
      README.md
  2. BIN
      genAcs
  3. +2
    -0
      genAcs.cmd
  4. +0
    -5
      src/eAcs.erl
  5. +78
    -17
      src/genAcs.erl
  6. +16946
    -0
      src/test/SWord1.txt

+ 4
- 1
README.md Wyświetl plik

@ -1,6 +1,6 @@
eAcs
=====
基于ac算法实现的快速高效的敏感词匹配,检查,过滤功能
基于ac算法实现的快速高效的敏感词匹配、检查、过滤功能。另外,特殊字符不参与敏感词的匹配、检查和替换,替换时会按照原位置保留
Build
-----
@ -9,6 +9,9 @@ Build
Uses
-----
敏感词预处理:去除特殊字符并去掉重复的敏感词(SWordFile 和 OutputDirFile 可以同名)
脚本生成:./genAcs -f/-F SWordFile OutputDirFile
函数调用: genAcs:main(["-f"/"-F", SWordFile, OutputDirFile])
创建 acsTree.erl
脚本生成:./genAcs SWordFile OutputDir
函数调用: genAcs:main([SWordFile, OutputDir])

BIN
genAcs Wyświetl plik


+ 2
- 0
genAcs.cmd Wyświetl plik

@ -0,0 +1,2 @@
@echo off
escript.exe "%~dpn0" %*

+ 0
- 5
src/eAcs.erl Wyświetl plik

@ -9,9 +9,6 @@
-define(RW, 42). %% utf8code
%% state 0 is the root node
%% Goto: State -> #{Word -> State}
%% failOut: State -> {FailState, BinStr}
%% *************************************** matchSw start ***************************************************************
-spec matchSw(BinStr :: binary()) -> [{StartIndex :: integer(), EndIndex :: integer(), Pattern :: binary()}].
matchSw(BinStr) ->
@ -136,7 +133,6 @@ replaceSw(BinStr) ->
MatchBIMWs = doMatchRs(BinStr, TotalSize - 1, _Index = 1, _State = 0, _MatchList = []),
doReplaceSw(lists:reverse(MatchBIMWs), BinStr, TotalSize, _StartPos = 0, <<>>).
%%
doReplaceSw([], BinStr, TotalSize, StartPos, BinAcc) ->
case TotalSize > StartPos of
true ->
@ -218,7 +214,6 @@ getMatchWords(MatchWordCnt, BinStr, ByteIndex, FilterWs) ->
end
end.
%% {CurByteIndex, MatchWordCnt, CurWordIndex}
dealMatchList([], CurByteIndex, MatchWordCnt, CurWordIndex) ->
[{CurByteIndex, MatchWordCnt, CurWordIndex}];
dealMatchList([{_OldByteIndex, OldMatchWordCnt, OldWordIndex} | LeftMatchList] = OldMatchList, CurByteIndex, MatchWordCnt, CurWordIndex) ->

+ 78
- 17
src/genAcs.erl Wyświetl plik

@ -8,18 +8,68 @@
-define(Spw, <<" ~〜,,::.。;;-_=+*&^…%$#@!!|??'‘’\"“”`·()[]{}()【】「」//\\\n\t"/utf8>>).
main(Args) ->
[SWFile, WriteDir] = Args,
case file:open(SWFile, [read, raw, binary, {read_ahead, 65536}, {encoding, utf8}]) of
{ok, IoDevice} ->
{Goto, Output} = dealEveryLine(IoDevice, _Goto = #{0 => #{}}, _Output = #{}, _State = 0),
Fail = genFail(Goto),
genSpw(WriteDir),
genErl(WriteDir, Goto, Fail, Output);
case Args of
[SWFile, WriteDir] ->
case file:open(SWFile, [read, raw, binary, {read_ahead, 65536}, {encoding, utf8}]) of
{ok, IoDevice} ->
{Goto, Output} = dealEverySW(IoDevice, _Goto = #{0 => #{}}, _Output = #{}, _State = 0),
file:close(IoDevice),
Fail = genFail(Goto),
genSpw(WriteDir),
genErl(WriteDir, Goto, Fail, Output);
_Err ->
io:format("genAcs open the SWord file:~p error ~p~n", [SWFile, _Err])
end;
[Cmd, SWFile, FilterFile] when Cmd == "-F"; Cmd == "-f" ->
load(acsSpw, [{getSpw, 1}], binary_to_list(spwStr())),
case file:open(SWFile, [read, raw, binary, {read_ahead, 65536}, {encoding, utf8}]) of
{ok, IoDevice} ->
{Line, LineMap} = dealEveryFW(IoDevice, _UniqueMap = #{}, _LineMap = #{}, _Line = 1),
file:close(IoDevice),
file:delete(FilterFile),
writeFilter(1, Line, FilterFile, LineMap);
_Err ->
io:format("genAcs open the Filter file:~p error ~p~n", [SWFile, _Err])
end;
_ ->
io:format("Useage:\n\t1: to gen acsTree.erl and acsSqw.erl with genAcs SWFile OuputDir\n\t2: to filter special word in SWFile and frop repetitive words with genAcs -f/F SWFile OuputDir\n"),
ok
end.
dealEveryFW(IoDevice, UniqueMap, LineMap, Line) ->
case file:read_line(IoDevice) of
{ok, DataStr} ->
BinStr = binary:part(DataStr, 0, byte_size(DataStr) - 1),
case BinStr =/= <<>> of
true ->
FilterBin = <<<<W/utf8>> || <<W/utf8>> <= BinStr, acsSpw:getSpw(W) /= true>>,
case UniqueMap of
#{FilterBin := _} ->
dealEveryFW(IoDevice, UniqueMap, LineMap, Line);
_ ->
dealEveryFW(IoDevice, UniqueMap#{FilterBin => 1}, LineMap#{Line => FilterBin}, Line + 1)
end;
_ ->
dealEveryFW(IoDevice, UniqueMap, LineMap, Line)
end;
eof ->
{Line, LineMap};
_Err ->
io:format("genAcs open the file:~p error ~p~n", [SWFile, _Err])
io:format("genAcs read the Filter file error ~p~n", [_Err])
end.
%% Appends the filtered words stored in LineMap under the integer keys
%% CurLine .. Line - 1 to FilterFile, one word per line, in key order.
%% Keys missing from LineMap are skipped.
%%
%% All lines are collected into a single iodata term and written with one
%% synced append, so the file is opened once and the result of the write is
%% returned to the caller instead of being discarded per line.
%% When CurLine =:= Line nothing is appended, but the file is still created.
%%
%% Returns ok | {error, Reason} as reported by file:write_file/3.
-spec writeFilter(CurLine :: pos_integer(), Line :: pos_integer(), FilterFile :: file:name_all(), LineMap :: #{pos_integer() => binary()}) ->
    ok | {error, term()}.
writeFilter(CurLine, Line, FilterFile, LineMap) ->
    %% The map-pattern generator silently drops line numbers that have no entry.
    Data = [[BinStr, <<"\n">>] || N <- lists:seq(CurLine, Line - 1), #{N := BinStr} <- [LineMap]],
    file:write_file(FilterFile, Data, [append, sync]).
dealEveryLine(IoDevice, Goto, Output, MaxState) ->
dealEverySW(IoDevice, Goto, Output, MaxState) ->
case file:read_line(IoDevice) of
{ok, DataStr} ->
BinStr = binary:part(DataStr, 0, byte_size(DataStr) - 1),
@ -27,14 +77,14 @@ dealEveryLine(IoDevice, Goto, Output, MaxState) ->
true ->
{NewGoto, NewState, NewMaxState} = addGoto(BinStr, Goto, 0, MaxState),
NewOutput = Output#{NewState => eAcs:strSize(BinStr, 0)},
dealEveryLine(IoDevice, NewGoto, NewOutput, NewMaxState);
dealEverySW(IoDevice, NewGoto, NewOutput, NewMaxState);
_ ->
dealEveryLine(IoDevice, Goto, Output, MaxState)
dealEverySW(IoDevice, Goto, Output, MaxState)
end;
eof ->
{Goto, Output};
_Err ->
io:format("genAcs read the file error ~p~n", [_Err])
io:format("genAcs read the SWord file error ~p~n", [_Err])
end.
%% ac搜索树
@ -170,13 +220,24 @@ genFailOut([State | SortStates], Fail, Output, StrAcc) ->
genFailOut(SortStates, Fail, Output, StrAcc)
end.
genSpw(WriteDir) ->
Head = <<"-module(acsSpw).\n\n-compile([deterministic, no_line_info]).\n\n-export([getSpw/1]).\n\n">>,
%% Compiles a single function definition given as source text and loads it
%% into the running node as module Module.
%%
%% Module - name of the module to create.
%% Export - list of {Function, Arity} pairs placed in the -export attribute.
%% Str    - source text of exactly one form (all clauses of one function,
%%          terminated by a full stop); erl_parse:parse_form/1 parses one form.
%%
%% Returns the result of code:load_binary/3. Scan, parse or compile failures
%% are not handled here: they crash the caller with a badmatch.
-spec load(Module :: atom(), Export :: [{Fun :: atom(), Arity :: arity()}], Str :: string()) ->
    {module, Module :: atom()} | {error, _}.
load(Module, Export, Str) ->
    {ok, Tokens, _EndLine} = erl_scan:string(Str),
    {ok, FunForm} = erl_parse:parse_form(Tokens),
    %% Prepend the two attributes needed to turn the lone function into a module.
    ModuleForms = [{attribute, 1, module, Module}, {attribute, 2, export, Export}, FunForm],
    {ok, _CompiledName, Binary} = compile:forms(ModuleForms),
    code:load_binary(Module, "", Binary).
%% Returns the fixed header of the generated acsSpw module source:
%% the module, compile and export attributes, each followed by a blank line.
spwHead() ->
    <<
        "-module(acsSpw).\n\n"
        "-compile([deterministic, no_line_info]).\n\n"
        "-export([getSpw/1]).\n\n"
    >>.
spwStr() ->
GetSw = <<<<"getSpw(", (integer_to_binary(Spw))/binary, ") -> true;\n">> || <<Spw/utf8>> <= ?Spw>>,
LastSw = <<Head/binary, GetSw/binary, "getSpw(_) -> false.">>,
FileName = filename:join([WriteDir, "acsSpw.erl"]),
file:write_file(FileName, LastSw).
<<GetSw/binary, "getSpw(_) -> false.">>.
%% Writes the acsSpw module source (header followed by the getSpw/1 clauses)
%% to "acsSpw.erl" inside WriteDir and returns the result of file:write_file/2.
genSpw(WriteDir) ->
    Source = [spwHead(), spwStr()],
    Target = filename:join(WriteDir, "acsSpw.erl"),
    file:write_file(Target, Source).
genErl(WriteDir, Goto, Fail, Output) ->
HeadStr = genHead(),

+ 16946
- 0
src/test/SWord1.txt
Plik diff jest za duży
Wyświetl plik


Ładowanie…
Anuluj
Zapisz