diff --git a/src/eAcs.erl b/src/eAcs.erl index 5b5187d..3b4a479 100644 --- a/src/eAcs.erl +++ b/src/eAcs.erl @@ -7,6 +7,8 @@ , strSize/2 %% 获取utf8字符串的长度 ]). +-define(RW, 42). %% 替换字符的utf8code + %% state 0 is the root node %% Goto: State -> #{Word -> State} %% failOut: State -> {FailState, BinStr} @@ -132,14 +134,73 @@ getOutputIs(State) -> %% *************************************** matchSw end *************************************************************** %% *************************************** replaceSw start ************************************************************* replaceSw(BinStr) -> - MatchBIMWs = doMatchRs(BinStr, _TotalSize = byte_size(BinStr) - 1, _Index = 1, _State = 0, _MatchList = []), - doReplaceSw(lists:reverse(MatchBIMWs), 0, BinStr, <<>>). + TotalSize = byte_size(BinStr), + MatchBIMWs = doMatchRs(BinStr, TotalSize - 1, _Index = 1, _State = 0, _MatchList = []), + %io:format("IMY******************* ~p~n", [lists:reverse(MatchBIMWs)]), + doReplaceSw(lists:reverse(MatchBIMWs), BinStr, TotalSize, _StartPos = 0, <<>>). %% 从前往后替换 -doReplaceSw([], _BinStr, _StartPos, Acc) -> - iolist_to_binary(Acc); -doReplaceSw([{CurByteIndex, MatchWordCnt, _CurWordIndex} | MatchBIMWs], _BinStr, StartPos, Acc) -> - iolist_to_binary(Acc). +doReplaceSw([], BinStr, TotalSize, StartPos, BinAcc) -> + case TotalSize > StartPos of + true -> + <>; + _ -> + BinAcc + end; +doReplaceSw([{CurByteIndex, MatchWordCnt, _CurWordIndex} | MatchBIMWs], BinStr, TotalSize, StartPos, BinAcc) -> + {EndByteIndex, FilterWs} = getMatchWords(MatchWordCnt, BinStr, CurByteIndex, _BslCnt = 0, _Utf8Code = 0, []), + RPStr = unicode:characters_to_binary(FilterWs, utf8), + case StartPos =< EndByteIndex of + true -> + NewBinAcc = <>; + _ -> + NewBinAcc = <> + end, + doReplaceSw(MatchBIMWs, BinStr, TotalSize, CurByteIndex + 1, NewBinAcc). + +getMatchWords(0, _BinStr, ByteIndex, _BslCnt, _Utf8Code, FilterWs) -> + {ByteIndex, FilterWs}; +getMatchWords(MatchWordCnt, BinStr, ByteIndex, BslCnt, Utf8Code, FilterWs) -> + Byte = binary:at(BinStr, ByteIndex), + %io:format("IMY****************~p ~n", [Byte]), + if + Byte band 2#10000000 == 0 -> + %% ASCII 查看是否特殊字符 是就维护原样 + case acsSpw:getSpw(Byte) of + true -> + %% 是特殊词 + getMatchWords(MatchWordCnt, BinStr, ByteIndex - 1, 0, 0, [Byte | FilterWs]); + _ -> + %% 不是特殊词 + getMatchWords(MatchWordCnt - 1, BinStr, ByteIndex - 1, 0, 0, [?RW | FilterWs]) + end; + Byte band 2#10000000 == 128 -> + Code = Byte band 2#00111111, + getMatchWords(MatchWordCnt, BinStr, ByteIndex - 1, BslCnt + 6, Code bsl BslCnt + Utf8Code, FilterWs); + true -> + %io:format("IMY****************~p", [Byte]), + case BslCnt of + 6 -> + Code = Byte band 2#00011111; + 12 -> + Code = Byte band 2#00001111; + 18 -> + Code = Byte band 2#00000111; + 24 -> + Code = Byte band 2#00000011; + 30 -> + Code = Byte band 2#00000001 + end, + FullWord = Code bsl BslCnt + Utf8Code, + case acsSpw:getSpw(FullWord) of + true -> + %% 是特殊词 + getMatchWords(MatchWordCnt, BinStr, ByteIndex - 1, 0, 0, [FullWord | FilterWs]); + _ -> + %% 不是特殊词 + getMatchWords(MatchWordCnt - 1, BinStr, ByteIndex - 1, 0, 0, [?RW | FilterWs]) + end + end. %% 递归处理判断最新的匹配是否包含或者连接了历史匹配的{CurByteIndex, MatchWordCnt, CurWordIndex} dealMatchList([], CurByteIndex, MatchWordCnt, CurWordIndex) -> diff --git a/src/test/acTest.erl b/src/test/acTest.erl index 72f468f..f5a6782 100644 --- a/src/test/acTest.erl +++ b/src/test/acTest.erl @@ -10,9 +10,22 @@ test1() -> test2() -> acTc:ts(1000000, eAcs, matchSw, [<<"fdsfads拉法叶舰fds淫秽ffdsfdsffdddd"/utf8>>]). +test21() -> + acTc:ts(1000000, eAcs, replaceSw, [<<"fdsfads拉法叶舰fds淫秽ffdsfdsffdddd"/utf8>>]). + +test22() -> + acTc:ts(1000000, eAcs, isHasSw, [<<"fdsfads拉法叶舰fds淫秽ffdsfdsffdddd"/utf8>>]). + test3(Cnt, BinStr) -> acTc:ts(Cnt, eAcs, matchSw, [BinStr]). +test31(Cnt, BinStr) -> + acTc:ts(Cnt, keyword, filter, [BinStr]). + test4(Cnt, FileName) -> {ok, Data} = file:read_file(FileName), - test3(Cnt, Data). \ No newline at end of file + test3(Cnt, Data). + +test41(Cnt, FileName) -> + {ok, Data} = file:read_file(FileName), + test31(Cnt, Data). \ No newline at end of file diff --git a/src/test/test.txt b/src/test/test.txt index 9279f5a..edd2128 100644 --- a/src/test/test.txt +++ b/src/test/test.txt @@ -3,7 +3,3 @@ sssa ssafds sg afd - - - -