Pārlūkot izejas kodu

ft: 增加特殊字符处理逻辑 增加replace函数逻辑 TODO: 构造ac树之前过滤每个敏感词过滤掉特殊字符 完成替换函数后续逻辑

master
SisMaker pirms 4 gadiem
vecāks
revīzija
6c13edfccd
3 mainītis faili ar 143 papildinājumiem un 33 dzēšanām
  1. +127
    -30
      src/eAcs.erl
  2. +11
    -0
      src/genAcs.erl
  3. +5
    -3
      src/test/test.txt

+ 127
- 30
src/eAcs.erl Parādīt failu

@ -13,15 +13,21 @@
%% *************************************** matchSw start ***************************************************************
%% Each hit is {StartIndex, PatternLen}: the index of the pattern's first
%% counted character (counting starts at 1) and the pattern length that
%% acsTree:failOut/1 reports for the matching state.
-spec matchSw(BinStr :: binary()) -> [{StartIndex :: integer(), PatternLen :: integer()}].
matchSw(BinStr) ->
doMatch(BinStr, 0, _Index = 1, _MatchList = []).
doMatchMs(BinStr, 0, _Index = 1, _MatchList = []).
doMatch(<<>>, _, _Index, MatchList) ->
doMatchMs(<<>>, _, _Index, MatchList) ->
MatchList;
doMatch(<<Word/utf8, Tail/binary>>, State, Index, MatchList) ->
{NewState, NewMatchList} = matchWord(Word, State, Index, MatchList),
doMatch(Tail, NewState, Index + 1, NewMatchList).
doMatchMs(<<Word/utf8, Tail/binary>>, State, Index, MatchList) ->
case acsSpw:getSpw(Word) of
true ->
%% Special character: skip it without advancing Index
doMatchMs(Tail, State, Index, MatchList);
_ ->
{NewState, NewMatchList} = matchWordMs(Word, State, Index, MatchList),
doMatchMs(Tail, NewState, Index + 1, NewMatchList)
end.
matchWord(Word, State, Index, MatchList) ->
matchWordMs(Word, State, Index, MatchList) ->
Node = acsTree:goto(State),
case Node of
undefined ->
@ -30,12 +36,12 @@ matchWord(Word, State, Index, MatchList) ->
{State, MatchList};
_ ->
{NextState, _} = acsTree:failOut(State),
matchWord(Word, NextState, Index, MatchList)
matchWordMs(Word, NextState, Index, MatchList)
end;
_ ->
case Node of
#{Word := NextState} ->
NewMatchList = getOutput(NextState, Index, MatchList),
NewMatchList = getOutputMs(NextState, Index, MatchList),
{NextState, NewMatchList};
_ ->
case State of
@ -43,40 +49,46 @@ matchWord(Word, State, Index, MatchList) ->
{State, MatchList};
_ ->
{NextState, _} = acsTree:failOut(State),
matchWord(Word, NextState, Index, MatchList)
matchWordMs(Word, NextState, Index, MatchList)
end
end
end.
getOutput(0, _Index, MatchList) ->
getOutputMs(0, _Index, MatchList) ->
MatchList;
getOutput(State, Index, MatchList) ->
getOutputMs(State, Index, MatchList) ->
{FailState, Pattern} = acsTree:failOut(State),
case Pattern of
undefined ->
getOutput(FailState, Index, MatchList);
getOutputMs(FailState, Index, MatchList);
_ ->
NewMatchList = [{Index - Pattern + 1, Pattern} | MatchList],
getOutput(FailState, Index, NewMatchList)
getOutputMs(FailState, Index, NewMatchList)
end.
%% *************************************** matchSw end ***************************************************************
%% *************************************** isHasSw start ***************************************************************
-spec isHasSw(BinStr :: binary()) -> boolean().
isHasSw(BinStr) ->
doMatch(BinStr, 0).
doMatchIs(BinStr, 0).
doMatch(<<>>, _) ->
doMatchIs(<<>>, _) ->
false;
doMatch(<<Word/utf8, Tail/binary>>, State) ->
case matchWord(Word, State) of
doMatchIs(<<Word/utf8, Tail/binary>>, State) ->
case acsSpw:getSpw(Word) of
true ->
true;
NewState ->
doMatch(Tail, NewState)
%%
doMatchIs(Tail, State);
_ ->
case matchWordIs(Word, State) of
true ->
true;
NewState ->
doMatchIs(Tail, NewState)
end
end.
matchWord(Word, State) ->
matchWordIs(Word, State) ->
Node = acsTree:goto(State),
case Node of
undefined ->
@ -85,12 +97,12 @@ matchWord(Word, State) ->
State;
_ ->
{NextState, _} = acsTree:failOut(State),
matchWord(Word, NextState)
matchWordIs(Word, NextState)
end;
_ ->
case Node of
#{Word := NextState} ->
case getOutput(NextState) of
case getOutputIs(NextState) of
false ->
NextState;
_ ->
@ -102,26 +114,111 @@ matchWord(Word, State) ->
State;
_ ->
{NextState, _} = acsTree:failOut(State),
matchWord(Word, NextState)
matchWordIs(Word, NextState)
end
end
end.
getOutput(0) ->
getOutputIs(0) ->
false;
getOutput(State) ->
getOutputIs(State) ->
{FailState, Pattern} = acsTree:failOut(State),
case Pattern of
undefined ->
getOutput(FailState);
getOutputIs(FailState);
_ ->
true
end.
%% *************************************** isHasSw end ***************************************************************
%% *************************************** replaceSw start *************************************************************
replaceSw(_BinStr) ->
ok.
%% *************************************** replaceSw end *************************************************************
%% Scans BinStr for sensitive words and hands the collected hits to
%% doReplaceSw/4.
%% dealMatchList/4 conses the newest hit onto the front of the list, so the
%% list is reversed before it is walked.
%% NOTE: the substitution step below is still a stub, so every call
%% currently returns <<>>.
replaceSw(BinStr) ->
    MatchBIMWs = doMatchRs(BinStr, _TotalSize = byte_size(BinStr) - 1, _Index = 1, _State = 0, _MatchList = []),
    %% Argument order follows the head of doReplaceSw/4:
    %% hits, source binary, start position, accumulator.
    doReplaceSw(lists:reverse(MatchBIMWs), BinStr, 0, <<>>).

%% Placeholder for the replacement pass over the hit list.
%% Each hit is {CurByteIndex, MatchWordCnt, CurWordIndex} as built by
%% dealMatchList/4.
%% TODO: consume the hits and build the masked output from BinStr; for now
%% both clauses just flatten the (empty) accumulator. The unused bindings
%% are underscore-prefixed so the stub compiles without warnings.
doReplaceSw([], _BinStr, _StartPos, Acc) ->
    iolist_to_binary(Acc);
doReplaceSw([{_CurByteIndex, _MatchWordCnt, _CurWordIndex} | _MatchBIMWs], _BinStr, _StartPos, Acc) ->
    iolist_to_binary(Acc).
%% Inserts a new hit into the hit list (newest first), merging it with any
%% earlier hit that it touches.
%%
%% A hit is {CurByteIndex, MatchWordCnt, CurWordIndex}:
%%   CurByteIndex - byte position value supplied by the caller for the hit's
%%                  last character
%%   MatchWordCnt - number of counted characters covered by the hit
%%   CurWordIndex - character index of the hit's last character
%% so a hit covers the characters (CurWordIndex - MatchWordCnt + 1)..CurWordIndex.
%%
%% Returns the updated list with the new (possibly widened) hit at the head.
dealMatchList([], CurByteIndex, MatchWordCnt, CurWordIndex) ->
    [{CurByteIndex, MatchWordCnt, CurWordIndex}];
dealMatchList([{_OldByteIndex, OldMatchWordCnt, OldWordIndex} | LeftMatchList] = OldMatchList, CurByteIndex, MatchWordCnt, CurWordIndex) ->
    %% Both "start" values are the index of the character just BEFORE the
    %% hit, i.e. first covered character minus one.
    CurStartIndex = CurWordIndex - MatchWordCnt,
    OldStartIndex = OldWordIndex - OldMatchWordCnt,
    if
        %% At least one unmatched character lies between the two hits: keep
        %% them separate. (Comparing against OldWordIndex + 1 would merge
        %% across a one-character gap and swallow that character.)
        CurStartIndex > OldWordIndex ->
            [{CurByteIndex, MatchWordCnt, CurWordIndex} | OldMatchList];
        %% The new hit starts inside, or directly after, the previous hit:
        %% widen the previous hit so that it ends at the new position.
        CurStartIndex >= OldStartIndex ->
            [{CurByteIndex, CurWordIndex - OldStartIndex, CurWordIndex} | LeftMatchList];
        %% The new hit starts before the previous one and so covers it
        %% completely: drop the previous hit and compare with the next one.
        true ->
            dealMatchList(LeftMatchList, CurByteIndex, MatchWordCnt, CurWordIndex)
    end.
%% Walks the binary one UTF-8 code point at a time and collects the hits
%% needed by replaceSw/1.
%%   TotalSize - reference size supplied by the caller; the byte position
%%               recorded for a hit is TotalSize minus the bytes still unread
%%   CurIndex  - index of the current counted character
%%   State     - current automaton state
%%   MatchList - hits gathered so far, maintained by dealMatchList/4
%% Returns MatchList once the input is exhausted.
doMatchRs(<<>>, _TotalSize, _CurIndex, _State, MatchList) ->
    MatchList;
doMatchRs(<<Char/utf8, Rest/binary>>, TotalSize, CurIndex, State, MatchList) ->
    case acsSpw:getSpw(Char) of
        true ->
            %% Special character: skipped, CurIndex and State stay untouched.
            doMatchRs(Rest, TotalSize, CurIndex, State, MatchList);
        _ ->
            {NextState, HitLen} = matchWordRs(Char, State, 0),
            Hits =
                case HitLen of
                    0 ->
                        %% No pattern ends on this character.
                        MatchList;
                    _ ->
                        dealMatchList(MatchList, TotalSize - byte_size(Rest), HitLen, CurIndex)
                end,
            doMatchRs(Rest, TotalSize, CurIndex + 1, NextState, Hits)
    end.
%% Advances the automaton by one character.
%% If acsTree:goto(State) yields a map with an entry for Word, that entry is
%% the next state and getOutputRs/2 supplies the match length for it.
%% Otherwise the search is retried from the fail state of State; at state 0
%% there is nothing left to fall back to and the inputs are returned as-is.
%% Returns {NewState, MatchCnt}.
matchWordRs(Word, State, MatchCnt) ->
    case acsTree:goto(State) of
        #{Word := NextState} ->
            {NextState, getOutputRs(NextState, MatchCnt)};
        _ when State =:= 0 ->
            {State, MatchCnt};
        _ ->
            {FailState, _} = acsTree:failOut(State),
            matchWordRs(Word, FailState, MatchCnt)
    end.
%% Follows the fail chain from State down to state 0 and returns the largest
%% pattern length reported along the way, or MatchCnt if nothing on the
%% chain exceeds it.
getOutputRs(0, MatchCnt) ->
    MatchCnt;
getOutputRs(State, MatchCnt) ->
    {FailState, Pattern} = acsTree:failOut(State),
    Longest =
        case Pattern of
            undefined ->
                %% This state carries no output.
                MatchCnt;
            _ ->
                max(MatchCnt, Pattern)
        end,
    getOutputRs(FailState, Longest).
% *************************************** replaceSw end *************************************************************
strSize(<<>>, Cnt) ->
Cnt;

+ 11
- 0
src/genAcs.erl Parādīt failu

@ -5,12 +5,15 @@
, genTree/1
]).
-define(Spw, <<" ~〜,,::.。;;-_=+*&^…%$#@!!|??'‘’\"“”`·()[]{}()【】「」//\\\n\t"/utf8>>).
main(Args) ->
[SWFile, WriteDir] = Args,
case file:open(SWFile, [read, raw, binary, {read_ahead, 65536}, {encoding, utf8}]) of
{ok, IoDevice} ->
{Goto, Output} = dealEveryLine(IoDevice, _Goto = #{0 => #{}}, _Output = #{}, _State = 0),
Fail = genFail(Goto),
genSpw(WriteDir),
genErl(WriteDir, Goto, Fail, Output);
_Err ->
io:format("genAcs open the file:~p error ~p~n", [SWFile, _Err])
@ -167,6 +170,14 @@ genFailOut([State | SortStates], Fail, Output, StrAcc) ->
genFailOut(SortStates, Fail, Output, StrAcc)
end.
%% Generates acsSpw.erl in WriteDir: one `getSpw(CodePoint) -> true;` clause
%% per character of the ?Spw macro, followed by a catch-all clause that
%% returns false. Returns the result of file:write_file/2.
genSpw(WriteDir) ->
    Header = <<"-module(acsSpw).\n\n-compile([deterministic, no_line_info]).\n\n-export([getSpw/1]).\n\n">>,
    Clauses = [
        [<<"getSpw(">>, integer_to_binary(CodePoint), <<") -> true;\n">>]
        || <<CodePoint/utf8>> <= ?Spw
    ],
    Fallback = <<"getSpw(_) -> false.">>,
    Target = filename:join([WriteDir, "acsSpw.erl"]),
    %% file:write_file/2 accepts iodata, so the pieces are written without
    %% being joined into one binary first.
    file:write_file(Target, [Header, Clauses, Fallback]).
genErl(WriteDir, Goto, Fail, Output) ->
HeadStr = genHead(),
GotoStr = genGoto(Goto, HeadStr),

+ 5
- 3
src/test/test.txt Parādīt failu

@ -1,6 +1,8 @@
abcd
cd
c
abcfdsssafdsgdsd
sssa
ssafds
sg
afd

Notiek ielāde…
Atcelt
Saglabāt