|
|
@ -7,6 +7,8 @@ |
|
|
|
, strSize/2 %% 获取utf8字符串的长度 |
|
|
|
]). |
|
|
|
|
|
|
|
-define(RW, 42). %% Code point of the replacement character emitted for each masked character (42 is $*)
|
|
|
|
|
|
|
%% state 0 is the root node |
|
|
|
%% Goto: State -> #{Word -> State} |
|
|
|
%% failOut: State -> {FailState, BinStr} |
|
|
@ -132,14 +134,73 @@ getOutputIs(State) -> |
|
|
|
%% *************************************** matchSw end *************************************************************** |
|
|
|
%% *************************************** replaceSw start ************************************************************* |
|
|
|
%% Replaces the sensitive words found in BinStr and returns the resulting binary.
%%
%% BinStr is scanned once with doMatchRs/5, then the collected matches are
%% rewritten front to back by doReplaceSw/5.
%% NOTE(review): doMatchRs/5 is defined outside this view; it is assumed to
%% return {CurByteIndex, MatchWordCnt, CurWordIndex} tuples newest-first, which
%% is why the list is reversed before replacing - confirm against doMatchRs.
replaceSw(BinStr) ->
    %% Computed once: needed both as the scan bound and to copy the unmatched tail.
    TotalSize = byte_size(BinStr),
    MatchBIMWs = doMatchRs(BinStr, TotalSize - 1, _Index = 1, _State = 0, _MatchList = []),
    doReplaceSw(lists:reverse(MatchBIMWs), BinStr, TotalSize, _StartPos = 0, <<>>).
|
|
|
|
|
|
|
%% Front-to-back replacement, 4-arity placeholder variant.
%% Both clauses ignore the match list, the source binary and the start
%% position; they only flatten the accumulator into a single binary.
doReplaceSw([], _Source, _Pos, Acc) ->
    iolist_to_binary(Acc);
doReplaceSw([{_ByteIdx, _WordCnt, _WordIdx} | _Remaining], _Source, _Pos, Acc) ->
    iolist_to_binary(Acc).
|
|
|
%% Rebuilds the output binary by walking the match list front to back.
%%
%% Arguments:
%%   Matches   - list of {CurByteIndex, MatchWordCnt, CurWordIndex}; CurByteIndex
%%               is the byte index where getMatchWords/6 starts scanning backward.
%%   BinStr    - the original binary.
%%   TotalSize - byte_size(BinStr), passed in by the caller.
%%   StartPos  - byte index of the first byte not yet copied to the output.
%%   BinAcc    - output built so far.
%%
%% No matches left: append whatever remains after StartPos, if anything.
doReplaceSw([], BinStr, TotalSize, StartPos, BinAcc) when TotalSize > StartPos ->
    Tail = binary:part(BinStr, StartPos, TotalSize - StartPos),
    <<BinAcc/binary, Tail/binary>>;
doReplaceSw([], _BinStr, _TotalSize, _StartPos, BinAcc) ->
    BinAcc;
doReplaceSw([{CurByteIndex, MatchWordCnt, _CurWordIndex} | Rest], BinStr, TotalSize, StartPos, BinAcc) ->
    %% EndByteIndex is where the backward scan stopped; FilterWs is the list of
    %% code points to emit in place of the scanned bytes.
    {EndByteIndex, FilterWs} = getMatchWords(MatchWordCnt, BinStr, CurByteIndex, _BslCnt = 0, _Utf8Code = 0, []),
    Masked = unicode:characters_to_binary(FilterWs, utf8),
    %% Bytes StartPos..EndByteIndex (inclusive) are copied through untouched.
    Untouched =
        case StartPos =< EndByteIndex of
            true -> binary:part(BinStr, StartPos, EndByteIndex - StartPos + 1);
            _ -> <<>>
        end,
    doReplaceSw(Rest, BinStr, TotalSize, CurByteIndex + 1, <<BinAcc/binary, Untouched/binary, Masked/binary>>).
|
|
|
|
|
|
|
%% Scans BinStr backward from ByteIndex, decoding one UTF-8 character at a time,
%% until MatchWordCnt non-special characters have been consumed.
%%
%% Arguments:
%%   MatchWordCnt - number of non-special characters still to consume.
%%   BinStr       - binary being scanned.
%%   ByteIndex    - 0-based index of the byte to examine next (moves toward 0).
%%   BslCnt       - bit offset for the next continuation byte of the character
%%                  currently being decoded (0 when between characters).
%%   Utf8Code     - bits of the current character decoded so far.
%%   FilterWs     - code points collected so far; since the scan runs backward
%%                  and prepends, the list ends up in forward order.
%%
%% Returns {ByteIndex, FilterWs}, where ByteIndex is the index of the byte just
%% before the first consumed byte (-1 if the scan consumed byte 0).
%%
%% Characters for which acsSpw:getSpw/1 returns true are kept verbatim and do
%% not count toward MatchWordCnt; every other character is replaced by ?RW.
%% Crashes with badarg if the scan runs off the front of BinStr, and with
%% case_clause on a lead byte that has no preceding continuation bytes.
getMatchWords(0, _BinStr, ByteIndex, _BslCnt, _Utf8Code, FilterWs) ->
    {ByteIndex, FilterWs};
getMatchWords(MatchWordCnt, BinStr, ByteIndex, BslCnt, Utf8Code, FilterWs) ->
    Byte = binary:at(BinStr, ByteIndex),
    if
        Byte band 2#10000000 =:= 0 ->
            %% 0xxxxxxx: single-byte (ASCII) character.
            keepOrMaskWord(Byte, MatchWordCnt, BinStr, ByteIndex, FilterWs);
        Byte band 2#11000000 =:= 2#10000000 ->
            %% 10xxxxxx: continuation byte. Both top bits must be tested;
            %% checking only the high bit also matches lead bytes (11xxxxxx)
            %% and would make the lead-byte branch below unreachable.
            Code = Byte band 2#00111111,
            getMatchWords(MatchWordCnt, BinStr, ByteIndex - 1, BslCnt + 6, (Code bsl BslCnt) + Utf8Code, FilterWs);
        true ->
            %% 11xxxxxx: lead byte. The number of continuation bits already
            %% gathered tells how many payload bits the lead byte carries.
            LeadBits =
                case BslCnt of
                    6 -> Byte band 2#00011111;
                    12 -> Byte band 2#00001111;
                    18 -> Byte band 2#00000111;
                    24 -> Byte band 2#00000011;
                    30 -> Byte band 2#00000001
                end,
            FullWord = (LeadBits bsl BslCnt) + Utf8Code,
            keepOrMaskWord(FullWord, MatchWordCnt, BinStr, ByteIndex, FilterWs)
    end.

%% Keeps Word verbatim if it is a special character, otherwise emits ?RW and
%% counts it; then continues the backward scan with a fresh decoder state.
keepOrMaskWord(Word, MatchWordCnt, BinStr, ByteIndex, FilterWs) ->
    case acsSpw:getSpw(Word) of
        true ->
            %% Special character: preserved, not counted toward the match.
            getMatchWords(MatchWordCnt, BinStr, ByteIndex - 1, 0, 0, [Word | FilterWs]);
        _ ->
            %% Ordinary character: masked and counted.
            getMatchWords(MatchWordCnt - 1, BinStr, ByteIndex - 1, 0, 0, [?RW | FilterWs])
    end.
|
|
|
|
|
|
|
%% 递归处理判断最新的匹配是否包含或者连接了历史匹配的{CurByteIndex, MatchWordCnt, CurWordIndex} |
|
|
|
dealMatchList([], CurByteIndex, MatchWordCnt, CurWordIndex) -> |
|
|
|