@ -0,0 +1,29 @@ | |||
.eunit | |||
*.o | |||
*.beam | |||
*.plt | |||
erl_crash.dump | |||
.concrete/DEV_MODE | |||
# rebar 2.x | |||
.rebar | |||
rel/example_project | |||
ebin/* | |||
deps | |||
# rebar 3 | |||
.rebar3 | |||
_build/ | |||
_checkouts/ | |||
rebar.lock | |||
# idea | |||
.idea | |||
*.iml | |||
cmake-build* | |||
CMakeLists.txt | |||
# nif compile temp file | |||
*.pdb | |||
*.d | |||
compile_commands.json |
@ -0,0 +1,21 @@ | |||
The MIT License | |||
Copyright (c) 2019-2020 alisdair sullivan <alisdairsullivan@yahoo.ca> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in | |||
all copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |||
THE SOFTWARE. |
@ -0,0 +1,9 @@ | |||
eAcs | |||
===== | |||
An Aho-Corasick multi-pattern string matching library for Erlang/OTP.
Build | |||
----- | |||
$ rebar3 compile |
@ -0,0 +1,3 @@ | |||
去你妈的 | |||
你妈 | |||
@ -0,0 +1,2 @@ | |||
{erl_opts, [no_debug_info]}. | |||
{deps, []}. |
@ -0,0 +1,33 @@ | |||
-module(acTest). | |||
-include_lib("eunit/include/eunit.hrl"). | |||
%% EUnit generator covering a multi-byte (Chinese) dictionary and an ASCII one.
%% The search tree is built with genAcs:genTree/1, which takes a list of UTF-8
%% binaries and stores each pattern binary in the output table, so reported
%% patterns are binaries while the searched text is a list of code points.
ahocorasick_test_() ->
    [
        {"test unicode",
            {setup,
                fun() -> genAcs:genTree([<<"去你妈的"/utf8>>, <<"你妈"/utf8>>]) end,
                fun aho_corasick_chn/1}},
        {"test ascii code",
            {setup,
                fun() -> genAcs:genTree([<<"BC">>, <<"ABCD">>]) end,
                fun aho_corasick_eng/1}}
    ].

%% Instantiator for the unicode fixture: one hit found through a failure link,
%% and one text with no hit at all.
aho_corasick_chn(Aho) ->
    [
        ?_assertEqual([{2, 3, <<"你妈"/utf8>>}], eAcs:match("去你妈", Aho)),
        ?_assertEqual([], eAcs:match("测试", Aho))
    ].

%% Instantiator for the ASCII fixture: "BC" must be reported inside "ABC" even
%% though the automaton is walking the "ABCD" branch.
aho_corasick_eng(Aho) ->
    [
        ?_assertEqual([{2, 3, <<"BC">>}], eAcs:match("ABC", Aho))
    ].
@ -0,0 +1,10 @@ | |||
{application, eAcs, | |||
[{description, "An OTP library"}, | |||
{vsn, "0.1.0"}, | |||
{registered, []}, | |||
{applications, [kernel, stdlib]}, | |||
{env, []}, | |||
{modules, []}, | |||
{licenses, ["MIT"]}, | |||
{links, []} | |||
]}. |
@ -0,0 +1,54 @@ | |||
-module(eAcs). | |||
-export([ | |||
match/2 | |||
]). | |||
%% State is used to locate node, every node is a map | |||
%% state 0 is the root node | |||
%% | |||
%% Goto: State -> Map{Char -> State} | |||
%% Output: State -> String
%% Failure: State -> State | |||
%% Search String (a list of characters) for every pattern held in the
%% {Goto, Failure, Output} tables.
%% Positions are 1-based. Matches are accumulated by prepending, so the
%% returned list holds the most recently found match first.
%% @return [{StartIndex, EndIndex, Pattern}, ...]
match(String, {_Goto, _Failure, _Output} = Tables) ->
    RootState = 0,
    FirstIndex = 1,
    do_match(String, RootState, Tables, FirstIndex, []).
%% Walk the input one character at a time, threading the automaton state,
%% the 1-based position of the current character and the matches found so far.
do_match([], _State, _Tables, _Index, Found) ->
    Found;
do_match([Char | Rest], State, {_Goto, _Failure, _Output} = Tables, Index, Found) ->
    {State1, Found1} = do_match_inner(Char, State, Tables, Index, Found),
    do_match(Rest, State1, Tables, Index + 1, Found1).
%% Consume one character from State.
%% If State has a transition on Char, take it and collect the outputs reachable
%% from the new state. Otherwise fall back along the failure links (a missing
%% entry means the root, 0) and retry; at the root the character is dropped.
%% @return {NewState, NewMatchList}
do_match_inner(Char, State, {Goto, Failure, _Output} = Tables, Index, MatchList) ->
    #{State := Transitions} = Goto,
    case maps:find(Char, Transitions) of
        {ok, Target} ->
            {Target, get_output(Target, Tables, Index, MatchList)};
        error when State =:= 0 ->
            {State, MatchList};
        error ->
            Fallback = maps:get(State, Failure, 0),
            do_match_inner(Char, Fallback, Tables, Index, MatchList)
    end.
%% Collect every pattern that ends at position Index.
%% Starting from State, follow the failure links up to the root (state 0) and
%% prepend {StartIndex, Index, Pattern} for each state that has an output entry.
%% A state without a failure entry falls back to the root.
%% Patterns may be stored as strings (lists) or as UTF-8 binaries; their length
%% is measured in code points so StartIndex lines up with the searched list.
get_output(0, _, _Index, MatchList) ->
    MatchList;
get_output(State, {_Goto, Failure, Output} = Tables, Index, MatchList) ->
    NewMatchList =
        case maps:find(State, Output) of
            error ->
                MatchList;
            {ok, Pattern} ->
                [{Index - pattern_length(Pattern) + 1, Index, Pattern} | MatchList]
        end,
    FailureState = maps:get(State, Failure, 0),
    get_output(FailureState, Tables, Index, NewMatchList).

%% Number of code points in a pattern, for both list and UTF-8 binary patterns.
pattern_length(Pattern) when is_list(Pattern) ->
    length(Pattern);
pattern_length(Pattern) when is_binary(Pattern) ->
    length(unicode:characters_to_list(Pattern, utf8)).
@ -0,0 +1,116 @@ | |||
-module(genAcs). | |||
-export([ | |||
main/1 | |||
, genTree/1 | |||
]). | |||
%% Entry point: Args must be a one-element list holding the path of the
%% pattern file (one pattern per line). Builds the goto/output tables from the
%% file, derives the failure table and passes all three to genErl/3.
%% On an open failure the error is printed and the io:format/2 result returned.
main(Args) ->
    [SNFile] = Args,
    %% The `encoding' option is not accepted for `raw' files (open would fail
    %% with badarg). Lines are read as plain bytes and decoded as UTF-8 by the
    %% binary patterns in addPattern/4.
    case file:open(SNFile, [read, raw, binary, {read_ahead, 65536}]) of
        {ok, IoDevice} ->
            try
                {Goto, Output} = dealEveryLine(IoDevice, #{0 => #{}}, #{}, 0),
                Failure = genFailure(Goto),
                genErl(Goto, Failure, Output)
            after
                %% Release the descriptor whether or not the build succeeded.
                file:close(IoDevice)
            end;
        Err ->
            io:format("genAcs open the file:~p error ~p~n", [SNFile, Err])
    end.
%% Read the pattern file line by line and insert every non-empty line into the
%% goto table, recording the state where the pattern ends in the output table.
%% MaxState is the highest state number allocated so far.
%% Returns {Goto, Output} at end of file; a read error is printed and then
%% raised as error({read_line_failed, Reason}).
dealEveryLine(IoDevice, Goto, Output, MaxState) ->
    case file:read_line(IoDevice) of
        {ok, Line} ->
            case stripEol(Line) of
                <<>> ->
                    %% Blank line: no pattern to register.
                    dealEveryLine(IoDevice, Goto, Output, MaxState);
                BinStr ->
                    {NewGoto, NewState, NewMaxState} = addPattern(BinStr, Goto, 0, MaxState),
                    NewOutput = Output#{NewState => BinStr},
                    dealEveryLine(IoDevice, NewGoto, NewOutput, NewMaxState)
            end;
        eof ->
            {Goto, Output};
        {error, Reason} = Err ->
            io:format("genAcs read the file error ~p~n", [Err]),
            erlang:error({read_line_failed, Reason})
    end.

%% Drop trailing line-ending bytes (\n or \r\n). The last line of a file may
%% have no terminator at all, in which case it is returned untouched.
stripEol(<<>>) ->
    <<>>;
stripEol(Line) ->
    case binary:last(Line) of
        $\n -> stripEol(binary:part(Line, 0, byte_size(Line) - 1));
        $\r -> stripEol(binary:part(Line, 0, byte_size(Line) - 1));
        _ -> Line
    end.
%% Build the Aho-Corasick search tables from a list of pattern binaries.
%% The goto and output tables are built first; the failure table is then
%% derived from the finished goto table.
%% @return {Goto, Failure, Output}
genTree(BinStrList) ->
    EmptyGoto = #{0 => #{}},
    {Goto, Output} = genGotoOutput(BinStrList, EmptyGoto, #{}, 0),
    {Goto, genFailure(Goto), Output}.
%% Insert each pattern into the goto table in turn and map the state where it
%% ends to the pattern itself in the output table.
%% MaxState carries the highest state number handed out so far.
%% @return {Goto, Output}
genGotoOutput([], Goto, Output, _MaxState) ->
    {Goto, Output};
genGotoOutput([Pattern | Rest], Goto, Output, MaxState) ->
    {Goto1, EndState, MaxState1} = addPattern(Pattern, Goto, 0, MaxState),
    genGotoOutput(Rest, Goto1, Output#{EndState => Pattern}, MaxState1).
%% Insert a UTF-8 binary pattern into the goto table, starting at State.
%% Existing transitions are reused; for a missing one a fresh state numbered
%% MaxState + 1 is allocated with an empty transition map.
%% @return {Goto, EndState, MaxState} where EndState is the state reached
%% after the last code point of the pattern.
addPattern(<<Word/utf8, Tail/binary>>, Goto, State, MaxState) ->
    #{State := Node} = Goto,
    case Node of
        #{Word := NextState} ->
            addPattern(Tail, Goto, NextState, MaxState);
        _ ->
            NewMaxState = MaxState + 1,
            NewNode = Node#{Word => NewMaxState},
            addPattern(Tail, Goto#{NewMaxState => #{}, State => NewNode}, NewMaxState, NewMaxState)
    end;
addPattern(<<>>, Goto, State, MaxState) ->
    {Goto, State, MaxState}.
%% Build the failure table for a finished goto table.
%% The traversal is seeded with the states directly below the root (state 0)
%% and starts from an empty failure map.
genFailure(#{0 := RootTransitions} = Goto) ->
    genFailure(maps:values(RootTransitions), Goto, #{}).
%% Build the failure table with a breadth-first traversal.
%% The first argument is the list of states of the current depth, in visiting
%% order. For each of them the failure links of its children are computed from
%% the state's own failure link (root when it has none), and the children are
%% collected, in the same order, as the next level to visit.
%% Collecting a whole level at a time avoids appending to the pending queue
%% after every visited state, which copied the queue each time.
genFailure([], _Goto, Failure) ->
    Failure;
genFailure(Level, Goto, Failure) ->
    {NextLevelRev, NewFailure} =
        lists:foldl(
            fun(State, {ChildrenRev, FailureAcc}) ->
                #{State := Node} = Goto,
                %% starting point for the children: this state's failure link
                FailureState = maps:get(State, FailureAcc, 0),
                FailureAcc1 = genFailureInner(maps:to_list(Node), FailureState, Goto, FailureAcc),
                %% keep the children in visiting order once the list is reversed
                {lists:reverse(maps:values(Node), ChildrenRev), FailureAcc1}
            end,
            {[], Failure},
            Level
        ),
    genFailure(lists:reverse(NextLevelRev), Goto, NewFailure).
%% Compute the failure link of every child of one state.
%% Kvs is the list of {Word, ChildState} transitions of that state and
%% FailureState is the failure link of the state itself.
%% @return the failure table extended with the children's links
genFailureInner(Kvs, FailureState, Goto, Failure) ->
    lists:foldl(
        fun({Word, ChildState}, Acc) ->
            findFailureNode(Word, ChildState, FailureState, Goto, Acc)
        end,
        Failure,
        Kvs
    ).
%% Compute the failure link of a single child state reached on Word.
%% Starting at FailureState, look for a transition on Word:
%%   - found: that transition's target becomes the child's failure link;
%%   - not found at the root: no entry is added (the child falls back to root);
%%   - not found elsewhere: continue from FailureState's own failure link.
findFailureNode(Word, State, FailureState, Goto, Failure) ->
    #{FailureState := Candidates} = Goto,
    case Candidates of
        #{Word := Target} ->
            Failure#{State => Target};
        _ when FailureState =:= 0 ->
            Failure;
        _ ->
            Parent = maps:get(FailureState, Failure, 0),
            findFailureNode(Word, State, Parent, Goto, Failure)
    end.
%% Placeholder: accepts the goto, failure and output tables, ignores all three
%% and returns ok.
genErl(_Goto, _Failure, _Output) ->
    ok.