@ -0,0 +1,29 @@ | |||||
.eunit | |||||
*.o | |||||
*.beam | |||||
*.plt | |||||
erl_crash.dump | |||||
.concrete/DEV_MODE | |||||
# rebar 2.x | |||||
.rebar | |||||
rel/example_project | |||||
ebin/* | |||||
deps | |||||
# rebar 3 | |||||
.rebar3 | |||||
_build/ | |||||
_checkouts/ | |||||
rebar.lock | |||||
# idea | |||||
.idea | |||||
*.iml | |||||
cmake-build* | |||||
CMakeLists.txt | |||||
# nif compile temp file | |||||
*.pdb | |||||
*.d | |||||
compile_commands.json |
@ -0,0 +1,21 @@ | |||||
The MIT License | |||||
Copyright (c) 2019-2020 alisdair sullivan <alisdairsullivan@yahoo.ca> | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
of this software and associated documentation files (the "Software"), to deal | |||||
in the Software without restriction, including without limitation the rights | |||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
copies of the Software, and to permit persons to whom the Software is | |||||
furnished to do so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in | |||||
all copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |||||
THE SOFTWARE. |
@ -0,0 +1,9 @@ | |||||
eAcs | |||||
===== | |||||
An Aho-Corasick multi-pattern string matching library for Erlang/OTP.
Build | |||||
----- | |||||
$ rebar3 compile |
@ -0,0 +1,3 @@ | |||||
去你妈的 | |||||
你妈 | |||||
@ -0,0 +1,2 @@ | |||||
{erl_opts, [no_debug_info]}. | |||||
{deps, []}. |
@ -0,0 +1,33 @@ | |||||
-module(acTest).

-include_lib("eunit/include/eunit.hrl").

%% EUnit generator: each fixture builds an automaton in its setup fun and
%% passes it to an instantiator that returns the assertions.
%%
%% The automaton is built with genAcs:genTree/1 (eAcs exports only match/2).
%% Patterns are UTF-8 binaries because genAcs:addPattern/4 consumes them with
%% <<Word/utf8, Tail/binary>>; the text given to eAcs:match/2 stays a list of
%% code points because eAcs:do_match/5 walks it with [Char | Tail].
ahocorasick_test_() ->
    [
        {
            "test unicode",
            {
                setup,
                fun() -> genAcs:genTree([<<"去你妈的"/utf8>>, <<"你妈"/utf8>>]) end,
                fun aho_corasick_chn/1
            }
        },
        {
            "test ascii code",
            {
                setup,
                fun() -> genAcs:genTree([<<"BC">>, <<"ABCD">>]) end,
                fun aho_corasick_eng/1
            }
        }
    ].

%% A shorter pattern that is a suffix of a partially matched longer pattern
%% must be reported, and unrelated text must produce no match.
%% The reported pattern is the binary that was stored when the tree was built.
aho_corasick_chn(Aho) ->
    [
        ?_assertEqual([{2, 3, <<"你妈"/utf8>>}], eAcs:match("去你妈", Aho)),
        ?_assertEqual([], eAcs:match("测试", Aho))
    ].

%% "BC" is found inside "ABC" even though the scan is following the "ABCD"
%% branch at that point.
aho_corasick_eng(Aho) ->
    Result = eAcs:match("ABC", Aho),
    ?_assertEqual([{2, 3, <<"BC">>}], Result).
@ -0,0 +1,10 @@ | |||||
{application, eAcs, | |||||
[{description, "An OTP library"}, | |||||
{vsn, "0.1.0"}, | |||||
{registered, []}, | |||||
{applications, [kernel, stdlib]}, | |||||
{env, []}, | |||||
{modules, []}, | |||||
{licenses, ["MIT"]}, | |||||
{links, []} | |||||
]}. |
@ -0,0 +1,54 @@ | |||||
-module(eAcs). | |||||
-export([ | |||||
match/2 | |||||
]). | |||||
%% State is used to locate node, every node is a map | |||||
%% state 0 is the root node | |||||
%% | |||||
%% Goto: State -> Map{Char -> State} | |||||
%% Output: State -> String
%% Failure: State -> State | |||||
%% Searches String for every pattern known to the automaton.
%% String is consumed one list element at a time, starting in the root
%% state 0; positions are 1-based.
%% @return [{StartIndex, EndIndex, Pattern}, ...] - hits are prepended as they
%% are found, so the most recent hit is at the head of the list.
match(String, {_Goto, _Failure, _Output} = Tree) ->
    RootState = 0,
    FirstIndex = 1,
    do_match(String, RootState, Tree, FirstIndex, []).
%% Feeds each character of the input to do_match_inner/5, threading the
%% automaton state, the 1-based position and the accumulated matches.
%% Returns the accumulated match list once the input is exhausted.
do_match(String, State, Tree, Index, MatchList) ->
    Step =
        fun(Char, {CurState, CurIndex, Found}) ->
            {NextState, NextFound} = do_match_inner(Char, CurState, Tree, CurIndex, Found),
            {NextState, CurIndex + 1, NextFound}
        end,
    {_LastState, _NextIndex, Matches} = lists:foldl(Step, {State, Index, MatchList}, String),
    Matches.
%% Consumes a single character from State.
%% - If State has a transition on Char, move there and collect every pattern
%%   ending at Index (see get_output/4).
%% - Otherwise, at the root the character is simply skipped; anywhere else the
%%   failure link is followed (missing links default to the root) and the same
%%   character is retried.
%% @return {NewState, NewMatchList}
do_match_inner(Char, State, {Goto, Failure, _Output} = Tree, Index, MatchList) ->
    #{State := Transitions} = Goto,
    case Transitions of
        #{Char := Target} ->
            {Target, get_output(Target, Tree, Index, MatchList)};
        _ when State =:= 0 ->
            {State, MatchList};
        _ ->
            Fallback = maps:get(State, Failure, 0),
            do_match_inner(Char, Fallback, Tree, Index, MatchList)
    end.
%% Collects every pattern that ends at position Index.
%% Starting from State, the chain of failure links is followed down to the
%% root (state 0); each state on the chain that has an entry in Output adds
%% one {StartIndex, Index, Pattern} tuple to the front of MatchList.
%% A state without a failure entry falls back to the root.
get_output(0, _, _Index, MatchList) ->
    MatchList;
get_output(State, {Goto, Failure, Output}, Index, MatchList) ->
    NewMatchList =
        case maps:find(State, Output) of
            error ->
                MatchList;
            {ok, Pattern} ->
                [{Index - pattern_length(Pattern) + 1, Index, Pattern} | MatchList]
        end,
    FailureState = maps:get(State, Failure, 0),
    get_output(FailureState, {Goto, Failure, Output}, Index, NewMatchList).

%% Number of characters in a stored pattern.
%% Patterns may be lists of code points or UTF-8 binaries (genAcs stores the
%% binaries it was given); length/1 alone raises badarg on a binary.
%% Binaries are decoded so the count is in code points, matching the unit in
%% which the input string is indexed.
pattern_length(Pattern) when is_binary(Pattern) ->
    length(unicode:characters_to_list(Pattern, utf8));
pattern_length(Pattern) when is_list(Pattern) ->
    length(Pattern).
@ -0,0 +1,116 @@ | |||||
-module(genAcs). | |||||
-export([ | |||||
main/1 | |||||
, genTree/1 | |||||
]). | |||||
%% Entry point: expects exactly one argument, the path of a word-list file
%% with one pattern per line.
%% Builds the goto/output tables from the file, derives the failure table and
%% passes all three to genErl/3. If the file cannot be opened, a message is
%% printed and the result of io:format/2 is returned.
main(Args) ->
    [SNFile] = Args,
    %% The file is read as raw bytes, so no {encoding, _} option is given:
    %% the file:open/2 documentation does not allow that option on raw files,
    %% and UTF-8 decoding is done later by the binary patterns in addPattern/4.
    case file:open(SNFile, [read, raw, binary, {read_ahead, 65536}]) of
        {ok, IoDevice} ->
            try
                {Goto, Output} = dealEveryLine(IoDevice, _Goto = #{0 => #{}}, _Output = #{}, _State = 0),
                Failure = genFailure(Goto),
                genErl(Goto, Failure, Output)
            after
                %% Release the descriptor even when reading or building fails.
                file:close(IoDevice)
            end;
        Err ->
            io:format("genAcs open the file:~p error ~p~n", [SNFile, Err])
    end.
%% Reads the word list line by line and inserts every non-empty line into the
%% goto table as one pattern, recording the pattern in Output under the state
%% where it ends.
%% MaxState is the highest state id allocated so far.
%% Returns {Goto, Output} at end of file. A read error is printed and then
%% raised as error({read_line_failed, Reason}) so the caller does not fail
%% later on an unrelated badmatch.
dealEveryLine(IoDevice, Goto, Output, MaxState) ->
    case file:read_line(IoDevice) of
        {ok, DataStr} ->
            case trimLineEnd(DataStr) of
                <<>> ->
                    %% Blank line: nothing to insert (it would otherwise
                    %% register an empty pattern on the root state).
                    dealEveryLine(IoDevice, Goto, Output, MaxState);
                BinStr ->
                    {NewGoto, NewState, NewMaxState} = addPattern(BinStr, Goto, 0, MaxState),
                    NewOutput = Output#{NewState => BinStr},
                    dealEveryLine(IoDevice, NewGoto, NewOutput, NewMaxState)
            end;
        eof ->
            {Goto, Output};
        {error, Reason} ->
            io:format("genAcs read the file error ~p~n", [{error, Reason}]),
            error({read_line_failed, Reason})
    end.

%% Drops the line terminator ("\n" or "\r\n") if there is one.
%% The last line of a file may have no terminator at all; in that case the
%% line is returned unchanged instead of losing its final byte.
trimLineEnd(Line) ->
    hd(binary:split(Line, [<<"\r\n">>, <<"\n">>])).
%% Builds the Aho-Corasick search tree from a list of patterns.
%% The goto and output tables are built first by inserting every pattern;
%% the failure table is then derived from the finished goto table.
%% @return {Goto, Failure, Output}
genTree(BinStrList) ->
    RootOnly = #{0 => #{}},
    {Goto, Output} = genGotoOutput(BinStrList, RootOnly, #{}, 0),
    {Goto, genFailure(Goto), Output}.
%% Inserts every pattern of the list into the goto table, always starting
%% from the root state 0, and records each pattern in Output under the state
%% where it ends. MaxState is the highest state id allocated so far.
%% @return {Goto, Output}
genGotoOutput(BinStrList, Goto, Output, MaxState) ->
    Insert =
        fun(BinStr, {GotoAcc, OutputAcc, MaxAcc}) ->
            {GotoNext, EndState, MaxNext} = addPattern(BinStr, GotoAcc, 0, MaxAcc),
            {GotoNext, OutputAcc#{EndState => BinStr}, MaxNext}
        end,
    {FinalGoto, FinalOutput, _FinalMax} =
        lists:foldl(Insert, {Goto, Output, MaxState}, BinStrList),
    {FinalGoto, FinalOutput}.
%% Inserts the remaining characters of a UTF-8 binary pattern into the goto
%% table, starting at State.
%% For each code point an existing transition is followed when there is one;
%% otherwise a new state numbered MaxState + 1 is allocated with an empty
%% transition map and linked from the current state.
%% @return {Goto, EndState, MaxState} where EndState is the state reached
%% after the last character and MaxState is the highest state id in use.
%% A binary that is not valid UTF-8 matches neither clause and raises
%% function_clause.
addPattern(<<Word/utf8, Tail/binary>>, Goto, State, MaxState) ->
    #{State := Node} = Goto,
    case Node of
        #{Word := NextState} ->
            addPattern(Tail, Goto, NextState, MaxState);
        _ ->
            NewMaxState = MaxState + 1,
            NewNode = Node#{Word => NewMaxState},
            addPattern(Tail, Goto#{NewMaxState => #{}, State => NewNode}, NewMaxState, NewMaxState)
    end;
addPattern(<<>>, Goto, State, MaxState) ->
    {Goto, State, MaxState}.
%% Builds the failure table for a finished goto table.
%% The traversal is seeded with the direct children of the root; they get no
%% failure entry of their own, and lookups elsewhere default a missing entry
%% to the root (state 0).
genFailure(#{0 := RootNode} = Goto) ->
    FirstLevel = maps:values(RootNode),
    EmptyFailure = #{},
    genFailure(FirstLevel, Goto, EmptyFailure).
%% Builds the failure table with a breadth-first traversal.
%% States holds one complete level of the trie. For every state on the level
%% the failure links of its children are computed, starting the search from
%% the state's own failure link (root if it has none); the children are then
%% gathered, in the same order, as the next level.
%% Working level by level visits states in the same order as a FIFO queue but
%% avoids re-copying the whole queue with ++ for every visited state.
genFailure([], _Goto, Failure) ->
    Failure;
genFailure(States, Goto, Failure) ->
    Visit =
        fun(State, {FailureAcc, ChildrenAcc}) ->
            #{State := Node} = Goto,
            %% starting point: the failure link of the parent
            FailureState = maps:get(State, FailureAcc, 0),
            %% compute the failure link of every child
            NewFailureAcc = genFailureInner(maps:to_list(Node), FailureState, Goto, FailureAcc),
            %% remember the children for the next level
            {NewFailureAcc, [maps:values(Node) | ChildrenAcc]}
        end,
    {NewFailure, RevChildren} = lists:foldl(Visit, {Failure, []}, States),
    NextLevel = lists:append(lists:reverse(RevChildren)),
    genFailure(NextLevel, Goto, NewFailure).
%% Computes the failure links for all children of one node.
%% Kvs is the node's transition list [{Word, ChildState}];
%% @param FailureState is the failure link of the node itself, used as the
%% starting point of the search for each child.
%% Returns the failure table extended with the links that were found.
genFailureInner(Kvs, FailureState, Goto, Failure) ->
    lists:foldl(
        fun({Word, State}, FailureAcc) ->
            findFailureNode(Word, State, FailureState, Goto, FailureAcc)
        end,
        Failure,
        Kvs).
%% Computes the failure link for a single child state.
%% Word is the character leading to State; FailureState is where the search
%% currently stands. The search climbs the failure chain until a state with a
%% transition on Word is found.
findFailureNode(Word, State, FailureState, Goto, Failure) ->
    #{FailureState := Candidate} = Goto,
    case Candidate of
        #{Word := Target} ->
            %% The nearest failure state has a child for this character:
            %% that child becomes the failure link.
            Failure#{State => Target};
        _ when FailureState =:= 0 ->
            %% Not found and already at the root: no entry is stored.
            Failure;
        _ ->
            %% Not found but not yet at the root: keep climbing.
            Higher = maps:get(FailureState, Failure, 0),
            findFailureNode(Word, State, Higher, Goto, Failure)
    end.
%% Placeholder for emitting the goto, failure and output tables as Erlang
%% source; it currently ignores all three arguments and returns ok.
%% The arguments are underscore-prefixed so the stub compiles without
%% unused-variable warnings.
genErl(_Goto, _Failure, _Output) ->
    ok.