From 66e39f21f8471f4f9ee60817eea8cc2c686793aa Mon Sep 17 00:00:00 2001 From: Sungjin Park Date: Mon, 5 Jan 2015 16:27:13 +0900 Subject: [PATCH 01/27] Add lager_manager_killer and tests for it. --- rebar.config | 2 ++ src/lager_app.erl | 26 ++++++++++++++ src/lager_handler_watcher.erl | 8 +++++ src/lager_manager_killer.erl | 44 ++++++++++++++++++++++++ test/lager_manager_killer_test.erl | 55 ++++++++++++++++++++++++++++++ test/lager_slow_backend.erl | 34 ++++++++++++++++++ 6 files changed, 169 insertions(+) create mode 100644 src/lager_manager_killer.erl create mode 100644 test/lager_manager_killer_test.erl create mode 100644 test/lager_slow_backend.erl diff --git a/rebar.config b/rebar.config index 494a6ca..dbb749a 100644 --- a/rebar.config +++ b/rebar.config @@ -10,3 +10,5 @@ {cover_enabled, true}. {edoc_opts, [{stylesheet_file, "./priv/edoc.css"}]}. + +{eunit_opts, [verbose]}. diff --git a/src/lager_app.erl b/src/lager_app.erl index 6364aef..e50b41c 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -62,6 +62,32 @@ start(_StartType, _StartArgs) -> throw({error, bad_config}) end, + case application:get_env(lager, killer_hwm) of + undefined -> + ok; + {ok, undefined} -> + undefined; + {ok, KillerHWM} when is_integer(KillerHWM), KillerHWM >= 0 -> + KillerReinstallAfter = + case application:get_env(lager, killer_reinstall_after) of + undefined -> + 5000; + {ok, undefined} -> + 5000; + {ok, V} when is_integer(V), V >= 0 -> + V; + {ok, BadKillerReinstallAfter} -> + error_logger:error_msg("Invalid value for 'cooldown': ~p~n", [BadKillerReinstallAfter]), + throw({error, bad_config}) + end, + _ = supervisor:start_child(lager_handler_watcher_sup, + [lager_event, lager_manager_killer, [KillerHWM, KillerReinstallAfter]]), + ok; + {ok, BadKillerHWM} -> + error_logger:error_msg("Invalid value for 'floodline': ~p~n", [BadKillerHWM]), + throw({error, bad_config}) + end, + Handlers = case application:get_env(lager, handlers) of undefined -> [{lager_console_backend, info}, diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index 24498a4..1cd2804 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -61,6 +61,14 @@ handle_info({gen_event_EXIT, Module, normal}, #state{module=Module} = State) -> {stop, normal, State}; handle_info({gen_event_EXIT, Module, shutdown}, #state{module=Module} = State) -> {stop, normal, State}; +handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [KillerHWM, KillerReinstallAfter]}}}, + #state{module=Module, event=Event} = State) -> + % Brutally kill the manager but stay alive to restore settings. + Manager = whereis(Event), + unlink(Manager), + exit(Manager, kill), + erlang:send_after(KillerReinstallAfter, self(), reinstall_handler), + {noreply, State#state{config=[KillerHWM, KillerReinstallAfter]}}; handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, config=Config, event=Event} = State) -> case lager:log(error, self(), "Lager event handler ~p exited with reason ~s", diff --git a/src/lager_manager_killer.erl b/src/lager_manager_killer.erl new file mode 100644 index 0000000..bb959f9 --- /dev/null +++ b/src/lager_manager_killer.erl @@ -0,0 +1,44 @@ +-module(lager_manager_killer). +-author("Sungjin Park "). +-behavior(gen_event). + +-export([init/1, handle_event/2, handle_call/2, handle_info/2, terminate/2, code_change/3]). + +-include("lager.hrl"). + +-record(state, { + killer_hwm :: non_neg_integer(), + killer_reinstall_after :: non_neg_integer() +}). + +init([KillerHWM, KillerReinstallAfter]) -> + {ok, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}}. + +handle_call(get_loglevel, State) -> + {ok, {mask, ?LOG_NONE}, State}; +handle_call({set_loglevel, _Level}, State) -> + {ok, ok, State}; +handle_call(get_settings, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> + {ok, [KillerHWM, KillerReinstallAfter], State}; +handle_call(_Request, State) -> + {ok, ok, State}. + +handle_event({log, _Message}, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> + {message_queue_len, Len} = process_info(self(), message_queue_len), + case Len > KillerHWM of + true -> + exit({kill_me, [KillerHWM, KillerReinstallAfter]}); + _ -> + {ok, State} + end; +handle_event(_Event, State) -> + {ok, State}. + +handle_info(_Info, State) -> + {ok, State}. + +terminate(_Reason, _State) -> + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl new file mode 100644 index 0000000..1fcb59f --- /dev/null +++ b/test/lager_manager_killer_test.erl @@ -0,0 +1,55 @@ +-module(lager_manager_killer_test). +-author("Sungjin Park "). + +-compile([{parse_transform, lager_transform}]). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). + +overload_test() -> + application:stop(lager), + application:load(lager), + Delay = 1000, % sleep 1 sec on every log + KillerHWM = 10, % kill the manager if there are more than 10 pending logs + KillerReinstallAfter = 1000, % reinstall killer after 1 sec + application:set_env(lager, handlers, [{lager_slow_backend, Delay}]), + application:set_env(lager, async_threshold, undefined), + application:set_env(lager, killer_hwm, KillerHWM), + application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), + ensure_started(lager), + lager_config:set(async, true), + Manager = whereis(lager_event), + erlang:trace(all, true, [procs]), + [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], + Margin = 100, + receive + {trace, Manager, exit, killed} -> + ?debugFmt("Manager ~p killed", [Manager]) + after Delay+Margin -> + ?assert(false) + end, + receive + {trace, _Sup, spawn, Pid, Fun} -> + ?assert(process_info(Pid, registered_name) =:= {registered_name, lager_event}), + ?debugFmt("Manager ~p start with ~p", [Pid, Fun]), + ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(lager_event))) + after Margin -> + ?assert(false) + end, + erlang:trace(all, false, [procs]), + timer:sleep(KillerReinstallAfter), + ?assert(proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event))), + ?assert(gen_event:call(lager_event, lager_manager_killer, get_settings) =:= [KillerHWM, KillerReinstallAfter]), + ?debugFmt("Killer reinstalled with [~p, ~p]", [KillerHWM, KillerReinstallAfter]), + application:stop(lager). + +ensure_started(App) -> + case application:start(App) of + ok -> + ok; + {error, {not_started, Dep}} -> + ensure_started(Dep), + ensure_started(App) + end. + +-endif. \ No newline at end of file diff --git a/test/lager_slow_backend.erl b/test/lager_slow_backend.erl new file mode 100644 index 0000000..9771083 --- /dev/null +++ b/test/lager_slow_backend.erl @@ -0,0 +1,34 @@ +-module(lager_slow_backend). +-author("Sungjin Park "). +-behavior(gen_event). + +-export([init/1, handle_call/2, handle_event/2, handle_info/2, terminate/2, code_change/3]). + +-include("lager.hrl"). + +-record(state, { + delay :: non_neg_integer() +}). + +init(Delay) -> + {ok, #state{delay=Delay}}. + +handle_call(get_loglevel, State) -> + {ok, lager_util:config_to_mask(debug), State}; +handle_call(_Request, State) -> + {ok, ok, State}. + +handle_event({log, _Message}, State) -> + timer:sleep(State#state.delay), + {ok, State}; +handle_event(_Event, State) -> + {ok, State}. + +handle_info(_Info, State) -> + {ok, State}. + +terminate(_Reason, _State) -> + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. From 03cd95562ae8821cc79cab297c0a69924b460344 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Thu, 10 Mar 2016 16:03:39 -0800 Subject: [PATCH 02/27] Attempt to redress test failures. At this point in the work, the killer will correctly stop the DEFAULT_SINK in the event of overload but our friend never comes back up and lager stops working past this point. Oops! Error message: =SUPERVISOR REPORT==== 10-Mar-2016::16:18:11 === Supervisor: {local,lager_handler_watcher_sup} Context: child_terminated Reason: killed Offender: [{pid,<0.63.0>}, {id,lager_handler_watcher}, {mfargs,{lager_handler_watcher,start_link,undefined}}, {restart_type,temporary}, {shutdown,5000}, {child_type,worker}] =SUPERVISOR REPORT==== 10-Mar-2016::16:18:11 === Supervisor: {local,lager_handler_watcher_sup} Context: child_terminated Reason: killed Offender: [{pid,<0.59.0>}, {id,lager_handler_watcher}, {mfargs,{lager_handler_watcher,start_link,undefined}}, {restart_type,temporary}, {shutdown,5000}, {child_type,worker}] =PROGRESS REPORT==== 10-Mar-2016::16:18:11 === supervisor: {local,lager_sup} started: [{pid,<0.113.0>}, {id,lager}, {mfargs,{gen_event,start_link,[{local,lager_event}]}}, {restart_type,permanent}, {shutdown,5000}, {child_type,worker}] Signed-off-by: Brian L. Troutwine --- src/lager_app.erl | 36 ++++++++++++++-------------- src/lager_handler_watcher.erl | 4 ++-- src/lager_manager_killer.erl | 38 +++++++++++++++--------------- test/lager_manager_killer_test.erl | 4 ++-- test/lager_slow_backend.erl | 2 +- 5 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/lager_app.erl b/src/lager_app.erl index 87ef4e9..67a5f7a 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -146,24 +146,24 @@ start_error_logger_handler(_, HWM, {ok, WhiteList}) -> throw({error, bad_config}) end, - KillerHWM = case application:get_env(lager, killer_hwm) of - undefined -> -1; - {ok, undefined} -> -1; - {ok, KHWM} when is_integer(KHWM), KHWM >= 0 -> KHWM - end, - KillerReinstallAfter = case application:get_env(lager, killer_reinstall_after) of - undefined -> 5000; - {ok, undefined} -> 5000; - {ok, V} when is_integer(V), V >= 0 -> V; - {ok, BadKillerReinstallAfter} -> - error_logger:error_msg("Invalid value for 'cooldown': ~p~n", - [BadKillerReinstallAfter]), - throw({error, bad_config}) - end, - - _ = supervisor:start_child(lager_handler_watcher_sup, - [?DEFAULT_SINK, lager_manager_killer, - [KillerHWM, KillerReinstallAfter]]), + %% Conditionally start the high watermark killer. + case application:get_env(lager, killer_hwm) of + undefined -> ok; + {ok, undefined} -> ok; + {ok, KillerHWM} when is_integer(KillerHWM), KillerHWM >= 0 -> + KillerReinstallAfter = case application:get_env(lager, killer_reinstall_after) of + undefined -> 5000; + {ok, undefined} -> 5000; + {ok, V} when is_integer(V), V >= 0 -> V; + {ok, BadKillerReinstallAfter} -> + error_logger:error_msg("Invalid value for 'cooldown': ~p~n", + [BadKillerReinstallAfter]), + throw({error, bad_config}) + end, + _ = supervisor:start_child(lager_handler_watcher_sup, + [?DEFAULT_SINK, lager_manager_killer, + [KillerHWM, KillerReinstallAfter]]) + end, case supervisor:start_child(lager_handler_watcher_sup, [error_logger, error_logger_lager_h, [HWM, GlStrategy]]) of {ok, _} -> diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index e34553c..da67d8b 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -61,14 +61,14 @@ handle_info({gen_event_EXIT, Module, normal}, #state{module=Module} = State) -> {stop, normal, State}; handle_info({gen_event_EXIT, Module, shutdown}, #state{module=Module} = State) -> {stop, normal, State}; -handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [KillerHWM, KillerReinstallAfter]}}}, +handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [_KillerHWM, KillerReinstallAfter]}}}, #state{module=Module, sink=Sink} = State) -> % Brutally kill the manager but stay alive to restore settings. Manager = whereis(Sink), unlink(Manager), exit(Manager, kill), erlang:send_after(KillerReinstallAfter, self(), reinstall_handler), - {noreply, State#state{config=[KillerHWM, KillerReinstallAfter]}}; + {noreply, State}; handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, config=Config, sink=Sink} = State) -> case lager:log(error, self(), "Lager event handler ~p exited with reason ~s", diff --git a/src/lager_manager_killer.erl b/src/lager_manager_killer.erl index bb959f9..158b471 100644 --- a/src/lager_manager_killer.erl +++ b/src/lager_manager_killer.erl @@ -7,38 +7,38 @@ -include("lager.hrl"). -record(state, { - killer_hwm :: non_neg_integer(), - killer_reinstall_after :: non_neg_integer() -}). + killer_hwm :: non_neg_integer(), + killer_reinstall_after :: non_neg_integer() + }). init([KillerHWM, KillerReinstallAfter]) -> - {ok, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}}. + {ok, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}}. handle_call(get_loglevel, State) -> - {ok, {mask, ?LOG_NONE}, State}; + {ok, {mask, ?LOG_NONE}, State}; handle_call({set_loglevel, _Level}, State) -> - {ok, ok, State}; + {ok, ok, State}; handle_call(get_settings, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> - {ok, [KillerHWM, KillerReinstallAfter], State}; + {ok, [KillerHWM, KillerReinstallAfter], State}; handle_call(_Request, State) -> - {ok, ok, State}. + {ok, ok, State}. handle_event({log, _Message}, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> - {message_queue_len, Len} = process_info(self(), message_queue_len), - case Len > KillerHWM of - true -> - exit({kill_me, [KillerHWM, KillerReinstallAfter]}); - _ -> - {ok, State} - end; + {message_queue_len, Len} = process_info(self(), message_queue_len), + case Len > KillerHWM of + true -> + exit({kill_me, [KillerHWM, KillerReinstallAfter]}); + _ -> + {ok, State} + end; handle_event(_Event, State) -> - {ok, State}. + {ok, State}. handle_info(_Info, State) -> - {ok, State}. + {ok, State}. terminate(_Reason, _State) -> - ok. + ok. code_change(_OldVsn, State, _Extra) -> - {ok, State}. + {ok, State}. diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl index 1fcb59f..6e63079 100644 --- a/test/lager_manager_killer_test.erl +++ b/test/lager_manager_killer_test.erl @@ -12,7 +12,7 @@ overload_test() -> Delay = 1000, % sleep 1 sec on every log KillerHWM = 10, % kill the manager if there are more than 10 pending logs KillerReinstallAfter = 1000, % reinstall killer after 1 sec - application:set_env(lager, handlers, [{lager_slow_backend, Delay}]), + application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), application:set_env(lager, async_threshold, undefined), application:set_env(lager, killer_hwm, KillerHWM), application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), @@ -52,4 +52,4 @@ ensure_started(App) -> ensure_started(App) end. --endif. \ No newline at end of file +-endif. diff --git a/test/lager_slow_backend.erl b/test/lager_slow_backend.erl index 9771083..e467c23 100644 --- a/test/lager_slow_backend.erl +++ b/test/lager_slow_backend.erl @@ -10,7 +10,7 @@ delay :: non_neg_integer() }). -init(Delay) -> +init([{delay, Delay}]) -> {ok, #state{delay=Delay}}. handle_call(get_loglevel, State) -> From 34d994280a1f594fa20d96672b98ebd8f4813309 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Mon, 14 Mar 2016 13:36:17 -0700 Subject: [PATCH 03/27] Update test to not rely on sleep. This test only passes when run in isolation. That is, if you run the suite solo everything is beautiful and nothing hurts. When run in the group, it fails. By inspection it seems that no trace messages are being sent and this causes the failure, for want of specific traces. Signed-off-by: Brian L. Troutwine --- test/lager_manager_killer_test.erl | 107 ++++++++++++++++++----------- 1 file changed, 65 insertions(+), 42 deletions(-) diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl index 6e63079..ac8898f 100644 --- a/test/lager_manager_killer_test.erl +++ b/test/lager_manager_killer_test.erl @@ -7,49 +7,72 @@ -include_lib("eunit/include/eunit.hrl"). overload_test() -> - application:stop(lager), - application:load(lager), - Delay = 1000, % sleep 1 sec on every log - KillerHWM = 10, % kill the manager if there are more than 10 pending logs - KillerReinstallAfter = 1000, % reinstall killer after 1 sec - application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), - application:set_env(lager, async_threshold, undefined), - application:set_env(lager, killer_hwm, KillerHWM), - application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), - ensure_started(lager), - lager_config:set(async, true), - Manager = whereis(lager_event), - erlang:trace(all, true, [procs]), - [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], - Margin = 100, - receive - {trace, Manager, exit, killed} -> - ?debugFmt("Manager ~p killed", [Manager]) - after Delay+Margin -> - ?assert(false) - end, - receive - {trace, _Sup, spawn, Pid, Fun} -> - ?assert(process_info(Pid, registered_name) =:= {registered_name, lager_event}), - ?debugFmt("Manager ~p start with ~p", [Pid, Fun]), - ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(lager_event))) - after Margin -> - ?assert(false) - end, - erlang:trace(all, false, [procs]), - timer:sleep(KillerReinstallAfter), - ?assert(proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event))), - ?assert(gen_event:call(lager_event, lager_manager_killer, get_settings) =:= [KillerHWM, KillerReinstallAfter]), - ?debugFmt("Killer reinstalled with [~p, ~p]", [KillerHWM, KillerReinstallAfter]), - application:stop(lager). + application:stop(lager), + application:load(lager), + Delay = 1000, % sleep 1 sec on every log + KillerHWM = 10, % kill the manager if there are more than 10 pending logs + KillerReinstallAfter = 1000, % reinstall killer after 1 sec + application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), + application:set_env(lager, async_threshold, undefined), + application:set_env(lager, killer_hwm, KillerHWM), + application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), + ensure_started(lager), + lager_config:set(async, true), + Manager = whereis(lager_event), + erlang:trace(all, true, [procs]), + [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], + Margin = 100, + ok = confirm_manager_exit(Manager, Delay+Margin), + ok = confirm_sink_reregister(Margin), + erlang:trace(all, false, [procs]), + wait_until(fun() -> + case proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event)) of + [] -> false; + _ -> true + end + end, Margin, 15), + wait_until(fun() -> + case gen_event:call(lager_event, lager_manager_killer, get_settings) of + [KillerHWM, KillerReinstallAfter] -> true; + _Other -> false + end + end, Margin, 15), + application:stop(lager). ensure_started(App) -> - case application:start(App) of - ok -> - ok; - {error, {not_started, Dep}} -> - ensure_started(Dep), - ensure_started(App) - end. + case application:start(App) of + ok -> + ok; + {error, {not_started, Dep}} -> + ensure_started(Dep), + ensure_started(App) + end. + +confirm_manager_exit(Manager, Delay) -> + receive + {trace, Manager, exit, killed} -> + ?debugFmt("Manager ~p killed", [Manager]); + Other -> + ?debugFmt("OTHER MSG: ~p", [Other]), + confirm_manager_exit(Manager, Delay) + after Delay -> + ?assert(false) + end. + +confirm_sink_reregister(Delay) -> + receive + {trace, _Pid, register, lager_event} -> + ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(lager_event))) + after Delay -> + ?assert(false) + end. + +wait_until(_Fun, _Delay, 0) -> + {error, too_many_retries}; +wait_until(Fun, Delay, Retries) -> + case Fun() of + true -> ok; + false -> timer:sleep(Delay), wait_until(Fun, Delay, Retries-1) + end. -endif. From 101dba232e22602255c4be7169e160b3de54a83e Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Thu, 17 Mar 2016 10:11:42 -0700 Subject: [PATCH 04/27] Force restart of system after a watcher kill. This is, perhaps, not the most ideal setup. It still ignores other sinks but does bring logging back online after the appropriate delay. Signed-off-by: Brian L. Troutwine --- src/lager_app.erl | 9 ++++++--- src/lager_handler_watcher.erl | 8 ++++---- src/lager_manager_killer.erl | 7 +++++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/lager_app.erl b/src/lager_app.erl index 67a5f7a..5ed1415 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -28,7 +28,8 @@ -export([start/0, start/2, start_handler/3, - stop/1]). + stop/1, + boot/0]). -define(FILENAMES, '__lager_file_backend_filenames'). -define(THROTTLE, lager_backend_throttle). @@ -220,7 +221,10 @@ get_env_default({ok, Value}, _Default) -> start(_StartType, _StartArgs) -> {ok, Pid} = lager_sup:start_link(), + SavedHandlers = boot(), + {ok, Pid, SavedHandlers}. +boot() -> %% Handle the default sink. determine_async_behavior(?DEFAULT_SINK, application:get_env(lager, async_threshold), @@ -246,8 +250,7 @@ start(_StartType, _StartArgs) -> clean_up_config_checks(), - {ok, Pid, SavedHandlers}. - + SavedHandlers. stop(Handlers) -> lists:foreach(fun(Handler) -> diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index da67d8b..336ee06 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -63,11 +63,11 @@ handle_info({gen_event_EXIT, Module, shutdown}, #state{module=Module} = State) - {stop, normal, State}; handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [_KillerHWM, KillerReinstallAfter]}}}, #state{module=Module, sink=Sink} = State) -> - % Brutally kill the manager but stay alive to restore settings. + %% Brutally kill the manager but stay alive to restore settings. Manager = whereis(Sink), unlink(Manager), exit(Manager, kill), - erlang:send_after(KillerReinstallAfter, self(), reinstall_handler), + erlang:send_after(KillerReinstallAfter, self(), reboot), {noreply, State}; handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, config=Config, sink=Sink} = State) -> @@ -80,8 +80,8 @@ handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, ok end, {noreply, State}; -handle_info(reinstall_handler, #state{module=Module, config=Config, sink=Sink} = State) -> - install_handler(Sink, Module, Config), +handle_info(reboot, State) -> + lager_app:boot(), {noreply, State}; handle_info(stop, State) -> {stop, normal, State}; diff --git a/src/lager_manager_killer.erl b/src/lager_manager_killer.erl index 158b471..3f15dc4 100644 --- a/src/lager_manager_killer.erl +++ b/src/lager_manager_killer.erl @@ -4,6 +4,8 @@ -export([init/1, handle_event/2, handle_call/2, handle_info/2, terminate/2, code_change/3]). +-export([kill_me/0]). + -include("lager.hrl"). -record(state, { @@ -11,6 +13,9 @@ killer_reinstall_after :: non_neg_integer() }). +kill_me() -> + gen_event:call(lager_event, ?MODULE, kill_self). + init([KillerHWM, KillerReinstallAfter]) -> {ok, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}}. @@ -20,6 +25,8 @@ handle_call({set_loglevel, _Level}, State) -> {ok, ok, State}; handle_call(get_settings, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> {ok, [KillerHWM, KillerReinstallAfter], State}; +handle_call(kill_self, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> + exit({kill_me, [KillerHWM, KillerReinstallAfter]}); handle_call(_Request, State) -> {ok, ok, State}. From 2d0351f0485b35cc327da8fb9dc791de6a3b1700 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Thu, 17 Mar 2016 13:06:02 -0700 Subject: [PATCH 05/27] Correct test failures for lager_handler_watcher. Signed-off-by: Brian L. Troutwine --- test/lager_manager_killer_test.erl | 68 ++++++++++++++++-------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl index ac8898f..7a0622a 100644 --- a/test/lager_manager_killer_test.erl +++ b/test/lager_manager_killer_test.erl @@ -6,38 +6,42 @@ -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). -overload_test() -> - application:stop(lager), - application:load(lager), - Delay = 1000, % sleep 1 sec on every log - KillerHWM = 10, % kill the manager if there are more than 10 pending logs - KillerReinstallAfter = 1000, % reinstall killer after 1 sec - application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), - application:set_env(lager, async_threshold, undefined), - application:set_env(lager, killer_hwm, KillerHWM), - application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), - ensure_started(lager), - lager_config:set(async, true), - Manager = whereis(lager_event), - erlang:trace(all, true, [procs]), - [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], - Margin = 100, - ok = confirm_manager_exit(Manager, Delay+Margin), - ok = confirm_sink_reregister(Margin), - erlang:trace(all, false, [procs]), - wait_until(fun() -> - case proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event)) of - [] -> false; - _ -> true - end - end, Margin, 15), - wait_until(fun() -> - case gen_event:call(lager_event, lager_manager_killer, get_settings) of - [KillerHWM, KillerReinstallAfter] -> true; - _Other -> false - end - end, Margin, 15), - application:stop(lager). +overload_test_() -> + {timeout, 60, + fun() -> + application:stop(lager), + application:load(lager), + Delay = 1000, % sleep 1 sec on every log + KillerHWM = 10, % kill the manager if there are more than 10 pending logs + KillerReinstallAfter = 1000, % reinstall killer after 1 sec + application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), + application:set_env(lager, async_threshold, undefined), + application:set_env(lager, error_logger_redirect, true), + application:set_env(lager, killer_hwm, KillerHWM), + application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), + ensure_started(lager), + lager_config:set(async, true), + Manager = whereis(lager_event), + erlang:trace(all, true, [procs]), + [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], + Margin = 100, + ok = confirm_manager_exit(Manager, Delay+Margin), + ok = confirm_sink_reregister(Margin), + erlang:trace(all, false, [procs]), + wait_until(fun() -> + case proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event)) of + [] -> false; + _ -> true + end + end, Margin, 15), + wait_until(fun() -> + case gen_event:call(lager_event, lager_manager_killer, get_settings) of + [KillerHWM, KillerReinstallAfter] -> true; + _Other -> false + end + end, Margin, 15), + application:stop(lager) + end}. ensure_started(App) -> case application:start(App) of From fb0064824bd58f70ede68dee40ffc4d7acd32559 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Fri, 18 Mar 2016 17:54:52 -0700 Subject: [PATCH 06/27] Correct removal of reinstall_handler. Signed-off-by: Brian L. Troutwine --- src/lager_handler_watcher.erl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index 336ee06..3d13cd3 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -80,6 +80,9 @@ handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, ok end, {noreply, State}; +handle_info(reinstall_handler, #state{module=Module, config=Config, sink=Sink} = State) -> + install_handler(Sink, Module, Config), + {noreply, State}; handle_info(reboot, State) -> lager_app:boot(), {noreply, State}; From 4b5260c4524688b545cc12da6baa2dfa4f2afec9 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Tue, 29 Mar 2016 10:46:27 -0500 Subject: [PATCH 07/27] Overhaul start up and add killer config Previously, configuration would come from application:get_env/3 or proplists:get_value/3. Now we standardize on a single format for retrieving values from the environment or from a proplist. --- src/lager_app.erl | 106 +++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/src/lager_app.erl b/src/lager_app.erl index 5ed1415..136caa5 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -29,7 +29,7 @@ start/2, start_handler/3, stop/1, - boot/0]). + boot/1]). -define(FILENAMES, '__lager_file_backend_filenames'). -define(THROTTLE, lager_backend_throttle). @@ -53,21 +53,19 @@ start_throttle(Sink, Threshold, Window) -> [Sink, ?THROTTLE, [Threshold, Window]]), ok. -determine_async_behavior(_Sink, {ok, undefined}, _Window) -> - ok; determine_async_behavior(_Sink, undefined, _Window) -> ok; -determine_async_behavior(_Sink, {ok, Threshold}, _Window) when not is_integer(Threshold) orelse Threshold < 0 -> +determine_async_behavior(_Sink, Threshold, _Window) when not is_integer(Threshold) orelse Threshold < 0 -> error_logger:error_msg("Invalid value for 'async_threshold': ~p~n", [Threshold]), throw({error, bad_config}); -determine_async_behavior(Sink, {ok, Threshold}, undefined) -> +determine_async_behavior(Sink, Threshold, undefined) -> start_throttle(Sink, Threshold, erlang:trunc(Threshold * 0.2)); -determine_async_behavior(_Sink, {ok, Threshold}, {ok, Window}) when not is_integer(Window) orelse Window > Threshold orelse Window < 0 -> +determine_async_behavior(_Sink, Threshold, Window) when not is_integer(Window) orelse Window > Threshold orelse Window < 0 -> error_logger:error_msg( "Invalid value for 'async_threshold_window': ~p~n", [Window]), throw({error, bad_config}); -determine_async_behavior(Sink, {ok, Threshold}, {ok, Window}) -> +determine_async_behavior(Sink, Threshold, Window) -> start_throttle(Sink, Threshold, Window). start_handlers(_Sink, undefined) -> @@ -119,14 +117,22 @@ clean_up_config_checks() -> interpret_hwm(undefined) -> undefined; -interpret_hwm({ok, undefined}) -> - undefined; -interpret_hwm({ok, HWM}) when not is_integer(HWM) orelse HWM < 0 -> +interpret_hwm(HWM) when not is_integer(HWM) orelse HWM < 0 -> _ = lager:log(warning, self(), "Invalid error_logger high water mark: ~p, disabling", [HWM]), undefined; -interpret_hwm({ok, HWM}) -> +interpret_hwm(HWM) -> HWM. +maybe_install_sink_killer(_Sink, undefined, _ReinstallTimer) -> ok; +maybe_install_sink_killer(Sink, HWM, undefined) -> maybe_install_sink_killer(Sink, HWM, 5000); +maybe_install_sink_killer(Sink, HWM, ReinstallTimer) when is_integer(HWM) andalso is_integer(ReinstallTimer) + andalso HWM >= 0 andalso ReinstallTimer >= 0 -> + _ = supervisor:start_child(lager_handler_watcher_sup, [Sink, lager_manager_killer, + [HWM, ReinstallTimer]]); +maybe_install_sink_killer(_Sink, HWM, ReinstallTimer) -> + error_logger:error_msg("Invalid value for 'killer_hwm': ~p or 'killer_reinstall_after': ~p", [HWM, ReinstallTimer]), + throw({error, bad_config}). + start_error_logger_handler({ok, false}, _HWM, _Whitelist) -> []; start_error_logger_handler(_, HWM, undefined) -> @@ -147,24 +153,6 @@ start_error_logger_handler(_, HWM, {ok, WhiteList}) -> throw({error, bad_config}) end, - %% Conditionally start the high watermark killer. - case application:get_env(lager, killer_hwm) of - undefined -> ok; - {ok, undefined} -> ok; - {ok, KillerHWM} when is_integer(KillerHWM), KillerHWM >= 0 -> - KillerReinstallAfter = case application:get_env(lager, killer_reinstall_after) of - undefined -> 5000; - {ok, undefined} -> 5000; - {ok, V} when is_integer(V), V >= 0 -> V; - {ok, BadKillerReinstallAfter} -> - error_logger:error_msg("Invalid value for 'cooldown': ~p~n", - [BadKillerReinstallAfter]), - throw({error, bad_config}) - end, - _ = supervisor:start_child(lager_handler_watcher_sup, - [?DEFAULT_SINK, lager_manager_killer, - [KillerHWM, KillerReinstallAfter]]) - end, case supervisor:start_child(lager_handler_watcher_sup, [error_logger, error_logger_lager_h, [HWM, GlStrategy]]) of {ok, _} -> @@ -174,17 +162,6 @@ start_error_logger_handler(_, HWM, {ok, WhiteList}) -> [] end. - -%% `determine_async_behavior/3' is called with the results from either -%% `application:get_env/2' and `proplists:get_value/2'. Since -%% `application:get_env/2' wraps a successful retrieval in an `{ok, -%% Value}' tuple, do the same for the result from -%% `proplists:get_value/2'. -wrap_proplist_value(undefined) -> - undefined; -wrap_proplist_value(Value) -> - {ok, Value}. - configure_sink(Sink, SinkDef) -> lager_config:new_sink(Sink), ChildId = lager_util:make_internal_sink_name(Sink), @@ -193,12 +170,11 @@ configure_sink(Sink, SinkDef) -> {gen_event, start_link, [{local, Sink}]}, permanent, 5000, worker, dynamic}), - determine_async_behavior(Sink, - wrap_proplist_value( - proplists:get_value(async_threshold, SinkDef)), - wrap_proplist_value( - proplists:get_value(async_threshold_window, SinkDef)) + determine_async_behavior(Sink, proplists:get_value(async_threshold, SinkDef), + proplists:get_value(async_threshold_window, SinkDef) ), + maybe_install_sink_killer(Sink, proplists:get_value(killer_hwm, SinkDef), + proplists:get_value(killer_reinstall_after, SinkDef)), start_handlers(Sink, proplists:get_value(handlers, SinkDef, [])), @@ -209,6 +185,8 @@ configure_extra_sinks(Sinks) -> lists:foreach(fun({Sink, Proplist}) -> configure_sink(Sink, Proplist) end, Sinks). +get_env(Application, Key) -> + get_env(Application, Key, undefined). %% R15 doesn't know about application:get_env/3 get_env(Application, Key, Default) -> get_env_default(application:get_env(Application, Key), @@ -222,35 +200,49 @@ get_env_default({ok, Value}, _Default) -> start(_StartType, _StartArgs) -> {ok, Pid} = lager_sup:start_link(), SavedHandlers = boot(), + _ = boot('__all_extra'), + _ = boot('__traces'), + clean_up_config_checks(), {ok, Pid, SavedHandlers}. boot() -> %% Handle the default sink. determine_async_behavior(?DEFAULT_SINK, - application:get_env(lager, async_threshold), - application:get_env(lager, async_threshold_window)), + get_env(lager, async_threshold), + get_env(lager, async_threshold_window)), + + maybe_install_sink_killer(?DEFAULT_SINK, get_env(lager, killer_hwm), + get_env(lager, killer_reinstall_after)), + start_handlers(?DEFAULT_SINK, get_env(lager, handlers, ?DEFAULT_HANDLER_CONF)), - lager:update_loglevel_config(?DEFAULT_SINK), SavedHandlers = start_error_logger_handler( - application:get_env(lager, error_logger_redirect), - interpret_hwm(application:get_env(lager, error_logger_hwm)), - application:get_env(lager, error_logger_whitelist) + get_env(lager, error_logger_redirect), + interpret_hwm(get_env(lager, error_logger_hwm)), + get_env(lager, error_logger_whitelist) ), - _ = lager_util:trace_filter(none), + SavedHandlers. - %% Now handle extra sinks - configure_extra_sinks(get_env(lager, extra_sinks, [])), +boot('__traces') -> + _ = lager_util:trace_filter(none), + ok = add_configured_traces(); - ok = add_configured_traces(), +boot('__all_extra') -> + configure_extra_sinks(get_env(lager, extra_sinks, [])); - clean_up_config_checks(), +boot(?DEFAULT_SINK) -> boot(); +boot(Sink) -> + AllSinksDef = get_env(lager, extra_sinks, []), + boot_sink(Sink, lists:keyfind(Sink, 1, AllSinksDef)). - SavedHandlers. +boot_sink(Sink, {Sink, Def}) -> + configure_sink(Sink, Def); +boot_sink(Sink, false) -> + configure_sink(Sink, []). stop(Handlers) -> lists:foreach(fun(Handler) -> From a170ebc9a5696806dad51f1b7b6746fcbb7f49c7 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Tue, 29 Mar 2016 10:48:19 -0500 Subject: [PATCH 08/27] Send the sink name with reboot message Previously, was always hardcoded to the default sink name. --- src/lager_handler_watcher.erl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index 3d13cd3..df92584 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -64,10 +64,14 @@ handle_info({gen_event_EXIT, Module, shutdown}, #state{module=Module} = State) - handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [_KillerHWM, KillerReinstallAfter]}}}, #state{module=Module, sink=Sink} = State) -> %% Brutally kill the manager but stay alive to restore settings. - Manager = whereis(Sink), - unlink(Manager), - exit(Manager, kill), - erlang:send_after(KillerReinstallAfter, self(), reboot), + %% + %% SinkPid here means the gen_event process. Handlers *all* live inside the + %% same gen_event process space, so when the Pid is killed, *all* of the + %% pending log messages in its mailbox will die too. + SinkPid = whereis(Sink), + unlink(SinkPid), + exit(SinkPid, kill), + erlang:send_after(KillerReinstallAfter, self(), {reboot, Sink}), {noreply, State}; handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, config=Config, sink=Sink} = State) -> @@ -83,8 +87,8 @@ handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, handle_info(reinstall_handler, #state{module=Module, config=Config, sink=Sink} = State) -> install_handler(Sink, Module, Config), {noreply, State}; -handle_info(reboot, State) -> - lager_app:boot(), +handle_info({reboot, Sink}, State) -> + lager_app:boot(Sink), {noreply, State}; handle_info(stop, State) -> {stop, normal, State}; From 7897f3462adff6327b9da7ee070058b772f2aff8 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Tue, 29 Mar 2016 16:28:56 -0500 Subject: [PATCH 09/27] Add a sleep to prevent losing race condition Lager logs its own application start up message. Previously this message was discarded or not received quickly enough to cause test failures, but now it seems as though it is being received and causing several test failures even though we are explicitly flushing it away. We introduce a 5 millisecond sleep for several tests to make sure the start up message is received before the flush command dumps it. --- src/lager_file_backend.erl | 11 +++++++++-- src/lager_handler_watcher.erl | 7 +++---- test/lager_test_backend.erl | 12 ++++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/lager_file_backend.erl b/src/lager_file_backend.erl index a0fd225..b32b39e 100644 --- a/src/lager_file_backend.erl +++ b/src/lager_file_backend.erl @@ -520,7 +520,12 @@ filesystem_test_() -> application:set_env(lager, handlers, [{lager_test_backend, info}]), application:set_env(lager, error_logger_redirect, false), application:set_env(lager, async_threshold, undefined), - lager:start() + lager:start(), + %% race condition where lager logs its own start up + %% makes several tests fail. See test/lager_test_backend + %% around line 800 for more information. + timer:sleep(5), + lager_test_backend:flush() end, fun(_) -> file:delete("test.log"), @@ -808,7 +813,9 @@ formatting_test_() -> application:load(lager), application:set_env(lager, handlers, [{lager_test_backend, info}]), application:set_env(lager, error_logger_redirect, false), - lager:start() + lager:start(), + %% same race condition issue + timer:sleep(5) end, fun(_) -> file:delete("test.log"), diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index df92584..e1ea43f 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -150,11 +150,10 @@ reinstall_on_initial_failure_test_() -> application:unset_env(lager, crash_log), lager:start(), try - ?assertEqual(1, lager_test_backend:count()), {_Level, _Time, Message, _Metadata} = lager_test_backend:pop(), ?assertMatch("Lager failed to install handler lager_crash_backend into lager_event, retrying later :"++_, lists:flatten(Message)), - ?assertEqual(0, lager_test_backend:count()), timer:sleep(6000), + lager_test_backend:flush(), ?assertEqual(0, lager_test_backend:count()), ?assert(lists:member(lager_crash_backend, gen_event:which_handlers(lager_event))) after @@ -177,10 +176,10 @@ reinstall_on_runtime_failure_test_() -> application:unset_env(lager, crash_log), lager:start(), try - ?assertEqual(0, lager_test_backend:count()), ?assert(lists:member(lager_crash_backend, gen_event:which_handlers(lager_event))), timer:sleep(6000), - ?assertEqual(2, lager_test_backend:count()), + _ = lager_test_backend:pop(), %% throw away application start up message + _ = lager_test_backend:pop(), %% throw away gen_event crash message {_Severity, _Date, Msg, _Metadata} = lager_test_backend:pop(), ?assertEqual("Lager event handler lager_crash_backend exited with reason crash", lists:flatten(Msg)), {_Severity2, _Date2, Msg2, _Metadata2} = lager_test_backend:pop(), diff --git a/test/lager_test_backend.erl b/test/lager_test_backend.erl index dd4950c..be32c2b 100644 --- a/test/lager_test_backend.erl +++ b/test/lager_test_backend.erl @@ -795,6 +795,18 @@ setup() -> application:set_env(lager, handlers, [{?MODULE, info}]), application:set_env(lager, error_logger_redirect, false), lager:start(), + %% There is a race condition between the application start up, lager logging its own + %% start up condition and several tests that count messages or parse the output of + %% tests. When the lager start up message wins the race, it causes these tests + %% which parse output or count message arrivals to fail. + %% + %% We introduce a sleep here to allow `flush' to arrive *after* the start up + %% message has been received and processed. + %% + %% This race condition was first exposed during the work on + %% 4b5260c4524688b545cc12da6baa2dfa4f2afec9 which introduced the lager + %% manager killer PR. + timer:sleep(5), gen_event:call(lager_event, ?MODULE, flush). cleanup(_) -> From 0a9dd17f9062c8429150bcaf751bec0e34772ae5 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Wed, 6 Apr 2016 23:10:01 -0500 Subject: [PATCH 10/27] Test manager killer on alternate sink --- test/lager_manager_killer_test.erl | 51 +++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl index 7a0622a..96199b0 100644 --- a/test/lager_manager_killer_test.erl +++ b/test/lager_manager_killer_test.erl @@ -6,6 +6,9 @@ -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). +-define(TEST_SINK_NAME, '__lager_test_sink'). %% <-- used by parse transform +-define(TEST_SINK_EVENT, '__lager_test_sink_lager_event'). %% <-- used by lager API calls and internals for gen_event + overload_test_() -> {timeout, 60, fun() -> @@ -26,7 +29,7 @@ overload_test_() -> [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], Margin = 100, ok = confirm_manager_exit(Manager, Delay+Margin), - ok = confirm_sink_reregister(Margin), + ok = confirm_sink_reregister(lager_event, Margin), erlang:trace(all, false, [procs]), wait_until(fun() -> case proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event)) of @@ -43,6 +46,46 @@ overload_test_() -> application:stop(lager) end}. +overload_alternate_sink_test_() -> + {timeout, 60, + fun() -> + application:stop(lager), + application:load(lager), + Delay = 1000, % sleep 1 sec on every log + KillerHWM = 10, % kill the manager if there are more than 10 pending logs + KillerReinstallAfter = 1000, % reinstall killer after 1 sec + application:set_env(lager, handlers, []), + application:set_env(lager, extra_sinks, [{?TEST_SINK_EVENT, [ + {handlers, [{lager_slow_backend, [{delay, Delay}]}]}, + {killer_hwm, KillerHWM}, + {killer_reinstall_after, KillerReinstallAfter}, + {async_threshold, undefined} + ]}]), + application:set_env(lager, error_logger_redirect, true), + ensure_started(lager), + lager_config:set({?TEST_SINK_EVENT, async}, true), + Manager = whereis(?TEST_SINK_EVENT), + erlang:trace(all, true, [procs]), + [?TEST_SINK_NAME:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], + Margin = 100, + ok = confirm_manager_exit(Manager, Delay+Margin), + ok = confirm_sink_reregister(?TEST_SINK_EVENT, Margin), + erlang:trace(all, false, [procs]), + wait_until(fun() -> + case proplists:get_value(lager_manager_killer, gen_event:which_handlers(?TEST_SINK_EVENT)) of + [] -> false; + _ -> true + end + end, Margin, 15), + wait_until(fun() -> + case gen_event:call(?TEST_SINK_EVENT, lager_manager_killer, get_settings) of + [KillerHWM, KillerReinstallAfter] -> true; + _Other -> false + end + end, Margin, 15), + application:stop(lager) + end}. + ensure_started(App) -> case application:start(App) of ok -> @@ -63,10 +106,10 @@ confirm_manager_exit(Manager, Delay) -> ?assert(false) end. -confirm_sink_reregister(Delay) -> +confirm_sink_reregister(Sink, Delay) -> receive - {trace, _Pid, register, lager_event} -> - ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(lager_event))) + {trace, _Pid, register, Sink} -> + ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(Sink))) after Delay -> ?assert(false) end. From da14d152ba4980a68f995316d7b9041e3b14c4db Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Wed, 6 Apr 2016 23:20:07 -0500 Subject: [PATCH 11/27] Fix dialyzer warnings --- src/lager_app.erl | 4 ++-- src/lager_handler_watcher.erl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lager_app.erl b/src/lager_app.erl index 136caa5..1d426d7 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -173,7 +173,7 @@ configure_sink(Sink, SinkDef) -> determine_async_behavior(Sink, proplists:get_value(async_threshold, SinkDef), proplists:get_value(async_threshold_window, SinkDef) ), - maybe_install_sink_killer(Sink, proplists:get_value(killer_hwm, SinkDef), + _ = maybe_install_sink_killer(Sink, proplists:get_value(killer_hwm, SinkDef), proplists:get_value(killer_reinstall_after, SinkDef)), start_handlers(Sink, proplists:get_value(handlers, SinkDef, [])), @@ -211,7 +211,7 @@ boot() -> get_env(lager, async_threshold), get_env(lager, async_threshold_window)), - maybe_install_sink_killer(?DEFAULT_SINK, get_env(lager, killer_hwm), + _ = maybe_install_sink_killer(?DEFAULT_SINK, get_env(lager, killer_hwm), get_env(lager, killer_reinstall_after)), start_handlers(?DEFAULT_SINK, diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index e1ea43f..d5fe140 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -88,7 +88,7 @@ handle_info(reinstall_handler, #state{module=Module, config=Config, sink=Sink} = install_handler(Sink, Module, Config), {noreply, State}; handle_info({reboot, Sink}, State) -> - lager_app:boot(Sink), + _ = lager_app:boot(Sink), {noreply, State}; handle_info(stop, State) -> {stop, normal, State}; From 09b36f1c1298fc56d48741670fffd9ca6d60e2e3 Mon Sep 17 00:00:00 2001 From: Sungjin Park Date: Mon, 5 Jan 2015 16:27:13 +0900 Subject: [PATCH 12/27] Add lager_manager_killer and tests for it. --- rebar.config | 2 ++ src/lager_app.erl | 38 ++++++++++++++++++++- src/lager_handler_watcher.erl | 8 +++++ src/lager_manager_killer.erl | 44 ++++++++++++++++++++++++ test/lager_manager_killer_test.erl | 55 ++++++++++++++++++++++++++++++ test/lager_slow_backend.erl | 34 ++++++++++++++++++ 6 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 src/lager_manager_killer.erl create mode 100644 test/lager_manager_killer_test.erl create mode 100644 test/lager_slow_backend.erl diff --git a/rebar.config b/rebar.config index 3198d59..77272c7 100644 --- a/rebar.config +++ b/rebar.config @@ -52,3 +52,5 @@ {cover_enabled, true}. {edoc_opts, [{stylesheet_file, "./priv/edoc.css"}]}. + +{eunit_opts, [verbose]}. diff --git a/src/lager_app.erl b/src/lager_app.erl index 1c679d7..d40f85b 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -157,7 +157,43 @@ start_error_logger_handler(_, HWM, {ok, WhiteList}) -> X <- gen_event:which_handlers(error_logger) -- [error_logger_lager_h | WhiteList]]; {error, _} -> [] - end. + end, + + case application:get_env(lager, killer_hwm) of + undefined -> + ok; + {ok, undefined} -> + undefined; + {ok, KillerHWM} when is_integer(KillerHWM), KillerHWM >= 0 -> + KillerReinstallAfter = + case application:get_env(lager, killer_reinstall_after) of + undefined -> + 5000; + {ok, undefined} -> + 5000; + {ok, V} when is_integer(V), V >= 0 -> + V; + {ok, BadKillerReinstallAfter} -> + error_logger:error_msg("Invalid value for 'cooldown': ~p~n", [BadKillerReinstallAfter]), + throw({error, bad_config}) + end, + _ = supervisor:start_child(lager_handler_watcher_sup, + [lager_event, lager_manager_killer, [KillerHWM, KillerReinstallAfter]]), + ok; + {ok, BadKillerHWM} -> + error_logger:error_msg("Invalid value for 'floodline': ~p~n", [BadKillerHWM]), + throw({error, bad_config}) + end, + + Handlers = case application:get_env(lager, handlers) of + undefined -> + [{lager_console_backend, info}, + {lager_file_backend, [{file, "log/error.log"}, {level, error}, {size, 10485760}, {date, "$D0"}, {count, 5}]}, + {lager_file_backend, [{file, "log/console.log"}, {level, info}, {size, 10485760}, {date, "$D0"}, {count, 5}]}]; + {ok, Val} -> + Val + end, + Handlers. %% `determine_async_behavior/3' is called with the results from either %% `application:get_env/2' and `proplists:get_value/2'. Since diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index 4251db5..c051101 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -61,6 +61,14 @@ handle_info({gen_event_EXIT, Module, normal}, #state{module=Module} = State) -> {stop, normal, State}; handle_info({gen_event_EXIT, Module, shutdown}, #state{module=Module} = State) -> {stop, normal, State}; +handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [KillerHWM, KillerReinstallAfter]}}}, + #state{module=Module, event=Event} = State) -> + % Brutally kill the manager but stay alive to restore settings. + Manager = whereis(Event), + unlink(Manager), + exit(Manager, kill), + erlang:send_after(KillerReinstallAfter, self(), reinstall_handler), + {noreply, State#state{config=[KillerHWM, KillerReinstallAfter]}}; handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, config=Config, sink=Sink} = State) -> case lager:log(error, self(), "Lager event handler ~p exited with reason ~s", diff --git a/src/lager_manager_killer.erl b/src/lager_manager_killer.erl new file mode 100644 index 0000000..bb959f9 --- /dev/null +++ b/src/lager_manager_killer.erl @@ -0,0 +1,44 @@ +-module(lager_manager_killer). +-author("Sungjin Park "). +-behavior(gen_event). + +-export([init/1, handle_event/2, handle_call/2, handle_info/2, terminate/2, code_change/3]). + +-include("lager.hrl"). + +-record(state, { + killer_hwm :: non_neg_integer(), + killer_reinstall_after :: non_neg_integer() +}). + +init([KillerHWM, KillerReinstallAfter]) -> + {ok, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}}. + +handle_call(get_loglevel, State) -> + {ok, {mask, ?LOG_NONE}, State}; +handle_call({set_loglevel, _Level}, State) -> + {ok, ok, State}; +handle_call(get_settings, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> + {ok, [KillerHWM, KillerReinstallAfter], State}; +handle_call(_Request, State) -> + {ok, ok, State}. + +handle_event({log, _Message}, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> + {message_queue_len, Len} = process_info(self(), message_queue_len), + case Len > KillerHWM of + true -> + exit({kill_me, [KillerHWM, KillerReinstallAfter]}); + _ -> + {ok, State} + end; +handle_event(_Event, State) -> + {ok, State}. + +handle_info(_Info, State) -> + {ok, State}. + +terminate(_Reason, _State) -> + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl new file mode 100644 index 0000000..1fcb59f --- /dev/null +++ b/test/lager_manager_killer_test.erl @@ -0,0 +1,55 @@ +-module(lager_manager_killer_test). +-author("Sungjin Park "). + +-compile([{parse_transform, lager_transform}]). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). + +overload_test() -> + application:stop(lager), + application:load(lager), + Delay = 1000, % sleep 1 sec on every log + KillerHWM = 10, % kill the manager if there are more than 10 pending logs + KillerReinstallAfter = 1000, % reinstall killer after 1 sec + application:set_env(lager, handlers, [{lager_slow_backend, Delay}]), + application:set_env(lager, async_threshold, undefined), + application:set_env(lager, killer_hwm, KillerHWM), + application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), + ensure_started(lager), + lager_config:set(async, true), + Manager = whereis(lager_event), + erlang:trace(all, true, [procs]), + [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], + Margin = 100, + receive + {trace, Manager, exit, killed} -> + ?debugFmt("Manager ~p killed", [Manager]) + after Delay+Margin -> + ?assert(false) + end, + receive + {trace, _Sup, spawn, Pid, Fun} -> + ?assert(process_info(Pid, registered_name) =:= {registered_name, lager_event}), + ?debugFmt("Manager ~p start with ~p", [Pid, Fun]), + ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(lager_event))) + after Margin -> + ?assert(false) + end, + erlang:trace(all, false, [procs]), + timer:sleep(KillerReinstallAfter), + ?assert(proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event))), + ?assert(gen_event:call(lager_event, lager_manager_killer, get_settings) =:= [KillerHWM, KillerReinstallAfter]), + ?debugFmt("Killer reinstalled with [~p, ~p]", [KillerHWM, KillerReinstallAfter]), + application:stop(lager). + +ensure_started(App) -> + case application:start(App) of + ok -> + ok; + {error, {not_started, Dep}} -> + ensure_started(Dep), + ensure_started(App) + end. + +-endif. \ No newline at end of file diff --git a/test/lager_slow_backend.erl b/test/lager_slow_backend.erl new file mode 100644 index 0000000..9771083 --- /dev/null +++ b/test/lager_slow_backend.erl @@ -0,0 +1,34 @@ +-module(lager_slow_backend). +-author("Sungjin Park "). +-behavior(gen_event). + +-export([init/1, handle_call/2, handle_event/2, handle_info/2, terminate/2, code_change/3]). + +-include("lager.hrl"). + +-record(state, { + delay :: non_neg_integer() +}). + +init(Delay) -> + {ok, #state{delay=Delay}}. + +handle_call(get_loglevel, State) -> + {ok, lager_util:config_to_mask(debug), State}; +handle_call(_Request, State) -> + {ok, ok, State}. + +handle_event({log, _Message}, State) -> + timer:sleep(State#state.delay), + {ok, State}; +handle_event(_Event, State) -> + {ok, State}. + +handle_info(_Info, State) -> + {ok, State}. + +terminate(_Reason, _State) -> + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. From 49311b27e1309136dbd58db3760f7a207e354be2 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Thu, 10 Mar 2016 16:03:39 -0800 Subject: [PATCH 13/27] Attempt to redress test failures. At this point in the work, the killer will correctly stop the DEFAULT_SINK in the event of overload but our friend never comes back up and lager stops working past this point. Oops! Error message: =SUPERVISOR REPORT==== 10-Mar-2016::16:18:11 === Supervisor: {local,lager_handler_watcher_sup} Context: child_terminated Reason: killed Offender: [{pid,<0.63.0>}, {id,lager_handler_watcher}, {mfargs,{lager_handler_watcher,start_link,undefined}}, {restart_type,temporary}, {shutdown,5000}, {child_type,worker}] =SUPERVISOR REPORT==== 10-Mar-2016::16:18:11 === Supervisor: {local,lager_handler_watcher_sup} Context: child_terminated Reason: killed Offender: [{pid,<0.59.0>}, {id,lager_handler_watcher}, {mfargs,{lager_handler_watcher,start_link,undefined}}, {restart_type,temporary}, {shutdown,5000}, {child_type,worker}] =PROGRESS REPORT==== 10-Mar-2016::16:18:11 === supervisor: {local,lager_sup} started: [{pid,<0.113.0>}, {id,lager}, {mfargs,{gen_event,start_link,[{local,lager_event}]}}, {restart_type,permanent}, {shutdown,5000}, {child_type,worker}] Signed-off-by: Brian L. Troutwine --- src/lager_app.erl | 45 +++++++++++++----------------- src/lager_handler_watcher.erl | 8 +++--- src/lager_manager_killer.erl | 38 ++++++++++++------------- test/lager_manager_killer_test.erl | 4 +-- test/lager_slow_backend.erl | 2 +- 5 files changed, 45 insertions(+), 52 deletions(-) diff --git a/src/lager_app.erl b/src/lager_app.erl index d40f85b..2661eac 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -151,6 +151,25 @@ start_error_logger_handler(_, HWM, {ok, WhiteList}) -> throw({error, bad_config}) end, + %% Conditionally start the high watermark killer. + case application:get_env(lager, killer_hwm) of + undefined -> ok; + {ok, undefined} -> ok; + {ok, KillerHWM} when is_integer(KillerHWM), KillerHWM >= 0 -> + KillerReinstallAfter = case application:get_env(lager, killer_reinstall_after) of + undefined -> 5000; + {ok, undefined} -> 5000; + {ok, V} when is_integer(V), V >= 0 -> V; + {ok, BadKillerReinstallAfter} -> + error_logger:error_msg("Invalid value for 'cooldown': ~p~n", + [BadKillerReinstallAfter]), + throw({error, bad_config}) + end, + _ = supervisor:start_child(lager_handler_watcher_sup, + [?DEFAULT_SINK, lager_manager_killer, + [KillerHWM, KillerReinstallAfter]]) + end, + case supervisor:start_child(lager_handler_watcher_sup, [error_logger, error_logger_lager_h, [HWM, GlStrategy]]) of {ok, _} -> [begin error_logger:delete_report_handler(X), X end || @@ -159,32 +178,6 @@ start_error_logger_handler(_, HWM, {ok, WhiteList}) -> [] end, - case application:get_env(lager, killer_hwm) of - undefined -> - ok; - {ok, undefined} -> - undefined; - {ok, KillerHWM} when is_integer(KillerHWM), KillerHWM >= 0 -> - KillerReinstallAfter = - case application:get_env(lager, killer_reinstall_after) of - undefined -> - 5000; - {ok, undefined} -> - 5000; - {ok, V} when is_integer(V), V >= 0 -> - V; - {ok, BadKillerReinstallAfter} -> - error_logger:error_msg("Invalid value for 'cooldown': ~p~n", [BadKillerReinstallAfter]), - throw({error, bad_config}) - end, - _ = supervisor:start_child(lager_handler_watcher_sup, - [lager_event, lager_manager_killer, [KillerHWM, KillerReinstallAfter]]), - ok; - {ok, BadKillerHWM} -> - error_logger:error_msg("Invalid value for 'floodline': ~p~n", [BadKillerHWM]), - throw({error, bad_config}) - end, - Handlers = case application:get_env(lager, handlers) of undefined -> [{lager_console_backend, info}, diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index c051101..da67d8b 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -61,14 +61,14 @@ handle_info({gen_event_EXIT, Module, normal}, #state{module=Module} = State) -> {stop, normal, State}; handle_info({gen_event_EXIT, Module, shutdown}, #state{module=Module} = State) -> {stop, normal, State}; -handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [KillerHWM, KillerReinstallAfter]}}}, - #state{module=Module, event=Event} = State) -> +handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [_KillerHWM, KillerReinstallAfter]}}}, + #state{module=Module, sink=Sink} = State) -> % Brutally kill the manager but stay alive to restore settings. - Manager = whereis(Event), + Manager = whereis(Sink), unlink(Manager), exit(Manager, kill), erlang:send_after(KillerReinstallAfter, self(), reinstall_handler), - {noreply, State#state{config=[KillerHWM, KillerReinstallAfter]}}; + {noreply, State}; handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, config=Config, sink=Sink} = State) -> case lager:log(error, self(), "Lager event handler ~p exited with reason ~s", diff --git a/src/lager_manager_killer.erl b/src/lager_manager_killer.erl index bb959f9..158b471 100644 --- a/src/lager_manager_killer.erl +++ b/src/lager_manager_killer.erl @@ -7,38 +7,38 @@ -include("lager.hrl"). -record(state, { - killer_hwm :: non_neg_integer(), - killer_reinstall_after :: non_neg_integer() -}). + killer_hwm :: non_neg_integer(), + killer_reinstall_after :: non_neg_integer() + }). init([KillerHWM, KillerReinstallAfter]) -> - {ok, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}}. + {ok, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}}. handle_call(get_loglevel, State) -> - {ok, {mask, ?LOG_NONE}, State}; + {ok, {mask, ?LOG_NONE}, State}; handle_call({set_loglevel, _Level}, State) -> - {ok, ok, State}; + {ok, ok, State}; handle_call(get_settings, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> - {ok, [KillerHWM, KillerReinstallAfter], State}; + {ok, [KillerHWM, KillerReinstallAfter], State}; handle_call(_Request, State) -> - {ok, ok, State}. + {ok, ok, State}. handle_event({log, _Message}, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> - {message_queue_len, Len} = process_info(self(), message_queue_len), - case Len > KillerHWM of - true -> - exit({kill_me, [KillerHWM, KillerReinstallAfter]}); - _ -> - {ok, State} - end; + {message_queue_len, Len} = process_info(self(), message_queue_len), + case Len > KillerHWM of + true -> + exit({kill_me, [KillerHWM, KillerReinstallAfter]}); + _ -> + {ok, State} + end; handle_event(_Event, State) -> - {ok, State}. + {ok, State}. handle_info(_Info, State) -> - {ok, State}. + {ok, State}. terminate(_Reason, _State) -> - ok. + ok. code_change(_OldVsn, State, _Extra) -> - {ok, State}. + {ok, State}. diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl index 1fcb59f..6e63079 100644 --- a/test/lager_manager_killer_test.erl +++ b/test/lager_manager_killer_test.erl @@ -12,7 +12,7 @@ overload_test() -> Delay = 1000, % sleep 1 sec on every log KillerHWM = 10, % kill the manager if there are more than 10 pending logs KillerReinstallAfter = 1000, % reinstall killer after 1 sec - application:set_env(lager, handlers, [{lager_slow_backend, Delay}]), + application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), application:set_env(lager, async_threshold, undefined), application:set_env(lager, killer_hwm, KillerHWM), application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), @@ -52,4 +52,4 @@ ensure_started(App) -> ensure_started(App) end. --endif. \ No newline at end of file +-endif. diff --git a/test/lager_slow_backend.erl b/test/lager_slow_backend.erl index 9771083..e467c23 100644 --- a/test/lager_slow_backend.erl +++ b/test/lager_slow_backend.erl @@ -10,7 +10,7 @@ delay :: non_neg_integer() }). -init(Delay) -> +init([{delay, Delay}]) -> {ok, #state{delay=Delay}}. handle_call(get_loglevel, State) -> From 901ad9b1dd421bc6089f9d8b46893d94ef63044e Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Mon, 14 Mar 2016 13:36:17 -0700 Subject: [PATCH 14/27] Update test to not rely on sleep. This test only passes when run in isolation. That is, if you run the suite solo everything is beautiful and nothing hurts. When run in the group, it fails. By inspection it seems that no trace messages are being sent and this causes the failure, for want of specific traces. Signed-off-by: Brian L. Troutwine --- test/lager_manager_killer_test.erl | 107 ++++++++++++++++++----------- 1 file changed, 65 insertions(+), 42 deletions(-) diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl index 6e63079..ac8898f 100644 --- a/test/lager_manager_killer_test.erl +++ b/test/lager_manager_killer_test.erl @@ -7,49 +7,72 @@ -include_lib("eunit/include/eunit.hrl"). overload_test() -> - application:stop(lager), - application:load(lager), - Delay = 1000, % sleep 1 sec on every log - KillerHWM = 10, % kill the manager if there are more than 10 pending logs - KillerReinstallAfter = 1000, % reinstall killer after 1 sec - application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), - application:set_env(lager, async_threshold, undefined), - application:set_env(lager, killer_hwm, KillerHWM), - application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), - ensure_started(lager), - lager_config:set(async, true), - Manager = whereis(lager_event), - erlang:trace(all, true, [procs]), - [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], - Margin = 100, - receive - {trace, Manager, exit, killed} -> - ?debugFmt("Manager ~p killed", [Manager]) - after Delay+Margin -> - ?assert(false) - end, - receive - {trace, _Sup, spawn, Pid, Fun} -> - ?assert(process_info(Pid, registered_name) =:= {registered_name, lager_event}), - ?debugFmt("Manager ~p start with ~p", [Pid, Fun]), - ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(lager_event))) - after Margin -> - ?assert(false) - end, - erlang:trace(all, false, [procs]), - timer:sleep(KillerReinstallAfter), - ?assert(proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event))), - ?assert(gen_event:call(lager_event, lager_manager_killer, get_settings) =:= [KillerHWM, KillerReinstallAfter]), - ?debugFmt("Killer reinstalled with [~p, ~p]", [KillerHWM, KillerReinstallAfter]), - application:stop(lager). + application:stop(lager), + application:load(lager), + Delay = 1000, % sleep 1 sec on every log + KillerHWM = 10, % kill the manager if there are more than 10 pending logs + KillerReinstallAfter = 1000, % reinstall killer after 1 sec + application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), + application:set_env(lager, async_threshold, undefined), + application:set_env(lager, killer_hwm, KillerHWM), + application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), + ensure_started(lager), + lager_config:set(async, true), + Manager = whereis(lager_event), + erlang:trace(all, true, [procs]), + [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], + Margin = 100, + ok = confirm_manager_exit(Manager, Delay+Margin), + ok = confirm_sink_reregister(Margin), + erlang:trace(all, false, [procs]), + wait_until(fun() -> + case proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event)) of + [] -> false; + _ -> true + end + end, Margin, 15), + wait_until(fun() -> + case gen_event:call(lager_event, lager_manager_killer, get_settings) of + [KillerHWM, KillerReinstallAfter] -> true; + _Other -> false + end + end, Margin, 15), + application:stop(lager). ensure_started(App) -> - case application:start(App) of - ok -> - ok; - {error, {not_started, Dep}} -> - ensure_started(Dep), - ensure_started(App) - end. + case application:start(App) of + ok -> + ok; + {error, {not_started, Dep}} -> + ensure_started(Dep), + ensure_started(App) + end. + +confirm_manager_exit(Manager, Delay) -> + receive + {trace, Manager, exit, killed} -> + ?debugFmt("Manager ~p killed", [Manager]); + Other -> + ?debugFmt("OTHER MSG: ~p", [Other]), + confirm_manager_exit(Manager, Delay) + after Delay -> + ?assert(false) + end. + +confirm_sink_reregister(Delay) -> + receive + {trace, _Pid, register, lager_event} -> + ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(lager_event))) + after Delay -> + ?assert(false) + end. + +wait_until(_Fun, _Delay, 0) -> + {error, too_many_retries}; +wait_until(Fun, Delay, Retries) -> + case Fun() of + true -> ok; + false -> timer:sleep(Delay), wait_until(Fun, Delay, Retries-1) + end. -endif. From 3d56494e398d601a41b8ef1b3a35fa59fb8017c5 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Thu, 17 Mar 2016 10:11:42 -0700 Subject: [PATCH 15/27] Force restart of system after a watcher kill. This is, perhaps, not the most ideal setup. It still ignores other sinks but does bring logging back online after the appropriate delay. Signed-off-by: Brian L. Troutwine --- src/lager_app.erl | 9 ++++++--- src/lager_handler_watcher.erl | 8 ++++---- src/lager_manager_killer.erl | 7 +++++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/lager_app.erl b/src/lager_app.erl index 2661eac..6b6b63d 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -29,7 +29,8 @@ start/2, start_handler/3, configure_sink/2, - stop/1]). + stop/1, + boot/0]). %% The `application:get_env/3` compatibility wrapper is useful %% for other modules @@ -234,7 +235,10 @@ get_env_default({ok, Value}, _Default) -> start(_StartType, _StartArgs) -> {ok, Pid} = lager_sup:start_link(), + SavedHandlers = boot(), + {ok, Pid, SavedHandlers}. +boot() -> %% Handle the default sink. determine_async_behavior(?DEFAULT_SINK, application:get_env(lager, async_threshold), @@ -260,8 +264,7 @@ start(_StartType, _StartArgs) -> clean_up_config_checks(), - {ok, Pid, SavedHandlers}. - + SavedHandlers. stop(Handlers) -> lists:foreach(fun(Handler) -> diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index da67d8b..336ee06 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -63,11 +63,11 @@ handle_info({gen_event_EXIT, Module, shutdown}, #state{module=Module} = State) - {stop, normal, State}; handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [_KillerHWM, KillerReinstallAfter]}}}, #state{module=Module, sink=Sink} = State) -> - % Brutally kill the manager but stay alive to restore settings. + %% Brutally kill the manager but stay alive to restore settings. Manager = whereis(Sink), unlink(Manager), exit(Manager, kill), - erlang:send_after(KillerReinstallAfter, self(), reinstall_handler), + erlang:send_after(KillerReinstallAfter, self(), reboot), {noreply, State}; handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, config=Config, sink=Sink} = State) -> @@ -80,8 +80,8 @@ handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, ok end, {noreply, State}; -handle_info(reinstall_handler, #state{module=Module, config=Config, sink=Sink} = State) -> - install_handler(Sink, Module, Config), +handle_info(reboot, State) -> + lager_app:boot(), {noreply, State}; handle_info(stop, State) -> {stop, normal, State}; diff --git a/src/lager_manager_killer.erl b/src/lager_manager_killer.erl index 158b471..3f15dc4 100644 --- a/src/lager_manager_killer.erl +++ b/src/lager_manager_killer.erl @@ -4,6 +4,8 @@ -export([init/1, handle_event/2, handle_call/2, handle_info/2, terminate/2, code_change/3]). +-export([kill_me/0]). + -include("lager.hrl"). -record(state, { @@ -11,6 +13,9 @@ killer_reinstall_after :: non_neg_integer() }). +kill_me() -> + gen_event:call(lager_event, ?MODULE, kill_self). + init([KillerHWM, KillerReinstallAfter]) -> {ok, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}}. @@ -20,6 +25,8 @@ handle_call({set_loglevel, _Level}, State) -> {ok, ok, State}; handle_call(get_settings, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> {ok, [KillerHWM, KillerReinstallAfter], State}; +handle_call(kill_self, #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> + exit({kill_me, [KillerHWM, KillerReinstallAfter]}); handle_call(_Request, State) -> {ok, ok, State}. From 2d42e95fe8a013ce2cd686bb144438c9d5e51227 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Thu, 17 Mar 2016 13:06:02 -0700 Subject: [PATCH 16/27] Correct test failures for lager_handler_watcher. Signed-off-by: Brian L. Troutwine --- test/lager_manager_killer_test.erl | 68 ++++++++++++++++-------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl index ac8898f..7a0622a 100644 --- a/test/lager_manager_killer_test.erl +++ b/test/lager_manager_killer_test.erl @@ -6,38 +6,42 @@ -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). -overload_test() -> - application:stop(lager), - application:load(lager), - Delay = 1000, % sleep 1 sec on every log - KillerHWM = 10, % kill the manager if there are more than 10 pending logs - KillerReinstallAfter = 1000, % reinstall killer after 1 sec - application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), - application:set_env(lager, async_threshold, undefined), - application:set_env(lager, killer_hwm, KillerHWM), - application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), - ensure_started(lager), - lager_config:set(async, true), - Manager = whereis(lager_event), - erlang:trace(all, true, [procs]), - [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], - Margin = 100, - ok = confirm_manager_exit(Manager, Delay+Margin), - ok = confirm_sink_reregister(Margin), - erlang:trace(all, false, [procs]), - wait_until(fun() -> - case proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event)) of - [] -> false; - _ -> true - end - end, Margin, 15), - wait_until(fun() -> - case gen_event:call(lager_event, lager_manager_killer, get_settings) of - [KillerHWM, KillerReinstallAfter] -> true; - _Other -> false - end - end, Margin, 15), - application:stop(lager). +overload_test_() -> + {timeout, 60, + fun() -> + application:stop(lager), + application:load(lager), + Delay = 1000, % sleep 1 sec on every log + KillerHWM = 10, % kill the manager if there are more than 10 pending logs + KillerReinstallAfter = 1000, % reinstall killer after 1 sec + application:set_env(lager, handlers, [{lager_slow_backend, [{delay, Delay}]}]), + application:set_env(lager, async_threshold, undefined), + application:set_env(lager, error_logger_redirect, true), + application:set_env(lager, killer_hwm, KillerHWM), + application:set_env(lager, killer_reinstall_after, KillerReinstallAfter), + ensure_started(lager), + lager_config:set(async, true), + Manager = whereis(lager_event), + erlang:trace(all, true, [procs]), + [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], + Margin = 100, + ok = confirm_manager_exit(Manager, Delay+Margin), + ok = confirm_sink_reregister(Margin), + erlang:trace(all, false, [procs]), + wait_until(fun() -> + case proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event)) of + [] -> false; + _ -> true + end + end, Margin, 15), + wait_until(fun() -> + case gen_event:call(lager_event, lager_manager_killer, get_settings) of + [KillerHWM, KillerReinstallAfter] -> true; + _Other -> false + end + end, Margin, 15), + application:stop(lager) + end}. ensure_started(App) -> case application:start(App) of From 35e10cfc4e1ef994e7cd0fde76378263564abd63 Mon Sep 17 00:00:00 2001 From: "Brian L. Troutwine" Date: Fri, 18 Mar 2016 17:54:52 -0700 Subject: [PATCH 17/27] Correct removal of reinstall_handler. Signed-off-by: Brian L. Troutwine --- src/lager_handler_watcher.erl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index 336ee06..3d13cd3 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -80,6 +80,9 @@ handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, ok end, {noreply, State}; +handle_info(reinstall_handler, #state{module=Module, config=Config, sink=Sink} = State) -> + install_handler(Sink, Module, Config), + {noreply, State}; handle_info(reboot, State) -> lager_app:boot(), {noreply, State}; From bd3c1ecd9dd79134745a6a270f2f513aae299f51 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Tue, 29 Mar 2016 10:46:27 -0500 Subject: [PATCH 18/27] Overhaul start up and add killer config Previously, configuration would come from application:get_env/3 or proplists:get_value/3. Now we standardize on a single format for retrieving values from the environment or from a proplist. --- src/lager_app.erl | 105 ++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 56 deletions(-) diff --git a/src/lager_app.erl b/src/lager_app.erl index 6b6b63d..3529585 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -30,7 +30,7 @@ start_handler/3, configure_sink/2, stop/1, - boot/0]). + boot/1]). %% The `application:get_env/3` compatibility wrapper is useful %% for other modules @@ -58,21 +58,19 @@ start_throttle(Sink, Threshold, Window) -> [Sink, ?THROTTLE, [Threshold, Window]]), ok. -determine_async_behavior(_Sink, {ok, undefined}, _Window) -> - ok; determine_async_behavior(_Sink, undefined, _Window) -> ok; -determine_async_behavior(_Sink, {ok, Threshold}, _Window) when not is_integer(Threshold) orelse Threshold < 0 -> +determine_async_behavior(_Sink, Threshold, _Window) when not is_integer(Threshold) orelse Threshold < 0 -> error_logger:error_msg("Invalid value for 'async_threshold': ~p~n", [Threshold]), throw({error, bad_config}); -determine_async_behavior(Sink, {ok, Threshold}, undefined) -> +determine_async_behavior(Sink, Threshold, undefined) -> start_throttle(Sink, Threshold, erlang:trunc(Threshold * 0.2)); -determine_async_behavior(_Sink, {ok, Threshold}, {ok, Window}) when not is_integer(Window) orelse Window > Threshold orelse Window < 0 -> +determine_async_behavior(_Sink, Threshold, Window) when not is_integer(Window) orelse Window > Threshold orelse Window < 0 -> error_logger:error_msg( "Invalid value for 'async_threshold_window': ~p~n", [Window]), throw({error, bad_config}); -determine_async_behavior(Sink, {ok, Threshold}, {ok, Window}) -> +determine_async_behavior(Sink, Threshold, Window) -> start_throttle(Sink, Threshold, Window). start_handlers(_Sink, undefined) -> @@ -124,14 +122,22 @@ clean_up_config_checks() -> interpret_hwm(undefined) -> undefined; -interpret_hwm({ok, undefined}) -> - undefined; -interpret_hwm({ok, HWM}) when not is_integer(HWM) orelse HWM < 0 -> +interpret_hwm(HWM) when not is_integer(HWM) orelse HWM < 0 -> _ = lager:log(warning, self(), "Invalid error_logger high water mark: ~p, disabling", [HWM]), undefined; -interpret_hwm({ok, HWM}) -> +interpret_hwm(HWM) -> HWM. +maybe_install_sink_killer(_Sink, undefined, _ReinstallTimer) -> ok; +maybe_install_sink_killer(Sink, HWM, undefined) -> maybe_install_sink_killer(Sink, HWM, 5000); +maybe_install_sink_killer(Sink, HWM, ReinstallTimer) when is_integer(HWM) andalso is_integer(ReinstallTimer) + andalso HWM >= 0 andalso ReinstallTimer >= 0 -> + _ = supervisor:start_child(lager_handler_watcher_sup, [Sink, lager_manager_killer, + [HWM, ReinstallTimer]]); +maybe_install_sink_killer(_Sink, HWM, ReinstallTimer) -> + error_logger:error_msg("Invalid value for 'killer_hwm': ~p or 'killer_reinstall_after': ~p", [HWM, ReinstallTimer]), + throw({error, bad_config}). + start_error_logger_handler({ok, false}, _HWM, _Whitelist) -> []; start_error_logger_handler(_, HWM, undefined) -> @@ -152,24 +158,6 @@ start_error_logger_handler(_, HWM, {ok, WhiteList}) -> throw({error, bad_config}) end, - %% Conditionally start the high watermark killer. - case application:get_env(lager, killer_hwm) of - undefined -> ok; - {ok, undefined} -> ok; - {ok, KillerHWM} when is_integer(KillerHWM), KillerHWM >= 0 -> - KillerReinstallAfter = case application:get_env(lager, killer_reinstall_after) of - undefined -> 5000; - {ok, undefined} -> 5000; - {ok, V} when is_integer(V), V >= 0 -> V; - {ok, BadKillerReinstallAfter} -> - error_logger:error_msg("Invalid value for 'cooldown': ~p~n", - [BadKillerReinstallAfter]), - throw({error, bad_config}) - end, - _ = supervisor:start_child(lager_handler_watcher_sup, - [?DEFAULT_SINK, lager_manager_killer, - [KillerHWM, KillerReinstallAfter]]) - end, case supervisor:start_child(lager_handler_watcher_sup, [error_logger, error_logger_lager_h, [HWM, GlStrategy]]) of {ok, _} -> @@ -189,16 +177,6 @@ start_error_logger_handler(_, HWM, {ok, WhiteList}) -> end, Handlers. -%% `determine_async_behavior/3' is called with the results from either -%% `application:get_env/2' and `proplists:get_value/2'. Since -%% `application:get_env/2' wraps a successful retrieval in an `{ok, -%% Value}' tuple, do the same for the result from -%% `proplists:get_value/2'. -wrap_proplist_value(undefined) -> - undefined; -wrap_proplist_value(Value) -> - {ok, Value}. - configure_sink(Sink, SinkDef) -> lager_config:new_sink(Sink), ChildId = lager_util:make_internal_sink_name(Sink), @@ -207,12 +185,11 @@ configure_sink(Sink, SinkDef) -> {gen_event, start_link, [{local, Sink}]}, permanent, 5000, worker, dynamic}), - determine_async_behavior(Sink, - wrap_proplist_value( - proplists:get_value(async_threshold, SinkDef)), - wrap_proplist_value( - proplists:get_value(async_threshold_window, SinkDef)) + determine_async_behavior(Sink, proplists:get_value(async_threshold, SinkDef), + proplists:get_value(async_threshold_window, SinkDef) ), + maybe_install_sink_killer(Sink, proplists:get_value(killer_hwm, SinkDef), + proplists:get_value(killer_reinstall_after, SinkDef)), start_handlers(Sink, proplists:get_value(handlers, SinkDef, [])), @@ -223,6 +200,8 @@ configure_extra_sinks(Sinks) -> lists:foreach(fun({Sink, Proplist}) -> configure_sink(Sink, Proplist) end, Sinks). +get_env(Application, Key) -> + get_env(Application, Key, undefined). %% R15 doesn't know about application:get_env/3 get_env(Application, Key, Default) -> get_env_default(application:get_env(Application, Key), @@ -236,35 +215,49 @@ get_env_default({ok, Value}, _Default) -> start(_StartType, _StartArgs) -> {ok, Pid} = lager_sup:start_link(), SavedHandlers = boot(), + _ = boot('__all_extra'), + _ = boot('__traces'), + clean_up_config_checks(), {ok, Pid, SavedHandlers}. boot() -> %% Handle the default sink. determine_async_behavior(?DEFAULT_SINK, - application:get_env(lager, async_threshold), - application:get_env(lager, async_threshold_window)), + get_env(lager, async_threshold), + get_env(lager, async_threshold_window)), + + maybe_install_sink_killer(?DEFAULT_SINK, get_env(lager, killer_hwm), + get_env(lager, killer_reinstall_after)), + start_handlers(?DEFAULT_SINK, get_env(lager, handlers, ?DEFAULT_HANDLER_CONF)), - lager:update_loglevel_config(?DEFAULT_SINK), SavedHandlers = start_error_logger_handler( - application:get_env(lager, error_logger_redirect), - interpret_hwm(application:get_env(lager, error_logger_hwm)), - application:get_env(lager, error_logger_whitelist) + get_env(lager, error_logger_redirect), + interpret_hwm(get_env(lager, error_logger_hwm)), + get_env(lager, error_logger_whitelist) ), - _ = lager_util:trace_filter(none), + SavedHandlers. - %% Now handle extra sinks - configure_extra_sinks(get_env(lager, extra_sinks, [])), +boot('__traces') -> + _ = lager_util:trace_filter(none), + ok = add_configured_traces(); - ok = add_configured_traces(), +boot('__all_extra') -> + configure_extra_sinks(get_env(lager, extra_sinks, [])); - clean_up_config_checks(), +boot(?DEFAULT_SINK) -> boot(); +boot(Sink) -> + AllSinksDef = get_env(lager, extra_sinks, []), + boot_sink(Sink, lists:keyfind(Sink, 1, AllSinksDef)). - SavedHandlers. +boot_sink(Sink, {Sink, Def}) -> + configure_sink(Sink, Def); +boot_sink(Sink, false) -> + configure_sink(Sink, []). stop(Handlers) -> lists:foreach(fun(Handler) -> From e6e7f6645fd5590bec25a1b2afa2f1b2071c057b Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Tue, 29 Mar 2016 10:48:19 -0500 Subject: [PATCH 19/27] Send the sink name with reboot message Previously, was always hardcoded to the default sink name. --- src/lager_handler_watcher.erl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index 3d13cd3..df92584 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -64,10 +64,14 @@ handle_info({gen_event_EXIT, Module, shutdown}, #state{module=Module} = State) - handle_info({gen_event_EXIT, Module, {'EXIT', {kill_me, [_KillerHWM, KillerReinstallAfter]}}}, #state{module=Module, sink=Sink} = State) -> %% Brutally kill the manager but stay alive to restore settings. - Manager = whereis(Sink), - unlink(Manager), - exit(Manager, kill), - erlang:send_after(KillerReinstallAfter, self(), reboot), + %% + %% SinkPid here means the gen_event process. Handlers *all* live inside the + %% same gen_event process space, so when the Pid is killed, *all* of the + %% pending log messages in its mailbox will die too. + SinkPid = whereis(Sink), + unlink(SinkPid), + exit(SinkPid, kill), + erlang:send_after(KillerReinstallAfter, self(), {reboot, Sink}), {noreply, State}; handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, config=Config, sink=Sink} = State) -> @@ -83,8 +87,8 @@ handle_info({gen_event_EXIT, Module, Reason}, #state{module=Module, handle_info(reinstall_handler, #state{module=Module, config=Config, sink=Sink} = State) -> install_handler(Sink, Module, Config), {noreply, State}; -handle_info(reboot, State) -> - lager_app:boot(), +handle_info({reboot, Sink}, State) -> + lager_app:boot(Sink), {noreply, State}; handle_info(stop, State) -> {stop, normal, State}; From ec43800bd5bf0286c5d591fbda0b2d22fccf4d7b Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Tue, 29 Mar 2016 16:28:56 -0500 Subject: [PATCH 20/27] Add a sleep to prevent losing race condition Lager logs its own application start up message. Previously this message was discarded or not received quickly enough to cause test failures, but now it seems as though it is being received and causing several test failures even though we are explicitly flushing it away. We introduce a 5 millisecond sleep for several tests to make sure the start up message is received before the flush command dumps it. --- src/lager_file_backend.erl | 11 +++++++++-- src/lager_handler_watcher.erl | 7 +++---- test/lager_test_backend.erl | 12 ++++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/lager_file_backend.erl b/src/lager_file_backend.erl index 2d50165..be6e059 100644 --- a/src/lager_file_backend.erl +++ b/src/lager_file_backend.erl @@ -520,7 +520,12 @@ filesystem_test_() -> application:set_env(lager, handlers, [{lager_test_backend, info}]), application:set_env(lager, error_logger_redirect, false), application:set_env(lager, async_threshold, undefined), - lager:start() + lager:start(), + %% race condition where lager logs its own start up + %% makes several tests fail. See test/lager_test_backend + %% around line 800 for more information. + timer:sleep(5), + lager_test_backend:flush() end, fun(_) -> file:delete("test.log"), @@ -866,7 +871,9 @@ formatting_test_() -> application:load(lager), application:set_env(lager, handlers, [{lager_test_backend, info}]), application:set_env(lager, error_logger_redirect, false), - lager:start() + lager:start(), + %% same race condition issue + timer:sleep(5) end, fun(_) -> file:delete("test.log"), diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index df92584..e1ea43f 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -150,11 +150,10 @@ reinstall_on_initial_failure_test_() -> application:unset_env(lager, crash_log), lager:start(), try - ?assertEqual(1, lager_test_backend:count()), {_Level, _Time, Message, _Metadata} = lager_test_backend:pop(), ?assertMatch("Lager failed to install handler lager_crash_backend into lager_event, retrying later :"++_, lists:flatten(Message)), - ?assertEqual(0, lager_test_backend:count()), timer:sleep(6000), + lager_test_backend:flush(), ?assertEqual(0, lager_test_backend:count()), ?assert(lists:member(lager_crash_backend, gen_event:which_handlers(lager_event))) after @@ -177,10 +176,10 @@ reinstall_on_runtime_failure_test_() -> application:unset_env(lager, crash_log), lager:start(), try - ?assertEqual(0, lager_test_backend:count()), ?assert(lists:member(lager_crash_backend, gen_event:which_handlers(lager_event))), timer:sleep(6000), - ?assertEqual(2, lager_test_backend:count()), + _ = lager_test_backend:pop(), %% throw away application start up message + _ = lager_test_backend:pop(), %% throw away gen_event crash message {_Severity, _Date, Msg, _Metadata} = lager_test_backend:pop(), ?assertEqual("Lager event handler lager_crash_backend exited with reason crash", lists:flatten(Msg)), {_Severity2, _Date2, Msg2, _Metadata2} = lager_test_backend:pop(), diff --git a/test/lager_test_backend.erl b/test/lager_test_backend.erl index d8af4d4..c97dbb8 100644 --- a/test/lager_test_backend.erl +++ b/test/lager_test_backend.erl @@ -796,6 +796,18 @@ setup() -> application:set_env(lager, error_logger_redirect, false), application:unset_env(lager, traces), lager:start(), + %% There is a race condition between the application start up, lager logging its own + %% start up condition and several tests that count messages or parse the output of + %% tests. When the lager start up message wins the race, it causes these tests + %% which parse output or count message arrivals to fail. + %% + %% We introduce a sleep here to allow `flush' to arrive *after* the start up + %% message has been received and processed. + %% + %% This race condition was first exposed during the work on + %% 4b5260c4524688b545cc12da6baa2dfa4f2afec9 which introduced the lager + %% manager killer PR. + timer:sleep(5), gen_event:call(lager_event, ?MODULE, flush). cleanup(_) -> From 9393c5bd3bff1d14a495eef59155d3747c274c19 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Wed, 6 Apr 2016 23:10:01 -0500 Subject: [PATCH 21/27] Test manager killer on alternate sink --- test/lager_manager_killer_test.erl | 51 +++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/test/lager_manager_killer_test.erl b/test/lager_manager_killer_test.erl index 7a0622a..96199b0 100644 --- a/test/lager_manager_killer_test.erl +++ b/test/lager_manager_killer_test.erl @@ -6,6 +6,9 @@ -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). +-define(TEST_SINK_NAME, '__lager_test_sink'). %% <-- used by parse transform +-define(TEST_SINK_EVENT, '__lager_test_sink_lager_event'). %% <-- used by lager API calls and internals for gen_event + overload_test_() -> {timeout, 60, fun() -> @@ -26,7 +29,7 @@ overload_test_() -> [lager:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], Margin = 100, ok = confirm_manager_exit(Manager, Delay+Margin), - ok = confirm_sink_reregister(Margin), + ok = confirm_sink_reregister(lager_event, Margin), erlang:trace(all, false, [procs]), wait_until(fun() -> case proplists:get_value(lager_manager_killer, gen_event:which_handlers(lager_event)) of @@ -43,6 +46,46 @@ overload_test_() -> application:stop(lager) end}. +overload_alternate_sink_test_() -> + {timeout, 60, + fun() -> + application:stop(lager), + application:load(lager), + Delay = 1000, % sleep 1 sec on every log + KillerHWM = 10, % kill the manager if there are more than 10 pending logs + KillerReinstallAfter = 1000, % reinstall killer after 1 sec + application:set_env(lager, handlers, []), + application:set_env(lager, extra_sinks, [{?TEST_SINK_EVENT, [ + {handlers, [{lager_slow_backend, [{delay, Delay}]}]}, + {killer_hwm, KillerHWM}, + {killer_reinstall_after, KillerReinstallAfter}, + {async_threshold, undefined} + ]}]), + application:set_env(lager, error_logger_redirect, true), + ensure_started(lager), + lager_config:set({?TEST_SINK_EVENT, async}, true), + Manager = whereis(?TEST_SINK_EVENT), + erlang:trace(all, true, [procs]), + [?TEST_SINK_NAME:info("~p'th message", [N]) || N <- lists:seq(1,KillerHWM+2)], + Margin = 100, + ok = confirm_manager_exit(Manager, Delay+Margin), + ok = confirm_sink_reregister(?TEST_SINK_EVENT, Margin), + erlang:trace(all, false, [procs]), + wait_until(fun() -> + case proplists:get_value(lager_manager_killer, gen_event:which_handlers(?TEST_SINK_EVENT)) of + [] -> false; + _ -> true + end + end, Margin, 15), + wait_until(fun() -> + case gen_event:call(?TEST_SINK_EVENT, lager_manager_killer, get_settings) of + [KillerHWM, KillerReinstallAfter] -> true; + _Other -> false + end + end, Margin, 15), + application:stop(lager) + end}. + ensure_started(App) -> case application:start(App) of ok -> @@ -63,10 +106,10 @@ confirm_manager_exit(Manager, Delay) -> ?assert(false) end. -confirm_sink_reregister(Delay) -> +confirm_sink_reregister(Sink, Delay) -> receive - {trace, _Pid, register, lager_event} -> - ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(lager_event))) + {trace, _Pid, register, Sink} -> + ?assertNot(lists:member(lager_manager_killer, gen_event:which_handlers(Sink))) after Delay -> ?assert(false) end. From af9947da25dc64abab2df1be2af9962e9f17c64e Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Wed, 6 Apr 2016 23:20:07 -0500 Subject: [PATCH 22/27] Fix dialyzer warnings --- src/lager_app.erl | 4 ++-- src/lager_handler_watcher.erl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lager_app.erl b/src/lager_app.erl index 3529585..497681d 100644 --- a/src/lager_app.erl +++ b/src/lager_app.erl @@ -188,7 +188,7 @@ configure_sink(Sink, SinkDef) -> determine_async_behavior(Sink, proplists:get_value(async_threshold, SinkDef), proplists:get_value(async_threshold_window, SinkDef) ), - maybe_install_sink_killer(Sink, proplists:get_value(killer_hwm, SinkDef), + _ = maybe_install_sink_killer(Sink, proplists:get_value(killer_hwm, SinkDef), proplists:get_value(killer_reinstall_after, SinkDef)), start_handlers(Sink, proplists:get_value(handlers, SinkDef, [])), @@ -226,7 +226,7 @@ boot() -> get_env(lager, async_threshold), get_env(lager, async_threshold_window)), - maybe_install_sink_killer(?DEFAULT_SINK, get_env(lager, killer_hwm), + _ = maybe_install_sink_killer(?DEFAULT_SINK, get_env(lager, killer_hwm), get_env(lager, killer_reinstall_after)), start_handlers(?DEFAULT_SINK, diff --git a/src/lager_handler_watcher.erl b/src/lager_handler_watcher.erl index e1ea43f..d5fe140 100644 --- a/src/lager_handler_watcher.erl +++ b/src/lager_handler_watcher.erl @@ -88,7 +88,7 @@ handle_info(reinstall_handler, #state{module=Module, config=Config, sink=Sink} = install_handler(Sink, Module, Config), {noreply, State}; handle_info({reboot, Sink}, State) -> - lager_app:boot(Sink), + _ = lager_app:boot(Sink), {noreply, State}; handle_info(stop, State) -> {stop, normal, State}; From 82788b996c19b11c45756281af5ca4381d5e6739 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Thu, 7 Apr 2016 00:08:07 -0500 Subject: [PATCH 23/27] Make sure we flush after app restart --- test/lager_test_backend.erl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/lager_test_backend.erl b/test/lager_test_backend.erl index c97dbb8..a2503ef 100644 --- a/test/lager_test_backend.erl +++ b/test/lager_test_backend.erl @@ -515,6 +515,8 @@ lager_test_() -> application:stop(lager), application:set_env(lager, traces, [{lager_test_backend, [{foo, bar}], debug}]), lager:start(), + timer:sleep(5), + flush(), lager:debug([{foo, bar}], "hello world"), ?assertEqual(1, count()), application:unset_env(lager, traces), From d2dfec2a324659080b8f09752a686a007c0e9ced Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Thu, 7 Apr 2016 00:32:44 -0500 Subject: [PATCH 24/27] Add 10 second timeout for trace stop --- test/lager_test_backend.erl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/lager_test_backend.erl b/test/lager_test_backend.erl index a2503ef..7982481 100644 --- a/test/lager_test_backend.erl +++ b/test/lager_test_backend.erl @@ -539,6 +539,7 @@ lager_test_() -> end }, {"stopped trace stops and removes its event handler - default sink (gh#267)", + {timeout, 10, fun() -> Sink = ?DEFAULT_SINK, StartHandlers = gen_event:which_handlers(Sink), @@ -562,14 +563,14 @@ lager_test_() -> ?assertEqual(length(StartGlobal)+1, length(lager_config:global_get(handlers))), ok = lager:stop_trace(TestTrace2), - EndHandlers = gen_event:which_handlers(?DEFAULT_SINK), + EndHandlers = gen_event:which_handlers(Sink), EndGlobal = lager_config:global_get(handlers), {_, T3} = lager_config:get({Sink, loglevel}), ?assertEqual([], T3), ?assertEqual(StartHandlers, EndHandlers), ?assertEqual(StartGlobal, EndGlobal), ok - end + end} }, {"record printing works", fun() -> From f947d65a0236315d39fe9dc85c65ec17324f7ea8 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Thu, 7 Apr 2016 00:59:23 -0500 Subject: [PATCH 25/27] Document sink killer in README --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1bec42f..7e95291 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ Features * Syslog style log level comparison flags * Colored terminal output (requires R16+) * Map support (requires 17+) +* Optional load shedding by setting a high water mark to kill (and reinstall) + a sink after a configurable cool down timer Usage ----- @@ -189,14 +191,19 @@ for the backend: ]}. ``` -Included is `lager_default_formatter`. This provides a generic, default formatting for log messages using a structure similar to Erlang's [iolist](http://learnyousomeerlang.com/buckets-of-sockets#io-lists) which we call "semi-iolist": +Included is `lager_default_formatter`. This provides a generic, default +formatting for log messages using a structure similar to Erlang's +[iolist](http://learnyousomeerlang.com/buckets-of-sockets#io-lists) which we +call "semi-iolist": * Any traditional iolist elements in the configuration are printed verbatim. -* Atoms in the configuration are treated as placeholders for lager metadata and extracted from the log message. +* Atoms in the configuration are treated as placeholders for lager metadata and + extracted from the log message. * The placeholders `date`, `time`, `message`, `sev` and `severity` will always exist. - * `sev` is an abbreviated severity which is interpreted as a capitalized single letter encoding of the severity level - (e.g. `'debug'` -> `$D`) - * The placeholders `pid`, `file`, `line`, `module`, `function`, and `node` will always exist if the parse transform is used. + * `sev` is an abbreviated severity which is interpreted as a capitalized + single letter encoding of the severity level (e.g. `'debug'` -> `$D`) + * The placeholders `pid`, `file`, `line`, `module`, `function`, and `node` + will always exist if the parse transform is used. * Applications can define their own metadata placeholder. * A tuple of `{atom(), semi-iolist()}` allows for a fallback for the atom placeholder. If the value represented by the atom @@ -235,6 +242,7 @@ specified via the `crash_log_msg_size` application variable. Messages from `error_logger` will be redirected to `error_logger_lager_event` sink if it is defined so it can be redirected to another log file. + For example: ``` @@ -249,15 +257,18 @@ For example: }] }]. ``` -Will send all `error_logger` messages to `error_logger.log` file. +will send all `error_logger` messages to `error_logger.log` file. + Overload Protection ------------------- +### Asynchronous mode + Prior to lager 2.0, the `gen_event` at the core of lager operated purely in synchronous mode. Asynchronous mode is faster, but has no protection against -message queue overload. In lager 2.0, the `gen_event` takes a hybrid approach. it -polls its own mailbox size and toggles the messaging between synchronous and -asynchronous depending on mailbox size. +message queue overload. As of lager 2.0, the `gen_event` takes a hybrid +approach. it polls its own mailbox size and toggles the messaging between +synchronous and asynchronous depending on mailbox size. ```erlang {async_threshold, 20}, @@ -283,6 +294,31 @@ related processes crash, you can set a limit: It is probably best to keep this number small. +### Sink Killer + +In some high volume situations, it may be preferable to drop all pending log +messages instead of letting them drain over time. + +If you prefer, you may choose to use the sink killer to shed load. In this +operational mode, if the `gen_event` mailbox exceeds a configurable +high water mark, the sink will be killed and reinstalled after a +configurable cool down time. + +You can configure this behavior by using these configuration directives: + +```erlang +{killer_hwm, 1000}, +{killer_reinstall_after, 5000} +``` + +This means if the sink's mailbox size exceeds 1000 messages, kill the +entire sink and reload it after 5000 milliseconds. This behavior can +also be installed into alternative sinks if desired. + +By default, the manager killer *is not installed* into any sink. If +the `killer_reinstall_after` cool down time is not specified it defaults +to 5000. + "Unsafe" -------- The unsafe code pathway bypasses the normal lager formatting code and uses the From 2f9ce2513d4a3df6333dd8638477aedc43435801 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Thu, 7 Apr 2016 10:30:30 -0500 Subject: [PATCH 26/27] Reduce amount of backend spawns --- test/lager_test_backend.erl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/lager_test_backend.erl b/test/lager_test_backend.erl index 7982481..36409a3 100644 --- a/test/lager_test_backend.erl +++ b/test/lager_test_backend.erl @@ -1570,7 +1570,8 @@ async_threshold_test_() -> ?assertEqual(true, lager_config:get(async)), %% put a ton of things in the queue - Workers = [spawn_monitor(fun() -> [lager:info("hello world") || _ <- lists:seq(1, 1000)] end) || _ <- lists:seq(1, 15)], + Workers = [spawn_monitor(fun() -> [lager:info("hello world") || _ <- lists:seq(1, 100)] end) + || _ <- lists:seq(1, 10)], %% serialize on mailbox _ = gen_event:which_handlers(lager_event), @@ -1588,7 +1589,9 @@ async_threshold_test_() -> %% point in the past ?assertMatch([{sync_toggled, N}] when N > 0, ets:lookup(async_threshold_test, sync_toggled)), - %% wait for all the workers to return, meaning that all the messages have been logged (since we're definitely in sync mode at the end of the run) + %% wait for all the workers to return, meaning that all + %% the messages have been logged (since we're + %% definitely in sync mode at the end of the run) collect_workers(Workers), %% serialize on the mailbox again _ = gen_event:which_handlers(lager_event), From 7b752f99375f1cd3f795b6bce3831561d24f81af Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Fri, 8 Apr 2016 15:27:52 -0500 Subject: [PATCH 27/27] Add a message about checking mailbox length --- src/lager_manager_killer.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lager_manager_killer.erl b/src/lager_manager_killer.erl index 3f15dc4..d9a5edb 100644 --- a/src/lager_manager_killer.erl +++ b/src/lager_manager_killer.erl @@ -29,7 +29,9 @@ handle_call(kill_self, #state{killer_hwm=KillerHWM, killer_reinstall_after=Kille exit({kill_me, [KillerHWM, KillerReinstallAfter]}); handle_call(_Request, State) -> {ok, ok, State}. - +%% It's not the best idea in the world to check the queue length for every +%% log message. We can make this operation work on a poll timer in the +%% future. handle_event({log, _Message}, State = #state{killer_hwm=KillerHWM, killer_reinstall_after=KillerReinstallAfter}) -> {message_queue_len, Len} = process_info(self(), message_queue_len), case Len > KillerHWM of