1: %%
    2: %% %CopyrightBegin%
    3: %%
    4: %% Copyright Ericsson AB 1996-2011. All Rights Reserved.
    5: %%
    6: %% The contents of this file are subject to the Erlang Public License,
    7: %% Version 1.1, (the "License"); you may not use this file except in
    8: %% compliance with the License. You should have received a copy of the
    9: %% Erlang Public License along with this software. If not, it can be
   10: %% retrieved online at http://www.erlang.org/.
   11: %%
   12: %% Software distributed under the License is distributed on an "AS IS"
   13: %% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
   14: %% the License for the specific language governing rights and limitations
   15: %% under the License.
   16: %%
   17: %% %CopyrightEnd%
   18: %%
   19: -module(disksup_SUITE).
   20: -include_lib("test_server/include/test_server.hrl").
   21: 
   22: %% Test server specific exports
   23: -export([all/0, suite/0,groups/0,init_per_group/2,end_per_group/2]).
   24: -export([init_per_suite/1, end_per_suite/1]).
   25: -export([init_per_testcase/2, end_per_testcase/2]).
   26: 
   27: %% Test cases
   28: -export([api/1, config/1, alarm/1]).
   29: -export([port/1]).
   30: -export([terminate/1, unavailable/1, restart/1]).
   31: -export([otp_5910/1]).
   32: 
   33: %% Default timetrap timeout (set in init_per_testcase)
   34: -define(default_timeout, ?t:minutes(1)).
   35: 
   36: init_per_suite(Config) when is_list(Config) ->
   37:     ok = application:start(os_mon),
   38:     Config.
   39: 
   40: end_per_suite(Config) when is_list(Config) ->
   41:     ok = application:stop(os_mon),
   42:     Config.
   43: 
   44: init_per_testcase(unavailable, Config) ->
   45:     terminate(Config),
   46:     init_per_testcase(dummy, Config);
   47: init_per_testcase(_Case, Config) ->
   48:     Dog = ?t:timetrap(?default_timeout),
   49:     [{watchdog,Dog} | Config].
   50: 
   51: end_per_testcase(unavailable, Config) ->
   52:     restart(Config),
   53:     end_per_testcase(dummy, Config);
   54: end_per_testcase(_Case, Config) ->
   55:     Dog = ?config(watchdog, Config),
   56:     ?t:timetrap_cancel(Dog),
   57:     ok.
   58: 
   59: suite() -> [{ct_hooks,[ts_install_cth]}].
   60: 
   61: all() -> 
   62:     Bugs = [otp_5910],
   63:     case test_server:os_type() of
   64: 	{unix, sunos} ->
   65: 	    [api, config, alarm, port, unavailable] ++ Bugs;
   66: 	{unix, _OSname} -> [api, alarm] ++ Bugs;
   67: 	{win32, _OSname} -> [api, alarm] ++ Bugs;
   68: 	_OS -> [unavailable]
   69:     end.
   70: 
   71: groups() -> 
   72:     [].
   73: 
   74: init_per_group(_GroupName, Config) ->
   75:     Config.
   76: 
   77: end_per_group(_GroupName, Config) ->
   78:     Config.
   79: 
   80: 
   81: api(suite) -> [];
   82: api(doc) -> ["Test of API functions"];
   83: api(Config) when is_list(Config) ->
   84: 
   85:     %% get_disk_data()
   86:     [{Id,KByte,Capacity}|_] = get_disk_data(),
   87:     true = io_lib:printable_list(Id),
   88:     true = is_integer(KByte),
   89:     true = is_integer(Capacity),
   90:     true = Capacity>0,
   91:     true = KByte>0,
   92: 
   93:     %% get_check_interval()
   94:     1800000 = disksup:get_check_interval(),
   95: 
   96:     %% set_check_interval(Minutes)
   97:     ok = disksup:set_check_interval(20),
   98:     1200000 = disksup:get_check_interval(),
   99:     {'EXIT',{badarg,_}} = (catch disksup:set_check_interval(0.5)),
  100:     1200000 = disksup:get_check_interval(),
  101:     ok = disksup:set_check_interval(30),
  102: 
  103:     %% get_almost_full_threshold()
  104:     80 = disksup:get_almost_full_threshold(),
  105: 
  106:     %% set_almost_full_threshold(Float)
  107:     ok = disksup:set_almost_full_threshold(0.90),
  108:     90 = disksup:get_almost_full_threshold(),
  109:     {'EXIT',{badarg,_}} =
  110: 	(catch disksup:set_almost_full_threshold(-0.5)),
  111:     90 = disksup:get_almost_full_threshold(),
  112:     ok = disksup:set_almost_full_threshold(0.80),
  113: 
  114:     ok.
  115: 
  116: config(suite) -> [];
  117: config(doc) -> ["Test configuration"];
  118: config(Config) when is_list(Config) ->
  119: 
  120:     %% Change configuration parameters and make sure change is reflected
  121:     %% when disksup is restarted
  122:     ok = application:set_env(os_mon, disk_space_check_interval, 29),
  123:     ok = application:set_env(os_mon, disk_almost_full_threshold, 0.81),
  124: 
  125:     ok = supervisor:terminate_child(os_mon_sup, disksup),
  126:     {ok, _Child1} = supervisor:restart_child(os_mon_sup, disksup),
  127: 
  128:     1740000 = disksup:get_check_interval(),
  129:     81 = disksup:get_almost_full_threshold(),
  130: 
  131:     %% Also try this with bad parameter values, should be ignored
  132:     ok =
  133: 	application:set_env(os_mon, disk_space_check_interval, 0.5),
  134:     ok =
  135: 	application:set_env(os_mon, disk_almost_full_threshold, -0.81),
  136: 
  137:     ok = supervisor:terminate_child(os_mon_sup, disksup),
  138:     {ok, _Child2} = supervisor:restart_child(os_mon_sup, disksup),
  139: 
  140:     1800000 = disksup:get_check_interval(),
  141:     80 = disksup:get_almost_full_threshold(),
  142: 
  143:     %% Reset configuration parameters
  144:     ok = application:set_env(os_mon, disk_space_check_interval, 30),
  145:     ok = application:set_env(os_mon, disk_almost_full_threshold, 0.80),
  146:     ok.
  147: 
  148: %%----------------------------------------------------------------------
  149: %% NOTE: The test case is a bit weak as it will fail if the disk usage
  150: %% changes too much during its course, or if there are timing problems
  151: %% with the alarm_handler receiving the alarms too late
  152: %%----------------------------------------------------------------------
  153: alarm(suite) -> [];
  154: alarm(doc) -> ["Test that alarms are set and cleared"];
  155: alarm(Config) when is_list(Config) ->
  156: 
  157:     %% Find out how many disks exceed the threshold
  158:     %% and make sure the corresponding number of alarms is set
  159:     Threshold1 = disksup:get_almost_full_threshold(), % 80
  160:     Data1 = disksup:get_disk_data(),
  161:     Over1 = over_threshold(Data1, Threshold1),
  162:     Alarms1 = get_alarms(),
  163:     if
  164: 	Over1==length(Alarms1) ->
  165: 	    true;
  166: 	true ->
  167: 	    dump_info(),
  168: 	    ?t:fail({bad_alarms, Threshold1, Data1, Alarms1})
  169:     end,
  170: 
  171:     %% Try to find a disk with space usage below Threshold1,
  172:     %% lower the threshold accordingly and make sure new alarms are set
  173:     Fun1 = fun({_Id, _Kbyte, Capacity}) ->
  174: 		   if
  175: 		       Capacity>0, Capacity<Threshold1 -> true;
  176: 		       true -> false
  177: 		   end
  178: 	   end,
  179:     case until(Fun1, Data1) of
  180: 	      {_, _, Cap1} ->
  181: 		  Threshold2 = Cap1-1,
  182: 		  ok =
  183: 		      disksup:set_almost_full_threshold(Threshold2/100),
  184: 		  disksup ! timeout, % force a disk check
  185: 		  Data2 = disksup:get_disk_data(),
  186: 		  Over2 = over_threshold(Data2, Threshold2),
  187: 		  Alarms2 = get_alarms(),
  188: 		  if
  189: 		      Over2==length(Alarms2), Over2>Over1 ->
  190: 			  true;
  191: 		      true ->
  192: 			  dump_info(),
  193: 			  ?t:fail({bad_alarms, Threshold2, Data2, Alarms2})
  194: 		  end;
  195: 	      false ->
  196: 		  ignore
  197: 	  end,
  198: 
  199:     %% Find out the highest space usage among all disks
  200:     %% and try to raise the threshold above this value,
  201:     %% make sure all alarms are cleared
  202:     Fun2 = fun({_Id, _Kbyte, Capacity}, MaxAcc) ->
  203: 		   if
  204: 		       Capacity>MaxAcc -> Capacity;
  205: 		       true -> MaxAcc
  206: 		   end
  207: 	   end,
  208:     case lists:foldl(Fun2, 0, Data1) of
  209: 	      Max when Max<100 ->
  210: 		  Threshold3 = Max+1,
  211: 		  ok = disksup:set_almost_full_threshold(Threshold3/100),
  212: 		  disksup ! timeout, % force a disk check
  213: 		  Data3   = disksup:get_disk_data(),
  214: 		  Over3   = over_threshold(Data3, Threshold3),
  215: 		  Alarms3 = get_alarms(),
  216: 		  if
  217: 		      Over3==0, length(Alarms3)==0 ->
  218: 			  ok;
  219: 		      true ->
  220: 			  dump_info(),
  221: 			  ?t:fail({bad_alarms, Threshold3, Data3, Alarms3})
  222: 		  end;
  223: 	      100 ->
  224: 		  ignore
  225: 	  end,
  226: 
  227:     %% Reset threshold
  228:     ok = disksup:set_almost_full_threshold(Threshold1/100),
  229:     ok.
  230: 
  231: over_threshold(Data, Threshold) ->
  232:     Data2 = remove_duplicated_disks(lists:keysort(1, Data)),
  233:     lists:foldl(fun
  234: 	    ({_Id, _Kbyte, Cap}, N) when Cap>=Threshold -> N+1;
  235: 	    (_DiskData, N) -> N
  236: 	end, 0, Data2).
  237: 
  238: %% On some platforms (for example MontaVista) data for one disk can be
  239: %% "duplicated":
  240: %%  Linux ppb 2.4.20_mvl31-pcore680 #1 Sun Feb 1 23:12:56 PST 2004 ppc unknown
  241: %%
  242: %%  MontaVista(R) Linux(R) Professional Edition 3.1
  243: %%
  244: %%  [ppb:~]> /bin/df -lk
  245: %%  Filesystem           1k-blocks      Used Available Use% Mounted on
  246: %%  rootfs                 8066141   3023763   4961717  38% /
  247: %%  /dev/root              8066141   3023763   4961717  38% /
  248: %%  tmpfs                   192892         0    192892   0% /dev/shm
  249: %%
  250: %% disksup:
  251: %%  [{"/",8066141,38}, {"/",8066141,38}, {"/dev/shm",192892,0}]
  252: %%
  253: %% disksup will only set ONE alarm for "/".
  254: %% Therefore the list of disk data must be sorted and duplicated disk
  255: %% tuples removed before calculating how many alarms should be set, or
  256: %% the testcase will fail erroneously.
  257: remove_duplicated_disks([{Id, _, _}, {Id, Kbyte, Cap}|T]) ->
  258:     remove_duplicated_disks([{Id, Kbyte, Cap}|T]);
  259: remove_duplicated_disks([H|T]) ->
  260:     [H|remove_duplicated_disks(T)];
  261: remove_duplicated_disks([]) ->
  262:     [].
  263: 
  264: get_alarms() ->
  265:     lists:filter(fun
  266: 	    ({{disk_almost_full, _Disk},_}) -> true;
  267: 	    (_) -> false
  268: 	end, alarm_handler:get_alarms()).
  269: 
  270: until(Fun, [H|T]) ->
  271:     case Fun(H) of
  272: 	true -> H;
  273: 	false -> until(Fun, T)
  274:     end;
  275: until(_Fun, []) -> false.
  276: 
  277: port(suite) -> [];
  278: port(doc) ->
  279:     ["Test that disksup handles a terminating port program"];
  280: port(Config) when is_list(Config) ->
  281:     Str = os:cmd("ps -ef | grep '[d]isksup'"),
  282:     case io_lib:fread("~s ~s", Str) of
  283: 	{ok, [_Uid,Pid], _Rest} ->
  284: 
  285: 	    %% Monitor disksup
  286: 	    MonRef = erlang:monitor(process, disksup),
  287: 	    [{_Disk1,Kbyte1,_Cap1}|_] = disksup:get_disk_data(),
  288: 	    true = Kbyte1>0,
  289: 
  290: 	    %% Kill the port program
  291: 	    case os:cmd("kill -9 " ++ Pid) of
  292: 		[] ->
  293: 
  294: 		    %% disksup should now terminate
  295: 		    receive
  296: 			{'DOWN', MonRef, _, _, {port_died, _Reason}} ->
  297: 			    ok;
  298: 			{'DOWN', MonRef, _, _, Reason} ->
  299: 			    ?t:fail({unexpected_exit_reason, Reason})
  300: 		    after
  301: 			3000 ->
  302: 			    ?t:fail({still_alive, Str})
  303: 		    end,
  304: 
  305: 		    %% Give os_mon_sup time to restart disksup
  306: 		    ?t:sleep(?t:seconds(3)),
  307: 		    [{_Disk2,Kbyte2,_Cap2}|_] = disksup:get_disk_data(),
  308: 		    true = Kbyte2>0,
  309: 
  310: 		    ok;
  311: 
  312: 		Line ->
  313: 		    erlang:demonitor(MonRef),
  314: 		    {skip, {not_killed, Line}}
  315: 	    end;
  316: 	_ ->
  317: 	    {skip, {os_pid_not_found, Str}}
  318:     end.
  319: 
  320: terminate(suite) -> [];
  321: terminate(Config) when is_list(Config) ->
  322:     ok = application:set_env(os_mon, start_disksup, false),
  323:     ok = supervisor:terminate_child(os_mon_sup, disksup),
  324:     ok.
  325: 
  326: unavailable(suite) -> [];
  327: unavailable(doc) ->
  328:     ["Test correct behaviour when service is unavailable"];
  329: unavailable(Config) when is_list(Config) ->
  330: 
  331:     %% Make sure all API functions return their dummy values
  332:     [{"none",0,0}] = disksup:get_disk_data(),
  333:     1800000 = disksup:get_check_interval(),
  334:     ok = disksup:set_check_interval(5),
  335:     80 = disksup:get_almost_full_threshold(),
  336:     ok = disksup:set_almost_full_threshold(0.9),
  337:     ok.
  338: 
  339: restart(suite) ->
  340:     [];
  341: restart(Config) when is_list(Config) ->
  342:     ok = application:set_env(os_mon, start_disksup, true),
  343:     {ok, _Pid} = supervisor:restart_child(os_mon_sup, disksup),
  344:     ok.
  345: 
  346: otp_5910(suite) -> [];
  347: otp_5910(doc) ->
  348:     ["Test that alarms are cleared if disksup crashes or "
  349:      "if OS_Mon is stopped"];
  350: otp_5910(Config) when is_list(Config) ->
  351: 
  352:     %% Make sure disksup sets at least one alarm
  353:     Data = disksup:get_disk_data(),
  354:     Threshold0 = disksup:get_almost_full_threshold(),
  355:     Threshold  = case over_threshold(Data, Threshold0) of
  356: 	0 ->
  357: 	    [{_Id,_Kbyte,Cap}|_] = Data,
  358: 	    ok = disksup:set_almost_full_threshold((Cap-1)/100),
  359: 	    Cap-1;
  360: 	_N -> Threshold0
  361:     end,
  362:     ok = application:set_env(os_mon, disk_almost_full_threshold, Threshold/100),
  363:     disksup ! timeout, % force a disk check
  364:     Data2 = disksup:get_disk_data(),
  365:     Over = over_threshold(Data2, Threshold),
  366:     Alarms = get_alarms(),
  367:     if
  368: 	Over==0 ->
  369: 	    ?t:fail({threshold_too_low, Data2, Threshold});
  370: 	Over==length(Alarms) ->
  371: 	    ok;
  372: 	true ->
  373: 	    dump_info(),
  374: 	    ?t:fail({bad_alarms, Threshold, Data2, Alarms})
  375:     end,
  376: 
  377:     %% Kill disksup
  378:     exit(whereis(disksup), faked_disksup_crash),
  379: 
  380:     %% Wait a little to make sure disksup has been restarted,
  381:     %% then make sure the alarms are set once, but not twice
  382:     ?t:sleep(?t:seconds(1)),
  383:     Data3   = disksup:get_disk_data(),
  384:     Alarms2 = get_alarms(),
  385:     if
  386: 	length(Alarms2)==length(Alarms) -> ok;
  387: 	true ->
  388: 	    dump_info(),
  389: 	    ?t:fail({bad_alarms,Threshold,Data3,Alarms,Alarms2})
  390:     end,
  391: 
  392:     %% Stop OS_Mon and make sure all disksup alarms are cleared
  393:     ok = application:stop(os_mon),
  394:     ?t:sleep(?t:seconds(1)),
  395:     Alarms3 = get_alarms(),
  396:     case get_alarms() of
  397: 	[] -> ok;
  398: 	_  -> ?t:fail({alarms_not_cleared, Alarms3})
  399:     end,
  400: 
  401:     %% Reset threshold and restart OS_Mon
  402:     ok = application:set_env(os_mon, disksup_almost_full_threshold, 0.8),
  403:     ok = disksup:set_almost_full_threshold(0.8),
  404:     ok = application:start(os_mon),
  405:     ok.
  406: 
  407: dump_info() ->
  408:     io:format("Status: ~p~n", [sys:get_status(disksup)]).
  409: 
  410: % filter get_disk_data and remove entriew with zero capacity
  411: % "non-normal" filesystems report zero capacity
  412: % - Perhaps errorneous 'df -k -l'?
  413: % - Always list filesystems by type '-t ufs,zfs,..' instead?
  414: % It is unclear what the intention was from the beginning.
  415: get_disk_data() ->
  416:     get_disk_data(disksup:get_disk_data()).
  417: 
  418: get_disk_data([{"none",0,0}=E]) -> [E];
  419: get_disk_data([{_,_,0}|Es]) -> get_disk_data(Es);
  420: get_disk_data([E|Es]) -> [E|get_disk_data(Es)];
  421: get_disk_data([]) -> [].