%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2011. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%
-module(disksup_SUITE).
-include_lib("test_server/include/test_server.hrl").

%% Test server specific exports
-export([all/0, suite/0,groups/0,init_per_group/2,end_per_group/2]).
-export([init_per_suite/1, end_per_suite/1]).
-export([init_per_testcase/2, end_per_testcase/2]).

%% Test cases
-export([api/1, config/1, alarm/1]).
-export([port/1]).
-export([terminate/1, unavailable/1, restart/1]).
-export([otp_5910/1]).

%% Default timetrap timeout (set in init_per_testcase)
-define(default_timeout, ?t:minutes(1)).

%% Suite set-up: the os_mon application must start cleanly before any
%% test case runs. Config is passed through untouched.
init_per_suite(Config) when is_list(Config) ->
    ok = application:start(os_mon),
    Config.

%% Suite tear-down: stop the os_mon application again.
end_per_suite(Config) when is_list(Config) ->
    ok = application:stop(os_mon),
    Config.

%% Per-test-case set-up. The 'unavailable' case first calls terminate/1;
%% every case (including 'unavailable') then gets a timetrap whose
%% handle is stored in Config under the key 'watchdog'.
init_per_testcase(unavailable, Config) ->
    terminate(Config),
    start_timetrap(Config);
init_per_testcase(_Case, Config) ->
    start_timetrap(Config).

%% Per-test-case tear-down. The 'unavailable' case first calls restart/1;
%% every case then cancels the timetrap started in init_per_testcase/2.
end_per_testcase(unavailable, Config) ->
    restart(Config),
    cancel_timetrap(Config);
end_per_testcase(_Case, Config) ->
    cancel_timetrap(Config).

%% Start the default timetrap and prepend its handle to Config.
start_timetrap(Config) ->
    Watchdog = ?t:timetrap(?default_timeout),
    [{watchdog, Watchdog} | Config].

%% Cancel the timetrap whose handle is stored under 'watchdog' in Config.
cancel_timetrap(Config) ->
    ?t:timetrap_cancel(?config(watchdog, Config)),
    ok.
%% Common Test suite options.
suite() ->
    [{ct_hooks, [ts_install_cth]}].

%% The set of test cases to run depends on the host OS type.
all() ->
    cases_for_os(test_server:os_type()).

%% Map an OS type tuple to the list of applicable test cases.
cases_for_os({unix, sunos}) ->
    [api, config, alarm, port, unavailable, otp_5910];
cases_for_os({unix, _OSname}) ->
    [api, alarm, otp_5910];
cases_for_os({win32, _OSname}) ->
    [api, alarm, otp_5910];
cases_for_os(_OS) ->
    [unavailable].

%% No test case groups are defined.
groups() ->
    [].

init_per_group(_GroupName, Config) ->
    Config.

end_per_group(_GroupName, Config) ->
    Config.


api(suite) -> [];
api(doc) -> ["Test of API functions"];
api(Config) when is_list(Config) ->

    %% get_disk_data(): the first entry (after the local zero-capacity
    %% filter) must have a printable id and positive integer size/usage
    [{DiskId, TotalKByte, UsedPercent} | _] = get_disk_data(),
    true = io_lib:printable_list(DiskId),
    true = is_integer(TotalKByte),
    true = is_integer(UsedPercent),
    true = UsedPercent > 0,
    true = TotalKByte > 0,

    %% get_check_interval(): expected initial value
    1800000 = disksup:get_check_interval(),

    %% set_check_interval(Minutes): a valid value is applied, an
    %% invalid one raises badarg and leaves the setting unchanged
    ok = disksup:set_check_interval(20),
    1200000 = disksup:get_check_interval(),
    {'EXIT', {badarg, _}} = (catch disksup:set_check_interval(0.5)),
    1200000 = disksup:get_check_interval(),
    ok = disksup:set_check_interval(30),

    %% get_almost_full_threshold(): expected initial value
    80 = disksup:get_almost_full_threshold(),

    %% set_almost_full_threshold(Float): a valid value is applied, an
    %% invalid one raises badarg and leaves the setting unchanged
    ok = disksup:set_almost_full_threshold(0.90),
    90 = disksup:get_almost_full_threshold(),
    {'EXIT', {badarg, _}} =
        (catch disksup:set_almost_full_threshold(-0.5)),
    90 = disksup:get_almost_full_threshold(),
    ok = disksup:set_almost_full_threshold(0.80),

    ok.
config(suite) -> [];
config(doc) -> ["Test configuration"];
config(Config) when is_list(Config) ->

    %% New, valid configuration parameters must be picked up when
    %% disksup is restarted
    ok = application:set_env(os_mon, disk_space_check_interval, 29),
    ok = application:set_env(os_mon, disk_almost_full_threshold, 0.81),
    restart_disksup_child(),

    1740000 = disksup:get_check_interval(),
    81 = disksup:get_almost_full_threshold(),

    %% Bad parameter values should be ignored: after a restart the
    %% values 1800000 / 80 are expected instead
    ok = application:set_env(os_mon, disk_space_check_interval, 0.5),
    ok = application:set_env(os_mon, disk_almost_full_threshold, -0.81),
    restart_disksup_child(),

    1800000 = disksup:get_check_interval(),
    80 = disksup:get_almost_full_threshold(),

    %% Reset configuration parameters
    ok = application:set_env(os_mon, disk_space_check_interval, 30),
    ok = application:set_env(os_mon, disk_almost_full_threshold, 0.80),
    ok.

%% Stop disksup under os_mon_sup and start it again, asserting that
%% both supervisor operations succeed.
restart_disksup_child() ->
    ok = supervisor:terminate_child(os_mon_sup, disksup),
    {ok, _Child} = supervisor:restart_child(os_mon_sup, disksup),
    ok.
%%----------------------------------------------------------------------
%% NOTE: This test case is somewhat fragile. It fails if the disk usage
%% changes too much while it is running, or if the alarm_handler
%% receives the alarms too late (timing problems).
%%----------------------------------------------------------------------
alarm(suite) -> [];
alarm(doc) -> ["Test that alarms are set and cleared"];
alarm(Config) when is_list(Config) ->

    %% Step 1: count the disks at or above the current threshold and
    %% check that exactly that many alarms are set
    Threshold1 = disksup:get_almost_full_threshold(), % 80
    Data1 = disksup:get_disk_data(),
    Over1 = over_threshold(Data1, Threshold1),
    Alarms1 = get_alarms(),
    case length(Alarms1) of
        Over1 ->
            true;
        _ ->
            dump_info(),
            ?t:fail({bad_alarms, Threshold1, Data1, Alarms1})
    end,

    %% Step 2: look for a disk whose usage is below Threshold1, lower
    %% the threshold to just under that usage and check that additional
    %% alarms appear
    IsBelowThreshold =
        fun({_Id, _Kbyte, Capacity}) ->
                Capacity > 0 andalso Capacity < Threshold1
        end,
    case until(IsBelowThreshold, Data1) of
        {_, _, Cap1} ->
            Threshold2 = Cap1 - 1,
            ok = disksup:set_almost_full_threshold(Threshold2 / 100),
            disksup ! timeout, % force a disk check
            Data2 = disksup:get_disk_data(),
            Over2 = over_threshold(Data2, Threshold2),
            Alarms2 = get_alarms(),
            case Over2 == length(Alarms2) andalso Over2 > Over1 of
                true ->
                    true;
                false ->
                    dump_info(),
                    ?t:fail({bad_alarms, Threshold2, Data2, Alarms2})
            end;
        false ->
            ignore
    end,

    %% Step 3: find the highest usage among all disks; if it is below
    %% 100, raise the threshold above it and check that every alarm
    %% has been cleared
    KeepHighest =
        fun({_Id, _Kbyte, Capacity}, MaxAcc) when Capacity > MaxAcc ->
                Capacity;
           ({_Id, _Kbyte, _Capacity}, MaxAcc) ->
                MaxAcc
        end,
    case lists:foldl(KeepHighest, 0, Data1) of
        Max when Max < 100 ->
            Threshold3 = Max + 1,
            ok = disksup:set_almost_full_threshold(Threshold3 / 100),
            disksup ! timeout, % force a disk check
            Data3 = disksup:get_disk_data(),
            Over3 = over_threshold(Data3, Threshold3),
            Alarms3 = get_alarms(),
            case {Over3, Alarms3} of
                {0, []} ->
                    ok;
                _ ->
                    dump_info(),
                    ?t:fail({bad_alarms, Threshold3, Data3, Alarms3})
            end;
        100 ->
            ignore
    end,

    %% Reset threshold
    ok = disksup:set_almost_full_threshold(Threshold1 / 100),
    ok.

%% Count the disks in Data whose usage is at or above Threshold.
%% The data is sorted on disk id and de-duplicated first, so that a
%% disk reported more than once is only counted once.
over_threshold(Data, Threshold) ->
    UniqueDisks = remove_duplicated_disks(lists:keysort(1, Data)),
    length([Cap || {_Id, _Kbyte, Cap} <- UniqueDisks, Cap >= Threshold]).
%% Some platforms (MontaVista, for example) report the same disk more
%% than once:
%%   Linux ppb 2.4.20_mvl31-pcore680 #1 Sun Feb 1 23:12:56 PST 2004 ppc unknown
%%
%%   MontaVista(R) Linux(R) Professional Edition 3.1
%%
%%   [ppb:~]> /bin/df -lk
%%   Filesystem           1k-blocks      Used Available Use% Mounted on
%%   rootfs                 8066141   3023763   4961717  38% /
%%   /dev/root              8066141   3023763   4961717  38% /
%%   tmpfs                   192892         0    192892   0% /dev/shm
%%
%% disksup:
%%   [{"/",8066141,38}, {"/",8066141,38}, {"/dev/shm",192892,0}]
%%
%% disksup only sets ONE alarm for "/". The disk data must therefore be
%% sorted and have duplicated disk tuples removed before the expected
%% number of alarms is calculated; otherwise the test case would fail
%% erroneously.
%%
%% Given a list sorted on disk id, drop every tuple that is directly
%% followed by another tuple with the same id (the last one is kept).
remove_duplicated_disks([{Id, _, _} | [{Id, _, _} | _] = Rest]) ->
    remove_duplicated_disks(Rest);
remove_duplicated_disks([Disk | Rest]) ->
    [Disk | remove_duplicated_disks(Rest)];
remove_duplicated_disks([]) ->
    [].

%% Return the currently set alarms whose id is {disk_almost_full, Disk}.
get_alarms() ->
    [Alarm || {{disk_almost_full, _Disk}, _} = Alarm
                  <- alarm_handler:get_alarms()].

%% Return the first element of the list for which Fun returns true,
%% or false if there is no such element.
until(_Fun, []) ->
    false;
until(Fun, [Item | Rest]) ->
    case Fun(Item) of
        true -> Item;
        false -> until(Fun, Rest)
    end.
port(suite) -> [];
port(doc) ->
    ["Test that disksup handles a terminating port program"];
port(Config) when is_list(Config) ->
    %% Locate the OS process of the port program; the second
    %% whitespace-separated field of the ps output is used as its pid
    PsOutput = os:cmd("ps -ef | grep '[d]isksup'"),
    case io_lib:fread("~s ~s", PsOutput) of
        {ok, [_Uid, OsPid], _Rest} ->
            kill_port_program(OsPid, PsOutput);
        _ ->
            {skip, {os_pid_not_found, PsOutput}}
    end.

%% Kill the port program with the given OS pid and verify that disksup
%% terminates with reason {port_died, _} and is usable again afterwards.
%% Skips the test case if the kill command printed any output.
kill_port_program(OsPid, PsOutput) ->

    %% Monitor disksup and make sure it delivers data before the kill
    MonRef = erlang:monitor(process, disksup),
    assert_first_disk_has_size(),

    %% Kill the port program
    case os:cmd("kill -9 " ++ OsPid) of
        [] ->

            %% disksup should now terminate
            receive
                {'DOWN', MonRef, _, _, {port_died, _Reason}} ->
                    ok;
                {'DOWN', MonRef, _, _, Reason} ->
                    ?t:fail({unexpected_exit_reason, Reason})
            after
                3000 ->
                    ?t:fail({still_alive, PsOutput})
            end,

            %% Give os_mon_sup time to restart disksup
            ?t:sleep(?t:seconds(3)),
            assert_first_disk_has_size(),

            ok;

        KillOutput ->
            erlang:demonitor(MonRef),
            {skip, {not_killed, KillOutput}}
    end.

%% Assert that the first disk reported by disksup has a positive size.
assert_first_disk_has_size() ->
    [{_Disk, Kbyte, _Cap} | _] = disksup:get_disk_data(),
    true = Kbyte > 0.

%% Disable disksup in the os_mon configuration and stop the running
%% disksup child of os_mon_sup.
terminate(suite) -> [];
terminate(Config) when is_list(Config) ->
    ok = application:set_env(os_mon, start_disksup, false),
    ok = supervisor:terminate_child(os_mon_sup, disksup),
    ok.

unavailable(suite) -> [];
unavailable(doc) ->
    ["Test correct behaviour when service is unavailable"];
unavailable(Config) when is_list(Config) ->

    %% Every API function is expected to answer with its dummy value
    [{"none",0,0}] = disksup:get_disk_data(),
    1800000 = disksup:get_check_interval(),
    ok = disksup:set_check_interval(5),
    80 = disksup:get_almost_full_threshold(),
    ok = disksup:set_almost_full_threshold(0.9),
    ok.
%% Re-enable disksup in the os_mon configuration and restart the
%% disksup child of os_mon_sup.
restart(suite) ->
    [];
restart(Config) when is_list(Config) ->
    ok = application:set_env(os_mon, start_disksup, true),
    {ok, _Pid} = supervisor:restart_child(os_mon_sup, disksup),
    ok.

otp_5910(suite) -> [];
otp_5910(doc) ->
    ["Test that alarms are cleared if disksup crashes or "
     "if OS_Mon is stopped"];
otp_5910(Config) when is_list(Config) ->

    %% Make sure disksup sets at least one alarm: if no disk is at or
    %% above the current threshold, lower the threshold to just under
    %% the usage of the first reported disk
    Data = disksup:get_disk_data(),
    Threshold0 = disksup:get_almost_full_threshold(),
    Threshold = case over_threshold(Data, Threshold0) of
                    0 ->
                        [{_Id,_Kbyte,Cap}|_] = Data,
                        ok = disksup:set_almost_full_threshold((Cap-1)/100),
                        Cap-1;
                    _N -> Threshold0
                end,

    %% Store the threshold in the application environment as well, so
    %% that it is still in effect after disksup has been restarted
    ok = application:set_env(os_mon, disk_almost_full_threshold, Threshold/100),
    disksup ! timeout, % force a disk check
    Data2 = disksup:get_disk_data(),
    Over = over_threshold(Data2, Threshold),
    Alarms = get_alarms(),
    if
        Over==0 ->
            ?t:fail({threshold_too_low, Data2, Threshold});
        Over==length(Alarms) ->
            ok;
        true ->
            dump_info(),
            ?t:fail({bad_alarms, Threshold, Data2, Alarms})
    end,

    %% Kill disksup
    exit(whereis(disksup), faked_disksup_crash),

    %% Wait a little to make sure disksup has been restarted,
    %% then make sure the alarms are set once, but not twice
    ?t:sleep(?t:seconds(1)),
    Data3 = disksup:get_disk_data(),
    Alarms2 = get_alarms(),
    if
        length(Alarms2)==length(Alarms) -> ok;
        true ->
            dump_info(),
            ?t:fail({bad_alarms,Threshold,Data3,Alarms,Alarms2})
    end,

    %% Stop OS_Mon and make sure all disksup alarms are cleared.
    %% The alarms are fetched once, so that the list reported on
    %% failure is the very list that was checked.
    ok = application:stop(os_mon),
    ?t:sleep(?t:seconds(1)),
    Alarms3 = get_alarms(),
    case Alarms3 of
        [] -> ok;
        _ -> ?t:fail({alarms_not_cleared, Alarms3})
    end,

    %% Reset threshold and restart OS_Mon. The key must be the same
    %% one that was set above (disk_almost_full_threshold); otherwise
    %% the lowered threshold would survive this test case.
    ok = application:set_env(os_mon, disk_almost_full_threshold, 0.8),
    ok = disksup:set_almost_full_threshold(0.8),
    ok = application:start(os_mon),
    ok.

%% Print the sys status of the disksup process, as an aid when
%% analysing an alarm mismatch.
dump_info() ->
    io:format("Status: ~p~n", [sys:get_status(disksup)]).

% Filter get_disk_data and remove entries with zero capacity.
% "Non-normal" filesystems report zero capacity.
% - Perhaps erroneous 'df -k -l'?
% - Always list filesystems by type '-t ufs,zfs,..' instead?
% It is unclear what the intention was from the beginning.
get_disk_data() ->
    get_disk_data(disksup:get_disk_data()).

% A list consisting of (or ending with) the single entry {"none",0,0}
% keeps that entry; any other entry with zero capacity is dropped.
get_disk_data([{"none",0,0}=E]) -> [E];
get_disk_data([{_,_,0}|Es]) -> get_disk_data(Es);
get_disk_data([E|Es]) -> [E|get_disk_data(Es)];
get_disk_data([]) -> [].