Skip to content
Snippets Groups Projects
Commit 8d2c0b23 authored by Niko Ehrenfeuchter's avatar Niko Ehrenfeuchter :keyboard:
Browse files

Implement I/O queue length performance monitoring

Refers to #35
parent 8e33745a
No related branches found
No related tags found
No related merge requests found
......@@ -48,6 +48,7 @@
<Compile Include="Conv.cs" />
<Compile Include="FsUtils.cs" />
<Compile Include="Monitoring\Cpu.cs" />
<Compile Include="Monitoring\PhysicalDisk.cs" />
<Compile Include="NLog\RateLimitWrapper.cs" />
<Compile Include="Serializables\DriveToCheck.cs" />
<Compile Include="Serializables\ServiceConfig.cs" />
......
using System;
using System.Diagnostics;
using System.Linq;
// using System.Threading;
using System.Timers;
using NLog;
using Timer = System.Timers.Timer;
namespace ATxCommon.Monitoring
{
/// <summary>
/// Load monitoring class for physical disks, constantly checking the queue length at the
/// given <see cref="Interval"/> in a separate (timer-based) thread.
///
/// The load (=queue length) is determined using a <see cref="PerformanceCounter"/>, and is
/// compared against a configurable <see cref="Limit"/>. If the load changes from below the
/// limit to above, a <see cref="LoadAboveLimit"/> event will be raised. If the load has been
/// above the limit and is then dropping below, an <see cref="OnLoadBelowLimit"/> event will
/// be raised as soon as a given number of consecutive load measurements (defined via <see
/// cref="Probation"/>) were found to be below the limit.
/// </summary>
public class PhysicalDisk
{
private static readonly Logger Log = LogManager.GetCurrentClassLogger();
/// <summary>
/// The generic event handler delegate for PhysicalDisk events.
/// </summary>
public delegate void EventHandler(object sender, EventArgs e);
/// <summary>
/// Event raised when the PhysicalDisk load exceeds the limit for any measurement.
/// </summary>
public event EventHandler LoadAboveLimit;
/// <summary>
/// Event raised when the PhysicalDisk load is below the configured limit for at least
/// the number of consecutive measurements configured in <see cref="Probation"/> after
/// having exceeded this limit before.
/// </summary>
public event EventHandler LoadBelowLimit;
private readonly Timer _monitoringTimer;
private readonly PerformanceCounter _diskQueueLength;
private readonly float[] _loadReadings = {0F, 0F, 0F, 0F};
private int _interval;
private int _behaving;
private int _probation;
private float _limit;
/// <summary>
/// Current PhysicalDisk Queue Length, averaged of the last four readings.
/// </summary>
/// <returns>The average PhysicalDisk Queue Length from the last four readings.</returns>
public float Load { get; private set; }
/// <summary>
/// Flag representing whether the load is considered to be high or low.
/// </summary>
public bool HighLoad { get; private set; }
/// <summary>
/// How often (in ms) to check the PhysicalDisk Queue Length.
/// </summary>
public int Interval {
get => _interval;
set {
_interval = value;
_monitoringTimer.Interval = value;
Log.Debug("PhysicalDisk Queue Length monitoring interval: {0}", _interval);
}
}
/// <summary>
/// Upper limit of PhysicalDisk load before it is classified as "high".
/// </summary>
public float Limit {
get => _limit;
set {
_limit = value;
Log.Debug("PhysicalDisk monitoring limit: {0:0.000}", _limit);
}
}
/// <summary>
/// Number of cycles where the PhysicalDisk load value has to be below the limit before it is
/// classified as "low" again.
/// </summary>
public int Probation {
get => _probation;
set {
_probation = value;
Log.Debug("PhysicalDisk monitoring probation cycles when violating limit: {0}", _probation);
}
}
/// <summary>
/// Indicating whether the PhysicalDisk load monitoring is active.
/// </summary>
public bool Enabled {
get => _monitoringTimer.Enabled;
set {
Log.Debug("{0} PhysicalDisk monitoring.", value ? "Enabling" : "Disabling");
_monitoringTimer.Enabled = value;
}
}
/// <summary>
/// Create performance counter and initialize it.
/// </summary>
/// <param name="counterName">The counter to use for the monitoring, default is the
/// overall "Avg. Disk Queue Length", other reasonable options are the corresponding read
/// or write queues ("Avg. Disk Read Queue Length" and "Avg. Disk Write Queue Length").
/// </param>
public PhysicalDisk(string counterName = "Avg. Disk Queue Length") {
_interval = 250;
_limit = 0.5F;
_probation = 40;
Log.Info($"Initializing PhysicalDisk performance monitoring for [{counterName}]...");
try {
Log.Debug("PhysicalDisk monitoring initializing PerformanceCounter...");
_diskQueueLength = new PerformanceCounter("PhysicalDisk", counterName, "_Total");
var curLoad = _diskQueueLength.NextValue();
Log.Debug("PhysicalDisk Queue Length initial value: {0:0.000}", curLoad);
/* this initialization doesn't seem to be necessary for PhysicalDisk, so we just
* disable those calls for now:
Thread.Sleep(1000);
curLoad = _diskQueueLength.NextValue();
Log.Debug("PhysicalDisk monitoring current queue length: {0:0.000}", curLoad);
*/
// now initialize the load state:
HighLoad = curLoad > _limit;
_monitoringTimer = new Timer(_interval);
_monitoringTimer.Elapsed += UpdatePhysicalDiskLoad;
}
catch (Exception) {
Log.Error("Initializing PhysicalDisk monitoring failed!");
throw;
}
Log.Debug("Initializing PhysicalDisk monitoring completed.");
}
/// <summary>
/// Check current PhysicalDisk queue length, update the history of readings and trigger
/// the corresponding events if the required criteria are met.
/// </summary>
private void UpdatePhysicalDiskLoad(object sender, ElapsedEventArgs e) {
_monitoringTimer.Enabled = false;
try {
// ConstrainedCopy seems to be the most efficient approach to shift the array:
Array.ConstrainedCopy(_loadReadings, 1, _loadReadings, 0, 3);
_loadReadings[3] = _diskQueueLength.NextValue();
Load = _loadReadings.Average();
if (_loadReadings[3] > _limit) {
if (_behaving > _probation) {
// this means the load was considered as "low" before, so raise an event:
OnLoadAboveLimit();
Log.Trace("PhysicalDisk Queue Length ({0:0.00}) violating limit ({1})!",
_loadReadings[3], _limit);
} else if (_behaving > 0) {
// this means we were still in probation, so no need to trigger again...
Log.Trace("PhysicalDisk: resetting behaving counter (was {0}).", _behaving);
}
_behaving = 0;
} else {
_behaving++;
if (_behaving == _probation) {
Log.Trace("PhysicalDisk Queue Length below limit for {0} cycles, " +
"passing probation!", _probation);
OnLoadBelowLimit();
} else if (_behaving > _probation) {
Log.Trace("PhysicalDisk Queue Length behaving well since {0} cycles.",
_behaving);
} else if (_behaving < 0) {
Log.Info("PhysicalDisk Queue Length: integer wrap around happened, " +
"resetting probation counter (no reason to worry).");
_behaving = _probation + 1;
}
}
}
catch (Exception ex) {
Log.Error("UpdatePhysicalDiskLoad failed: {0}", ex.Message);
}
finally {
_monitoringTimer.Enabled = true;
}
Log.Trace("PhysicalDisk Queue Length: {0:0.000} {1}", _loadReadings[3],
_loadReadings[3] < Limit ? " [" + _behaving + "]" : "");
}
/// <summary>
/// Raise the "LoadAboveLimit" event.
/// </summary>
protected virtual void OnLoadAboveLimit() {
HighLoad = true;
LoadAboveLimit?.Invoke(this, EventArgs.Empty);
}
/// <summary>
/// Raise the "LoadBelowLimit" event.
/// </summary>
protected virtual void OnLoadBelowLimit() {
HighLoad = false;
LoadBelowLimit?.Invoke(this, EventArgs.Empty);
}
}
}
\ No newline at end of file
......@@ -63,6 +63,13 @@ namespace ATxCommon.Serializables
/// </summary>
public int MaxCpuUsage { get; set; }
/// <summary>
/// Maximum length of the disk queue multiplied by 1000 (so a value of "25" here means the
/// queue length is required to be "0.025" or less). Running transfers will be paused if
/// this limit is exceeded.
/// </summary>
public int MaxDiskQueue { get; set; }
/// <summary>
/// Minimum amount of free RAM (in MB) required for the service to operate.
/// </summary>
......@@ -405,6 +412,7 @@ namespace ATxCommon.Serializables
errmsg += CheckMinValue(c.ServiceTimer, nameof(c.ServiceTimer), 1000);
errmsg += CheckMinValue(c.MaxCpuUsage, nameof(c.MaxCpuUsage), 5);
errmsg += CheckMinValue(c.MaxDiskQueue, nameof(c.MaxDiskQueue), 1);
errmsg += CheckMinValue(c.MinAvailableMemory, nameof(c.MinAvailableMemory), 256);
// if any of the required parameter checks failed we terminate now as many of the
......@@ -454,6 +462,7 @@ namespace ATxCommon.Serializables
WarnOnHighValue(c.ServiceTimer, nameof(c.ServiceTimer), 10000);
WarnOnHighValue(c.MaxCpuUsage, nameof(c.MaxCpuUsage), 75);
WarnOnHighValue(c.MaxDiskQueue, nameof(c.MaxDiskQueue), 2000);
WarnOnHighValue(c.MinAvailableMemory, nameof(c.MinAvailableMemory), 8192);
WarnOnHighValue(c.AdminNotificationDelta, nameof(c.AdminNotificationDelta), 1440);
WarnOnHighValue(c.GraceNotificationDelta, nameof(c.GraceNotificationDelta), 10080);
......@@ -493,6 +502,7 @@ namespace ATxCommon.Serializables
$"DestinationDirectory: {DestinationDirectory}\n" +
$"TmpTransferDir: {TmpTransferDir}\n" +
$"MaxCpuUsage: {MaxCpuUsage}%\n" +
$"MaxDiskQueue: {MaxDiskQueue} / 1000 (effectively {MaxDiskQueue/1000})\n" +
$"MinAvailableMemory: {MinAvailableMemory} MB\n" +
"\n" +
"############### OPTIONAL PARAMETERS ###############\n" +
......
......@@ -45,6 +45,8 @@ namespace ATxService
/// </summary>
private readonly Cpu _cpu;
private readonly PhysicalDisk _phyDisk;
private RoboCommand _roboCommand;
/// <summary>
......@@ -62,6 +64,8 @@ namespace ATxService
/// </summary>
private int _waitCyclesBeforeNextTx;
private int _exceedingLoadLimit = 0;
private DateTime _lastUserDirCheck = DateTime.MinValue;
// the transfer state:
......@@ -131,6 +135,21 @@ namespace ATxService
throw;
}
try {
_phyDisk = new PhysicalDisk {
Interval = 250,
Limit = (float) _config.MaxDiskQueue / 1000,
Probation = 16,
Enabled = true
};
_phyDisk.LoadAboveLimit += OnLoadAboveLimit;
_phyDisk.LoadBelowLimit += OnLoadBelowLimit;
}
catch (Exception ex) {
Log.Error("Unexpected error initializing PhysicalDisk monitoring: {0}", ex.Message);
throw;
}
if (_config.DebugRoboSharp) {
Debugger.Instance.DebugMessageEvent += HandleDebugMessage;
......@@ -542,18 +561,25 @@ namespace ATxService
#region general methods
/// <summary>
/// Event handler for CPU load dropping below the configured limit.
/// Event handler for load dropping below the configured limit(s).
/// </summary>
private void OnLoadBelowLimit(object sender, EventArgs e) {
Log.Trace("Received a low-CPU-load event!");
ResumePausedTransfer();
_exceedingLoadLimit--;
if (_exceedingLoadLimit < 0)
_exceedingLoadLimit = 0;
Log.Log(_config.MonitoringLogLevel,
"Received 'low-load' from {0} (exceeding: {1})", sender, _exceedingLoadLimit);
if (_exceedingLoadLimit == 0)
ResumePausedTransfer();
}
/// <summary>
/// Event handler for CPU load exceeding the configured limit.
/// Event handler for load exceeding the configured limits.
/// </summary>
private void OnLoadAboveLimit(object sender, EventArgs e) {
Log.Trace("Received a high-CPU-load event!");
_exceedingLoadLimit++;
Log.Log(_config.MonitoringLogLevel,
"Received 'high-load' from {0} (exceeding: {1})", sender, _exceedingLoadLimit);
PauseTransfer();
}
......@@ -565,6 +591,9 @@ namespace ATxService
// check all system parameters for valid ranges and remember the reason in a string
// if one of them is failing (to report in the log why we're suspended)
if (_phyDisk.HighLoad)
suspendReasons.Add("Disk I/O");
if (_cpu.HighLoad)
suspendReasons.Add("CPU");
......
......@@ -24,6 +24,11 @@
<!-- MaxCpuUsage: pause transfer if CPU usage is above this value (in %)-->
<MaxCpuUsage>25</MaxCpuUsage>
<!-- MaxDiskQueue: maximum allowed length of disk I/O queue, multiplied
by 1000 (so a value of "500" here means the maximum queue length
allowed is "0.500"), otherwise running transfers will be paused. -->
<MaxDiskQueue>500</MaxDiskQueue>
<!-- MinAvailableMemory: pause transfer if free RAM is below (in MB) -->
<MinAvailableMemory>512</MinAvailableMemory>
......
......@@ -24,6 +24,11 @@
<!-- MaxCpuUsage: pause transfer if CPU usage is above this value (in %)-->
<MaxCpuUsage>25</MaxCpuUsage>
<!-- MaxDiskQueue: maximum allowed length of disk I/O queue, multiplied
by 1000 (so a value of "500" here means the maximum queue length
allowed is "0.500"), otherwise running transfers will be paused. -->
<MaxDiskQueue>500</MaxDiskQueue>
<!-- MinAvailableMemory: pause transfer if free RAM is below (in MB) -->
<MinAvailableMemory>512</MinAvailableMemory>
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment