@@ -119,22 +119,30 @@ protected void runInContext() {
119119 }
120120
121121 String result = null ;
122- for (int i = 0 ; i < 5 ; i ++) {
122+ // Try multiple times, sleeping between tries, to make sure the failure isn't a short-lived transient error
123+ for (int i = 1 ; i <= _heartBeatUpdateMaxTries ; i ++) {
123124 Script cmd = new Script (s_heartBeatPath , _heartBeatUpdateTimeout , s_logger );
124125 cmd .add ("-i" , primaryStoragePool ._poolIp );
125126 cmd .add ("-p" , primaryStoragePool ._poolMountSourcePath );
126127 cmd .add ("-m" , primaryStoragePool ._mountDestPath );
127128 cmd .add ("-h" , _hostIP );
128129 result = cmd .execute ();
129130 if (result != null ) {
130- s_logger .warn ("write heartbeat failed: " + result + ", retry: " + i );
131+ s_logger .warn ("write heartbeat failed: " + result + ", try: " + i + " of " + _heartBeatUpdateMaxTries );
132+ try {
133+ Thread .sleep (_heartBeatUpdateRetrySleep );
134+ } catch (InterruptedException e ) {
135+ s_logger .debug ("[ignored] interupted between heartbeat retries." );
136+ }
131137 } else {
132138 break ;
133139 }
134140 }
135141
136142 if (result != null ) {
137- s_logger .warn ("write heartbeat failed: " + result + "; reboot the host" );
143+ // Stop cloudstack-agent if we can't write to the heartbeat file.
144+ // This will raise an alert on the management server.
145+ s_logger .warn ("write heartbeat failed: " + result + "; stopping cloudstack-agent" );
138146 Script cmd = new Script (s_heartBeatPath , _heartBeatUpdateTimeout , s_logger );
139147 cmd .add ("-i" , primaryStoragePool ._poolIp );
140148 cmd .add ("-p" , primaryStoragePool ._poolMountSourcePath );
0 commit comments