Commit ecc659a6 authored by Michael Ott
Browse files

Improve error handling and reconnects

parent 833a4ab8
......@@ -55,21 +55,22 @@ IPMIHost::~IPMIHost() {
}
int IPMIHost::connect() {
if (_ipmiCtx) {
return 0;
}
if (!(_ipmiCtx = ipmi_ctx_create())) {
throw std::runtime_error("ipmi_ctx_create() Error: " + std::string(strerror(errno)));
_errorMsg = "Error creating IPMI context" + std::string(strerror(errno));
return 1;
}
int workaround_flags = 0;
int flags = 0;
if (ipmi_ctx_open_outofband(_ipmiCtx, _hostName.c_str(), _userName.c_str(),
_password.c_str(), _auth, _priv, _sessionTimeout,
_retransmissionTimeout, workaround_flags, flags) < 0) {
std::string errorMsg(ipmi_ctx_errormsg(_ipmiCtx));
if (ipmi_ctx_open_outofband(_ipmiCtx, _hostName.c_str(), _userName.c_str(), _password.c_str(), _auth, _priv, _sessionTimeout, _retransmissionTimeout, workaround_flags, flags) < 0) {
_errorMsg = "Error opening IPMI connection: " + std::string(ipmi_ctx_errormsg(_ipmiCtx));
ipmi_ctx_close(_ipmiCtx);
ipmi_ctx_destroy(_ipmiCtx);
_ipmiCtx = NULL;
throw std::runtime_error("ipmi_ctx_open_outofband() Error: " + errorMsg);
return 2;
}
......@@ -91,97 +92,61 @@ int IPMIHost::disconnect() {
return 1;
}
/**
 * Makes sure an IPMI context is available, connecting if there is none.
 *
 * Does nothing when _ipmiCtx is already set. Otherwise connect() is called;
 * if that raises std::runtime_error, the error count is increased and the
 * exception is propagated to the caller.
 *
 * @throws std::runtime_error (or a type derived from it) as raised by connect().
 */
void IPMIHost::checkConnection() {
    /*
     * Disabled: disconnect when the last read is older than the session
     * timeout.
    if (_ipmiCtx) {
        uint64_t ts = getTimestamp();
        if ((ts - _lastRead) > MS_TO_NS(_sessionTimeout)) {
            LOG(debug) << _hostName << ": Last read was " << NS_TO_MS((ts - _lastRead)) << "ms ago, timeout is " << _sessionTimeout << "ms. Disconnecting.";
            disconnect();
        }
    }
    */
    if (!_ipmiCtx) {
        try {
            connect();
        } catch (const std::runtime_error&) {
            increaseErrorCount();
            // Rethrow the in-flight exception itself. `throw e;` would throw a
            // copy sliced to std::runtime_error, losing any derived type.
            throw;
        }
    }
}
bool IPMIHost::getSdrRecord(uint16_t recordId, std::vector<uint8_t>& record) {
try {
checkConnection();
} catch (const std::runtime_error& e) {
increaseErrorCount();
}
ipmi_sdr_ctx_t sdrCtx = ipmi_sdr_ctx_create();
if (!sdrCtx) {
throw std::runtime_error("Error creating SDR context: " + std::string(strerror(errno)));
return false;
}
std::string errorMsg;
bool success = false;
int retries = RETRIES;
while (retries--) {
if (ipmi_sdr_cache_open(sdrCtx, _ipmiCtx, _cache.c_str()) < 0) {
if ((ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_READ_CACHE_DOES_NOT_EXIST) || (ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_INVALID) || (ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_OUT_OF_DATE)) {
if ((ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_INVALID) || (ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_OUT_OF_DATE)) {
LOG(debug) << _hostName << "Deleting SDR cache " << _cache;
ipmi_sdr_cache_close(sdrCtx);
ipmi_sdr_cache_delete(sdrCtx, _cache.c_str());
}
if (ipmi_sdr_cache_create(sdrCtx, _ipmiCtx, _cache.c_str(), IPMI_SDR_CACHE_CREATE_FLAGS_DEFAULT, NULL, NULL) == 0) {
LOG(debug) << _hostName << ": Created new SDR cache " << _cache;
while (retries-- && !success) {
if (connect() == 0) {
if (ipmi_sdr_cache_open(sdrCtx, _ipmiCtx, _cache.c_str()) < 0) {
if ((ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_READ_CACHE_DOES_NOT_EXIST) || (ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_INVALID) || (ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_OUT_OF_DATE)) {
if ((ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_INVALID) || (ipmi_sdr_ctx_errnum(sdrCtx) == IPMI_SDR_ERR_CACHE_OUT_OF_DATE)) {
LOG(debug) << _hostName << "Deleting SDR cache " << _cache;
ipmi_sdr_cache_close(sdrCtx);
ipmi_sdr_cache_delete(sdrCtx, _cache.c_str());
}
if (ipmi_sdr_cache_create(sdrCtx, _ipmiCtx, _cache.c_str(), IPMI_SDR_CACHE_CREATE_FLAGS_DEFAULT, NULL, NULL) == 0) {
LOG(debug) << _hostName << ": Created new SDR cache " << _cache;
} else {
LOG(debug) << _hostName << ": Error creating new SDR cache " << _cache;
}
} else {
LOG(debug) << _hostName << ": Error creating new SDR cache " << _cache;
}
} else {
if (retries == 0) {
errorMsg = "Error opening SDR cache: " + std::string(ipmi_sdr_ctx_errormsg(sdrCtx));
}
increaseErrorCount();
disconnect();
try {
connect();
} catch (const std::runtime_error& e) {
_errorMsg = "Error opening SDR cache: " + std::string(ipmi_sdr_ctx_errormsg(sdrCtx));
increaseErrorCount();
}
}
} else {
int recordLength = 0;
uint8_t recordBuf[IPMI_SDR_MAX_RECORD_LENGTH];
if (ipmi_sdr_cache_search_record_id(sdrCtx, recordId) < 0) {
increaseErrorCount();
if (retries == 0) {
errorMsg = "Error searching SDR record: " + std::string(ipmi_sdr_ctx_errormsg(sdrCtx));
disconnect();
}
} else {
if ((recordLength = ipmi_sdr_cache_record_read(sdrCtx, recordBuf, IPMI_SDR_MAX_RECORD_LENGTH)) < 0) {
increaseErrorCount();
if (retries == 0) {
errorMsg = "Error reading SDR record: " + std::string(ipmi_sdr_ctx_errormsg(sdrCtx));
}
int recordLength = 0;
uint8_t recordBuf[IPMI_SDR_MAX_RECORD_LENGTH];
if (ipmi_sdr_cache_search_record_id(sdrCtx, recordId) < 0) {
_errorMsg = "Error searching SDR record: " + std::string(ipmi_sdr_ctx_errormsg(sdrCtx));
} else {
_lastRead = getTimestamp();
record.insert(record.end(), &recordBuf[0], &recordBuf[recordLength]);
success = true;
if ((recordLength = ipmi_sdr_cache_record_read(sdrCtx, recordBuf, IPMI_SDR_MAX_RECORD_LENGTH)) < 0) {
_errorMsg = "Error reading SDR record: " + std::string(ipmi_sdr_ctx_errormsg(sdrCtx));
} else {
_lastRead = getTimestamp();
record.insert(record.end(), &recordBuf[0], &recordBuf[recordLength]);
success = true;
}
}
ipmi_sdr_cache_close(sdrCtx);
}
ipmi_sdr_cache_close(sdrCtx);
} else {
increaseErrorCount();
disconnect();
}
}
ipmi_sdr_ctx_destroy(sdrCtx);
if (!success) {
throw std::runtime_error(errorMsg);
throw std::runtime_error(_errorMsg);
}
return success;
}
......@@ -189,42 +154,32 @@ bool IPMIHost::getSdrRecord(uint16_t recordId, std::vector<uint8_t>& record) {
uint64_t IPMIHost::sendRawCmd(const std::vector<uint8_t>& rawCmd,
uint16_t start, uint16_t stop) {
uint8_t buf[256];
int len;
int len = -1;
int i;
if (!IPMI_NET_FN_RQ_VALID(rawCmd[1])) {
throw std::runtime_error("Error sending raw IPMI command: Invalid netfn value");
return 0;
}
try {
checkConnection();
} catch (const std::runtime_error& e) {
increaseErrorCount();
}
bool success = false;
int retries = RETRIES;
while (retries--) {
if ((len = ipmi_cmd_raw(_ipmiCtx, rawCmd[0], rawCmd[1], &rawCmd[2], rawCmd.size() - 2, buf, sizeof(buf))) < 0) {
if (retries == 0) {
throw std::runtime_error("Error sending IPMI raw command: " + std::string(ipmi_ctx_errormsg(_ipmiCtx)));
return 0;
}
increaseErrorCount();
disconnect();
try {
connect();
} catch (const std::runtime_error& e) {
while (retries-- && !success) {
if (connect() == 0) {
if ((len = ipmi_cmd_raw(_ipmiCtx, rawCmd[0], rawCmd[1], &rawCmd[2], rawCmd.size() - 2, buf, sizeof(buf))) < 0) {
_errorMsg = "Error sending IPMI raw command: " + std::string(ipmi_ctx_errormsg(_ipmiCtx));
increaseErrorCount();
disconnect();
} else {
_lastRead = getTimestamp();
success = true;
}
} else {
break;
increaseErrorCount();
disconnect();
}
}
_errorCount = 0;
_lastRead = getTimestamp();
#if 0
std::cout << "IPMIHost::sendRawCmd() received " << len << " bytes: " << std::setw(2) << std::setfill('0') << std::hex;
for (i = 0; i < len; i++) {
......@@ -233,11 +188,20 @@ uint64_t IPMIHost::sendRawCmd(const std::vector<uint8_t>& rawCmd,
std::cout << std::dec << std::endl;
#endif
if ((stop > len) || ((stop - start) >= 8)) {
throw std::runtime_error("Error processing IPMI raw data");
if (!success) {
throw std::runtime_error(_errorMsg);
return 0;
}
if (stop > len) {
std::stringstream ss;
ss << "Error processing IPMI raw data: stop=" << stop << " > len=" << len;
throw std::runtime_error(ss.str());
return 0;
}
_errorCount = 0;
uint64_t val = 0;
for (i = start; i <= stop; i++) {
val |= ((uint64_t) buf[i]) << (stop - i) * 8;
......@@ -251,37 +215,33 @@ double IPMIHost::readSensorRecord(std::vector<uint8_t>& record) {
double *reading = NULL;
uint16_t eventBitmask = 0;
try {
checkConnection();
} catch (const std::runtime_error& e) {
increaseErrorCount();
}
bool success = false;
int retries = RETRIES;
while (retries--) {
if (!_sensorReadCtx) {
_sensorReadCtx = ipmi_sensor_read_ctx_create(_ipmiCtx);
}
if (ipmi_sensor_read(_sensorReadCtx, &record[0], record.size(), 0, &rawReading, &reading, &eventBitmask) < 0) {
if (retries == 0) {
throw std::runtime_error("Error reading IPMI record: " + std::string(ipmi_sensor_read_ctx_errormsg(_sensorReadCtx)));
return 0;
while (retries-- && !success) {
if (connect() == 0) {
if (!_sensorReadCtx) {
_sensorReadCtx = ipmi_sensor_read_ctx_create(_ipmiCtx);
}
increaseErrorCount();
disconnect();
try {
connect();
} catch (const std::runtime_error& e) {
increaseErrorCount();
if (_sensorReadCtx) {
if (ipmi_sensor_read(_sensorReadCtx, &record[0], record.size(), 0, &rawReading, &reading, &eventBitmask) < 0) {
_errorMsg = "Error reading IPMI record: " + std::string(ipmi_sensor_read_ctx_errormsg(_sensorReadCtx));
increaseErrorCount();
disconnect();
} else {
success = true;
}
} else {
_errorMsg = "Error creating sensor context: " + std::string(ipmi_ctx_errormsg(_ipmiCtx));
}
} else {
break;
increaseErrorCount();
disconnect();
}
}
double ret = .0;
if (reading) {
if (success && reading) {
_errorCount = 0;
_lastRead = getTimestamp();
ret = *reading;
......
......@@ -147,6 +147,7 @@ namespace DCDB {
volatile uint64_t _delayNextReadUntil;
uint64_t _lastRead;
std::list<DCDB::IPMISensor*> _sensors;
std::string _errorMsg;
};
} /* namespace DCDB */
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment