最近在用Nagios监控Xen PV虚拟机的时候出现了问题,在被监控的服务器上是采用nrpe来采集数据的。但是在进程里无法看到PV虚拟机的进程,虽然可以通过xm top vpsname的方式来获取名为vpsname虚拟机的cpu使用率情况,但是不便于采集数据,通过xm list可以采集到cpu时间,根据CPU时间的差值,可以计算CPU使用率,可是该命令只能root执行,因为该命令可以进行关闭,重启虚拟机等重要操作,所以如果把权限给了nrpe,将可能造成严重的安全问题。
幸好libvirt提供了API,所以我打算尝试用API写一个Nagios的插件来满足我的需求。我的想法是:先后两次获取虚拟机的CPU时间,并分别记录两次取得数据时的系统时间,然后根据两者的差值计算CPU使用率;这种方法在理论上存在一些误差。
1.要使用API,首先需要安装libvirt-devel
[root@test ~]# yum -y install libvirt-devel
2.我的代码如下,文件名为vCpu.c
/**
* Program Name: vCpu.c
* Author: steptodream
* Description:A simple plugin to get vps cpu usage
* for nagios(nrpe) by libvirt api
* Compile:gcc -o vCpu vCpu.c -lvirt
*/
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <libvirt/libvirt.h>
/*
 * Exit statuses this plugin hands back to Nagios. main() returns one of
 * these as the process exit code after comparing the measured CPU usage
 * against the warning and critical thresholds.
 */
#define OK 0
#define WARNING 1
#define CRITICAL 2
#define UNKNOWN 3
/**
 * Return the accumulated CPU time of the domain named vpsName.
 *
 * The value is virDomainInfo.cpuTime as filled in by virDomainGetInfo()
 * (nanoseconds according to the libvirt documentation), converted to double.
 *
 * @param vpsName  name of the domain to look up
 * @param conn     open libvirt connection used for the lookup
 * @return the domain's cpuTime
 *
 * This function does not return on failure: it prints a one-line message
 * and terminates the process with the UNKNOWN status, so that a check which
 * could not be performed is never reported to Nagios as healthy.
 */
double getCpuTime(char *vpsName,virConnectPtr conn) {
    virDomainInfo info;
    virDomainPtr domain = NULL;
    int ret;

    /* Find the domain of the given name */
    domain = virDomainLookupByName(conn, vpsName);
    if (domain == NULL) {
        printf("Failed to find the vps called %s\n", vpsName);
        exit(UNKNOWN);
    }

    /* Get the information of the domain, then release the handle
     * regardless of whether the query succeeded */
    ret = virDomainGetInfo(domain, &info);
    virDomainFree(domain);
    if (ret < 0) {
        printf("Failed to get information for %s\n", vpsName);
        exit(UNKNOWN);
    }

    return info.cpuTime;
}
int main(int argc,char * argv[])
{
char *vpsName; /* vps name */
int interval = 1; /* check interval */
double warning; /* warning value */
double critical; /* critical value */
double cpuUsage; /* cpu usage of the vps */
struct timeval startTime; /* time of the first time to get cpu time */
struct timeval endTime; /* time of the second time to get cpu time */
int realTime; /* real interval between two times */
long long startCpuTime; /* cpu time of the first time */
long long endCpuTime; /* cpu time of the second time */
int cpuTime; /* value of startCpuTime - endCpuTime */
char *output; /* output data for nagios */
int ret; /* exit status for nagios */
virConnectPtr conn; /* connection pointer */
switch (argc){
case 5:
interval = atoi(argv[4]);
case 4:
vpsName = argv[1];
warning = atof(argv[2]);
critical = atof(argv[3]);
break;
default:
printf("Usage:vCpu <vName> <warning> <critical> [interval]\n\n");
return OK;
}
/* connect to local Xen Host */
conn = virConnectOpenReadOnly(NULL);
if (conn == NULL) {
printf("Failed to connect to local Xen Host\n");
return OK;
}
/* get cpu time the first time */
startCpuTime = getCpuTime(vpsName, conn);
/* get start time */
if (gettimeofday(&startTime, NULL) == -1) {
printf("Failed to get start time\n");
return OK;
}
/* wait for some seconds */
sleep(interval);
/* get cpu time the second time */
endCpuTime = getCpuTime(vpsName, conn);
/* get end time */
if (gettimeofday(&endTime, NULL) == -1) {
printf("Failed to get end time\n");
return OK;
}
/* colose connection */
virConnectClose(conn);
/* calculate the usage of cpu */
cpuTime = (startCpuTime - endCpuTime) / 1000;
realTime = 1000000 * (startTime.tv_sec - endTime.tv_sec) +
(startTime.tv_usec - endTime.tv_usec);
cpuUsage = cpuTime / (double)(realTime);
/* display cpuUsage by percentage */
cpuUsage *= 100;
/* make output data and exit status for nagios*/
if (cpuUsage > critical) {
output = "CRITICAL";
ret = CRITICAL;
} else if (cpuUsage > warning){
output = "WARNING";
ret = WARNING;
} else {
output = "OK";
ret = OK;
}
printf("%s CPU:%.2f%|CPU=%.2f",output,cpuUsage,cpuUsage);
return ret;
}
继续阅读 →