Nagios是一款用于系统和网络监控的应用程序。它可以在你设定的条件下对主机和服务进行监控,在状态变差和变好的时候给出告警信息。参考中文手册:http://nagios-cn.sourceforge.net/nagios-cn/bookfirst.html
安装nagios-core,官网下载地址 http://pkgs.repoforge.org/rpmforge-release/ ,首先要更新下rpmforge,安装些基础编码依赖包
rpm -ivh http://pkgs.repoforge.org/rpmforge-release/rpmforge-release-0.5.3-1.el7.rf.x86_64.rpm
yum install gd fontconfig-devel libjpeg-devel libpng-devel gd-devel perl-GD \
openssl-devel php mailx postfix cpp gcc gcc-c++ libstdc++ glib2-devel libtoul-ltdl-devel
#add user and group
groupadd -g 6000 nagios
groupadd -g 6001 nagcmd
useradd -u 6000 -g nagios -G nagcmd -d /home/nagios -c "Nagios Admin" nagios
#https://www.nagios.org/download/core/thanks
wget http://prdownloads.sourceforge.net/sourceforge/nagios/nagios-4.0.8.tar.gz
tar xzfv nagios-4.0.8.tar.gz
cd nagios-4.0.8
./configure --prefix=/usr/local/nagios --with-nagios-user=nagios \
--with-nagios-group=nagios --with-command-user=nagios
--with-command-group=nagcmd --enable-event-broker --enable-nanosleep
--enable-embedded-perl --with-perlcache
make all
make install
make install-init
make install-commandmode
make install-webconf
make install-config
服务器选择,nagios需要php解析服务器,文件路劲在/usr/local/nagios/share下,采用apache httpd 加php解析比较简单
#http php
yum install httpd php*
#配置httpd.conf,支持php
vim /etc/httpd/conf/httpd.conf
DirectoryIndex index.html index.html.var
将其修改为:
DirectoryIndex index.html index.php
再在 Apache 配置文件下增加如下内容:
AddType application/x-httpd-php .php
#设置用户访问控制
htpasswd -c /usr/local/nagios/etc/htpasswd.users nagiosadmin
chown nagios:nagcmd /usr/local/nagios/etc/htpasswd.users
usermod -a -G nagios,nagcmd apache
service httpd restart
#Postfix
chkconfig postfix on
service postfix start
采用插件方式可以自定义监控,很容易上手,由于服务端只是调用nrpe,因此make install-plugin可以调用调试即可。
#安装plugin
wget http://nagios-plugins.org/download/nagios-plugins-2.0.3.tar.gz
./configure --with-nagios-user=nagios --with-nagios-group=nagios --with-command-user=nagios --with-command-group=nagcmd --prefix=/usr/local/nagios
make all
make install
chmod 755 /usr/local/nagios
#安装nrpe http://sourceforge.net/projects/nagios/files/nrpe-2.x/
wget http://sourceforge.net/projects/nagios/files/nrpe-2.x/nrpe-2.15/
./configure --with-command-group=nagios --prefix=/usr/local/nagios
make all
make install-plugin
chkconfig httpd on
chkconfig nagios on
service httpd restart
service nagios restart
让nagios支持绘图,需要安装pnp,由于没使用,暂时作为参考。
#安装rrdtool,pnp依赖rrd http://oss.oetiker.ch/rrdtool/pub/?M=D
yum -y install libxml2-devel pango-devel perl-ExtUtils-CBuilder perl-ExtUtils-MakeMaker
wget http://oss.oetiker.ch/rrdtool/pub/rrdtool-1.5.3.tar.gz
./configure --prefix=/usr/local/rrdtool
make all
make install
#安装pnp http://sourceforge.net/projects/pnp4nagios/
yum -y install perl-Time-HiRes
wget http://sourceforge.net/projects/pnp4nagios/files/latest/download
./configure --with-nagios-user=nagios --with-nagios-group=nagios --with-rrdtool=/usr/local/rrdtool/bin/rrdtool --with-perfdata-dir=/usr/local/nagios/share/perfdata
make all
make install
make install-config
make install-init
被监控端安装nagios-plugins和nrpe
#添加使用账户
useradd -M -s /sbin/nulgin nagios
yum -y install openssl-devel
#安装nagios-plugins
./configure
make && make install
#安装nrpe http://sourceforge.net/projects/nagios/files/nrpe-2.x/
wget http://sourceforge.net/projects/nagios/files/nrpe-2.x/nrpe-2.15/
./configure --with-command-group=nagios --prefix=/usr/local/nagios
make all
make install-plugin
make install-daemon
make install-daemon-config
设置服务器地址
vi /usr/local/nagios/etc/nrpe.cfg
找到 allowed_hosts=127.0.0.1,后面加nagios服务器的IP, 用“,”隔开,加了之后如下:
allowed_hosts=127.0.0.1,192.168.100.17
如果服务器使用nat,真实访问外部机器的是网关那台机器的IP地址,因此allowed_hosts应加上网关IP地址。
#启动nrpe
/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
lsof -i:5666
#服务端测试手段
libexec/check_nrpe -H vm14 -c check_load
NRPE v2.15
监控项在客户端添加,常规检测报警参数,w--warning,c--critic,u--unkown,r--recovery。
vim /etc/local/nagios/etc/nrpe.cfg
command[check_swap]=/usr/local/nagios/libexec/check_swap -w 20% -c 10%
command[check_disk_/]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /
command[check_disk_web]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /web
解释上条命令,意思为:检测此磁盘,free<20%,就warning,free<10%,就critic。具体报警要参考对应检测服务选择的模板,以自带generic-service模板为例,截取部分模板内容:
check_period 24x7
max_check_attempts 3
normal_check_interval 10
retry_check_interval 2
contact_groups admins
notification_options w,u,c,r
notification_interval 60
notification_period 24x7
解释为:
使用独立配置
cd /usr/lcoal/nagios/etc
cp -r objects monitor
修改nagios.cfg,使用cfg_dir添加配置目录,注释objects中重复项
cfg_dir=/usr/local/nagios/etc/monitor
邮件发送者设置
tail /etc/mail.rc
set from=nagios@hf.nagios.com
利用nrpe收集数据
在command.cfg中添加使用nrpe的命令
define command{
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}
#鉴于犯过参数写为$HOSTNAME$,主机无法解析主机名,一直bound报错,特此备注警告!
新建hosts.cfg,创建自己的监控节点
define host{
check_command check-host-alive
notification_options d,u,r
max_check_attempts 5
name generichosttemplate
register 0
contact_groups users
}
define host{
host_name vm15
address 192.168.100.15
use linux-server
}
define host{
host_name vm14
address 192.168.100.14
use linux-server
}
define host{
host_name vm16
address 192.168.100.16
use linux-server
}
define hostgroup{
hostgroup_name vm
alias this is vm host group
members vm14,vm15,vm16
}
新建services.cfg,添加自定义监控服务列表
define service{
use generic-service
hostgroup_name vm
service_description swap
check_command check_nrpe!check_swap
}
define service{
use generic-service
hostgroup_name vm
service_description load
check_command check_nrpe!check_load
}
define service{
use generic-service
hostgroup_name vm
service_description users
check_command check_nrpe!check_users
}
define service{
use generic-service
hostgroup_name vm
service_description disk_/
check_command check_nrpe!check_disk_/
}
define service{
use generic-service
hostgroup_name vm
service_description disk_web
check_command check_nrpe!check_disk_web
}
define servicegroup{
servicegroup_name linux_disk_services
alias linux server disk check
members vm15,disk_/,vm15,disk_web
}
服务端测试获取
# /usr/local/nagios/libexec/check_nrpe -H vm15 -c check_disk_/
DISK OK - free space: / 27143 MB (93% inode=96%);| /=1782MB;24378;27425;0;30473
基于clush的批量安装nagios客户端,由于爆了很多warning,就不介绍了。
# ls /root/nagios/
nagios_client.sh nagios-plugins-2.0.3.tar.gz nrpe-2.15.tar.gz nrpe.cfg nrpe_restart
clush -b -w @hadoop -c /root/nagios --dest=/web
clush -b -w @hadoop /web/nagios/nagios_client.sh
#服务端测试
/usr/local/nagios/libexec/check_nrpe -H HSlave1 -c check_disk_/
nagios_client.sh
#!/bin/sh
echo "check nagios user nagios..."
grep nagios /etc/passwd 1>/dev/null 2>/dev/null
if [ $? -ne 0 ];then
echo "Now add user nagios"
useradd -M -s /sbin/nologin nagios
fi
echo "check gcc rpm..."
rpm -q gcc 1>/dev/null 2>/dev/null
if [ $? -ne 0 ];then
echo " Now yum -y install gcc..."
yum -y install gcc
fi
echo "check openssl rpm..."
rpm -q openssl-devel 1>/dev/null 2>/dev/null
if [ $? -ne 0 ];then
echo "Now yum -y install openssl-devel.."
yum -y install openssl-devel
fi
cd /web/nagios
tar zxf nagios-plugins-2.0.3.tar.gz
cd nagios-plugins-2.0.3
./configure
make && make install
cd ..
tar zxf nrpe-2.15.tar.gz
cd nrpe-2.15
./configure --with-command-group=nagios --prefix=/usr/local/nagios
make all
make install-plugin
make install-daemon
make install-daemon-config
cp /web/nagios/nrpe.cfg /usr/local/nagios/etc
mkdir /root/bin
cp /web/nagios/nrpe_restart /root/bin
chmod +x /root/bin/nrpe_restart
sh /root/bin/nrpe_restart
nrpe_restart
#!/bin/sh
pid=`cat /var/run/nrpe.pid `
if [ -d /proc/$pid -a /var/run/nrpe.pid ];then
echo "Now kill nrpe..."
kill -9 $pid
fi
sleep 1
echo "Now start nrpe..."
/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
###插件###
check_memory.pl
检测应用占用的内存,源码地址为:
http://www.thorko.de/git/public/plain/perl/master/v1.0/check_memory.pl
#./check_memory.pl -w 100000 -c 200000 -p httpd
OK: httpd, memory: 92288 KB | memory: 92288 KB
check_mem.sh
检测系统内存使用情况,源码地址为:
https://exchange.nagios.org/components/com_mtree/attachment.php?link_id=4174&cf_id=24
#./check_mem -w 80 -c 90
nscp是windows下nagios监控client的一个合集。其包含check plugins 、check_nt、check_nrpe 、nsca client 、wmi checks 。一般需要的监测项基本都包含在内了。下载地址 http://sourceforge.net/projects/nscplus/files/nscplus/
默认安装路径 C:\Program Files\NSClient++ ,配置文件nsclient.ini
[/modules]
CheckDisk = 1
CheckEventLog = 1
CheckExternalScripts = 1
CheckNSCP = 1
CheckHelpers = 1
CheckSystem = 1
NRPEServer = 1
NSCAClient = 1
NSClientServer = 1
; Undocumented section
[/settings/default]
; ALLOWED HOSTS - A comaseparated list of allowed hosts. You can use netmasks (/ syntax) or * to create ranges.
allowed hosts = 192.168.8.200
[/modules]
启动前设置,如果选择了其他插件,除了check_nt,则需要
services.msc --> NSClient++ 属性--> 登录 ---> 允许服务与桌面交互 --> 确定
检测方式
./check_nt -H 192.168.1.113 -p 12489 -v UPTIME
System Uptime - 22 day(s) 2 hour(s) 8 minute(s) |uptime=31808
./check_nrpe -H 192.168.1.113
I (0,4,1,90 2013-02-04) seem to be doing fine...
客户机检测
nestat -ano|findstr 5666
nestat -ano|findstr 12489