#!/bin/bash
# Queue directory names to watch, one per line. The main loop matches each
# entry as a fragment of the file paths under the Postfix spool, so the
# leading and trailing slashes are significant.
QUEUE_DIR_NAES="/incoming/
/active/
/bounce/
/defer/
/deferred/
/corrupt/
/hold/
/trace/"
# Alert recipients, one address per line.
ADMIN="it@zze.com
op@zzme.com"
# Warning threshold: a queue holding more files than this triggers an alert.
MAXNUM=800
# Log file
LOG="/var/log/monitor_queue.log"

# Print the first IPv4 address reported by ifconfig.
# Handles both the legacy "inet addr:1.2.3.4" layout and the newer
# "inet 1.2.3.4" layout; the pattern "inet " (with the space) skips inet6 lines.
get_local_ip()
{
  ifconfig | awk '/inet / { sub(/^addr:/, "", $2); print $2; exit }'
}

# Host IP, embedded in the alert mail body.
LOCAL_IP=$(get_local_ip)
# Print all arguments as one string wrapped in the ANSI sequence 1;31;40
# (bold, red on black), followed by a reset. Backslash escapes inside the
# arguments are interpreted because echo runs with -e.
echo_red()
{
  local color_on="\033[1;31;40m"
  local color_off="\033[0m"
  echo -e "${color_on}$*${color_off}"
}
# Print all arguments as one string wrapped in the ANSI sequence 1;33;40
# (bold, yellow on black), followed by a reset. Backslash escapes inside the
# arguments are interpreted because echo runs with -e.
echo_yellow()
{
  local color_on="\033[1;33;40m"
  local color_off="\033[0m"
  echo -e "${color_on}$*${color_off}"
}
# Mail a queue-overflow warning to every address listed in ADMIN.
# Globals:   ADMIN    (read) - whitespace/newline separated recipient list
#            LOCAL_IP (read) - host address embedded in the message
# Arguments: $1 - queue name
#            $2 - number of files currently counted for that queue
# Outputs:   pipes the message to mail(1), one invocation per recipient
send_mail()
{
  local subject="Mail Queue monitor Warning"
  local content
  local mail_account
  content="[$(date +"%Y-%m-%d %H:%M:%S")] Warning: $LOCAL_IP, The message queue number has more than the warning value quantity, ($1) ===> ($2)."
  # ADMIN is expanded unquoted on purpose: word splitting is what turns the
  # multi-line list into individual addresses. Quoting it here would run the
  # loop exactly once with the whole list as a single word.
  # shellcheck disable=SC2086
  for mail_account in $ADMIN
  do
    printf '%s\n' "$content" | mail -s "$subject" "$mail_account"
  done
}
# Main polling loop: every 60 seconds count the files belonging to each
# watched queue and, for every queue above MAXNUM, log a warning and mail the
# administrators. Runs until the process is killed.
echo "############### start mon queue [$(date +"%Y-%m-%d %H:%M:%S")] ############### " >> "$LOG"
while :
do
  # Walk the spool once per poll and reuse the listing for every queue,
  # instead of repeating the full traversal for each queue name.
  spool_files=$(find /var/spool/postfix/ -type f)
  # QUEUE_DIR_NAES is expanded unquoted on purpose so that each line of the
  # list becomes one loop item.
  # shellcheck disable=SC2086
  for dir_name in $QUEUE_DIR_NAES
  do
    # Count the paths that contain the queue fragment (e.g. "/deferred/").
    # -F matches it literally; an empty listing yields 0.
    COUNT=$(grep -cF -- "$dir_name" <<< "$spool_files")
    if [ "$COUNT" -gt "$MAXNUM" ]
    then
      # NOTE(review): LANG is switched right before alerting, presumably for
      # the benefit of mail(1) - confirm it is still required.
      export LANG="zh_CN"
      # Strip the leading and trailing slash: "/active/" -> "active".
      dir2queue=${dir_name##/}
      queue_name=${dir2queue%%/}
      echo_yellow "[$(date +"%Y-%m-%d %H:%M:%S")] Queue number has exceeded the set warning value:">> "$LOG"
      echo_red "$queue_name queue num $COUNT. " >> "$LOG"
      echo "" >> "$LOG"
      send_mail "$queue_name" "$COUNT"
    fi
  done
  sleep 60
  echo "############### poll mon queue [$(date +"%Y-%m-%d %H:%M:%S")] ############### " >> "$LOG"
done
#!/bin/sh
# Work inside the monitor's scratch directory: the report file is created
# with a relative name, so it lands wherever the script is standing.
# The directory is created when missing, and the script aborts if it cannot
# be entered, rather than writing the report into an unrelated directory.
monitor_dir=/tmp/DB_slave__monitor
if ! mkdir -p "$monitor_dir" || ! cd "$monitor_dir"; then
  echo "cannot enter $monitor_dir" >&2
  exit 1
fi
# Populate the globals used to build and send the report:
#   Province_name, host_name - labels printed in the report header
#   MAILTO, MAIL_SUB         - recipient and subject of the alert mail
#   tmpfile                  - report file name (relative to the current dir)
#   IPADD                    - second field of every ifconfig line containing
#                              "inet", with any "addr:" text removed and empty
#                              results dropped (one address per line)
#   NOWTIME                  - current time as YYYY-MM-DD-HH-MM
INIT()
{
  Province_name="web"
  host_name="server1"
  MAILTO="xxx@xxx.com"
  tmpfile=mysql_mon.tmp
  IPADD=$(ifconfig | awk '/inet/ { gsub(/addr:/, "", $2); if ($2 != "") print $2 }')
  NOWTIME=$(date +%Y-%m-%d-%H-%M)
  MAIL_SUB="slave is down $NOWTIME !"
}
# Start a fresh report in $tmpfile (truncating any previous content) and
# write the header: a blank line, the origin labels, the script name, the
# timestamp, the host addresses and a closing blank line.
# Globals read: tmpfile, Province_name, host_name, NOWTIME, IPADD.
# NOTE(review): the Info line says "Disk Monitor" although this report is
# about the MySQL slave - looks copy-pasted; confirm before changing the text.
mhead()
{
  # One quoted redirection for the whole header: the file is opened once and
  # a report path containing whitespace still works.
  {
    printf '\n'
    printf 'From :%s %s\n' "$Province_name" "$host_name"
    printf 'State :Report\n'
    printf 'ReportBy:%s\n' "$0"
    printf 'DateTime:%s\n' "$NOWTIME"
    printf 'Info :%s %s Disk Monitor\n' "$Province_name" "$host_name"
    printf 'IP :%s\n' "$IPADD"
    printf '\n'
  } > "$tmpfile"
}
# Mail the report stored in $tmpfile to $MAILTO with subject $MAIL_SUB.
# Globals read: MAIL_SUB, MAILTO, tmpfile.
# The words after "--" are handed to mail(1) unchanged; presumably they are
# forwarded to the MTA to set the sender address - confirm for the installed
# mail(1) implementation.
smail()
{
  # MAILTO stays unquoted so that a space-separated list of recipients keeps
  # working; the report path is quoted so whitespace in it cannot break the
  # redirection.
  # shellcheck disable=SC2086
  mail -s "$MAIL_SUB" $MAILTO -- -f mo@xxx.com < "$tmpfile"
}
INIT;

# Read "show slave status\G" text on stdin and print every value of the field
# named by $1 (e.g. Slave_IO_Running) that is not "Yes".
# Prints nothing when every matching row reports "Yes" or the field is absent.
# The field name is compared exactly, so Slave_SQL_Running_State is never
# mistaken for Slave_SQL_Running.
unhealthy_thread_states()
{
  awk -v field="$1:" '$1 == field && $2 != "Yes" { print $2 }'
}

# Query the server once and reuse the text both for the check and for the
# mail body.
slave_status=$(mysql -e "show slave status\G")

# Alert when either replication thread reports anything other than "Yes"
# (for example "No" or "Connecting"). The IO check used to be inverted and
# mailed "slave is down" precisely when the IO thread was running.
# NOTE(review): an empty status (mysql unreachable, or host not configured as
# a slave) still sends nothing, as before - decide whether that should alert.
for thread in Slave_IO_Running Slave_SQL_Running
do
  bad_states=$(printf '%s\n' "$slave_status" | unhealthy_thread_states "$thread")
  if [ -n "$bad_states" ]; then
    mhead;
    printf '%s\n' "$slave_status" >> "$tmpfile"
    smail;
    # One mail per run is enough; stop after the first failing thread.
    exit 0;
  fi
done
# Work inside the monitor's scratch directory: the report file is created
# with a relative name, so it lands wherever the script is standing.
# The directory is created when missing, and the script aborts if it cannot
# be entered, rather than writing the report into an unrelated directory.
monitor_dir=/tmp/DB_slave__monitor
if ! mkdir -p "$monitor_dir" || ! cd "$monitor_dir"; then
  echo "cannot enter $monitor_dir" >&2
  exit 1
fi
# Populate the globals used to build and send the report:
#   Province_name, host_name - labels printed in the report header
#   MAILTO, MAIL_SUB         - recipient and subject of the alert mail
#   tmpfile                  - report file name (relative to the current dir)
#   IPADD                    - second field of every ifconfig line containing
#                              "inet", with any "addr:" text removed and empty
#                              results dropped (one address per line)
#   NOWTIME                  - current time as YYYY-MM-DD-HH-MM
INIT()
{
  Province_name="web"
  host_name="server1"
  MAILTO="xxx@xxx.com"
  tmpfile=mysql_mon.tmp
  IPADD=$(ifconfig | awk '/inet/ { gsub(/addr:/, "", $2); if ($2 != "") print $2 }')
  NOWTIME=$(date +%Y-%m-%d-%H-%M)
  MAIL_SUB="slave is down $NOWTIME !"
}
# Start a fresh report in $tmpfile (truncating any previous content) and
# write the header: a blank line, the origin labels, the script name, the
# timestamp, the host addresses and a closing blank line.
# Globals read: tmpfile, Province_name, host_name, NOWTIME, IPADD.
# NOTE(review): the Info line says "Disk Monitor" although this report is
# about the MySQL slave - looks copy-pasted; confirm before changing the text.
mhead()
{
  # One quoted redirection for the whole header: the file is opened once and
  # a report path containing whitespace still works.
  {
    printf '\n'
    printf 'From :%s %s\n' "$Province_name" "$host_name"
    printf 'State :Report\n'
    printf 'ReportBy:%s\n' "$0"
    printf 'DateTime:%s\n' "$NOWTIME"
    printf 'Info :%s %s Disk Monitor\n' "$Province_name" "$host_name"
    printf 'IP :%s\n' "$IPADD"
    printf '\n'
  } > "$tmpfile"
}
# Mail the report stored in $tmpfile to $MAILTO with subject $MAIL_SUB.
# Globals read: MAIL_SUB, MAILTO, tmpfile.
# The words after "--" are handed to mail(1) unchanged; presumably they are
# forwarded to the MTA to set the sender address - confirm for the installed
# mail(1) implementation.
smail()
{
  # MAILTO stays unquoted so that a space-separated list of recipients keeps
  # working; the report path is quoted so whitespace in it cannot break the
  # redirection.
  # shellcheck disable=SC2086
  mail -s "$MAIL_SUB" $MAILTO -- -f mo@xxx.com < "$tmpfile"
}
INIT;

# Read "show slave status\G" text on stdin and print every value of the field
# named by $1 (e.g. Slave_IO_Running) that is not "Yes".
# Prints nothing when every matching row reports "Yes" or the field is absent.
# The field name is compared exactly, so Slave_SQL_Running_State is never
# mistaken for Slave_SQL_Running.
unhealthy_thread_states()
{
  awk -v field="$1:" '$1 == field && $2 != "Yes" { print $2 }'
}

# Query the server once and reuse the text both for the check and for the
# mail body.
slave_status=$(mysql -e "show slave status\G")

# Alert when either replication thread reports anything other than "Yes"
# (for example "No" or "Connecting"). The IO check used to be inverted and
# mailed "slave is down" precisely when the IO thread was running.
# NOTE(review): an empty status (mysql unreachable, or host not configured as
# a slave) still sends nothing, as before - decide whether that should alert.
for thread in Slave_IO_Running Slave_SQL_Running
do
  bad_states=$(printf '%s\n' "$slave_status" | unhealthy_thread_states "$thread")
  if [ -n "$bad_states" ]; then
    mhead;
    printf '%s\n' "$slave_status" >> "$tmpfile"
    smail;
    # One mail per run is enough; stop after the first failing thread.
    exit 0;
  fi
done
1) 检查当前僵尸进程信息
# ps -ef | grep defunct | grep -v grep | wc -l
175
# top | head -2
top - 15:05:54 up 97 days, 23:49, 4 users, load average: 0.66, 0.45, 0.39
Tasks: 829 total, 1 running, 479 sleeping, 174 stopped, 175 zombie
# ps -ef | grep defunct | grep -v grep
2) 获得杀僵尸进程语句
# ps -ef | grep defunct | grep -v grep | awk '{print "kill -9 " $2,$3}'
执行上面获得的语句即可 (每条语句同时向僵尸进程 PID 及其父进程 PPID 发送信号), 使用信号9 (SIGKILL), 僵尸进程数会大大减少.
3) 过一会儿检查当前僵尸进程信息
# ps -ef | grep defunct | grep -v grep | wc -l
125
# top | head -2
top - 15:29:26 up 98 days, 12 min, 7 users, load average: 0.27, 0.54, 0.56
Tasks: 632 total, 1 running, 381 sleeping, 125 stopped, 125 zombie
发现僵尸进程数减少了一些, 但还有不少啊.
4) 再次获得杀僵尸进程语句
# ps -ef | grep defunct | grep -v grep | awk '{print "kill -18 " $3}'
执行上面获得的语句即可, 这次向其父进程发送信号18, 僵尸进程应该会全部消失.
5) 过一会儿再检查当前僵尸进程信息
# ps -ef | grep defunct | grep -v grep | wc -l
0
# top | head -2
top - 15:39:46 up 98 days, 23 min, 7 users, load average: 5.46, 2.20, 1.12
Tasks: 134 total, 1 running, 133 sleeping, 0 stopped, 0 zombie
6) 清除ZOMBIE(僵尸)进程原理
# kill -18 PPID
PPID是其父进程, 这个信号是告诉父进程, 该子进程已经死亡了, 请收回分配给他的资源. (注: 信号编号因平台而异: 在 x86 Linux 上 SIGCHLD 是 17, 而 18 是 SIGCONT; 用 kill -s CHLD PPID 按名称发送更可靠.) 如果还不行, 则先看其父进程有无其他子进程, 如果有, 可能需要先kill其他子进程, 也就是兄弟进程.
方法是:
# kill -15 PID1 PID2
PID1,PID2是僵尸进程的父进程的其它子进程.
然后再kill父进程:
# kill -15 PPID
# ps -ef | grep defunct | grep -v grep | wc -l
175
# top | head -2
top - 15:05:54 up 97 days, 23:49, 4 users, load average: 0.66, 0.45, 0.39
Tasks: 829 total, 1 running, 479 sleeping, 174 stopped, 175 zombie
# ps -ef | grep defunct | grep -v grep
2) 获得杀僵尸进程语句
# ps -ef | grep defunct | grep -v grep | awk '{print "kill -9 " $2,$3}'
执行上面获得的语句即可 (每条语句同时向僵尸进程 PID 及其父进程 PPID 发送信号), 使用信号9 (SIGKILL), 僵尸进程数会大大减少.
3) 过一会儿检查当前僵尸进程信息
# ps -ef | grep defunct | grep -v grep | wc -l
125
# top | head -2
top - 15:29:26 up 98 days, 12 min, 7 users, load average: 0.27, 0.54, 0.56
Tasks: 632 total, 1 running, 381 sleeping, 125 stopped, 125 zombie
发现僵尸进程数减少了一些, 但还有不少啊.
4) 再次获得杀僵尸进程语句
# ps -ef | grep defunct | grep -v grep | awk '{print "kill -18 " $3}'
执行上面获得的语句即可, 这次向其父进程发送信号18, 僵尸进程应该会全部消失.
5) 过一会儿再检查当前僵尸进程信息
# ps -ef | grep defunct | grep -v grep | wc -l
0
# top | head -2
top - 15:39:46 up 98 days, 23 min, 7 users, load average: 5.46, 2.20, 1.12
Tasks: 134 total, 1 running, 133 sleeping, 0 stopped, 0 zombie
6) 清除ZOMBIE(僵尸)进程原理
# kill -18 PPID
PPID是其父进程, 这个信号是告诉父进程, 该子进程已经死亡了, 请收回分配给他的资源. (注: 信号编号因平台而异: 在 x86 Linux 上 SIGCHLD 是 17, 而 18 是 SIGCONT; 用 kill -s CHLD PPID 按名称发送更可靠.) 如果还不行, 则先看其父进程有无其他子进程, 如果有, 可能需要先kill其他子进程, 也就是兄弟进程.
方法是:
# kill -15 PID1 PID2
PID1,PID2是僵尸进程的父进程的其它子进程.
然后再kill父进程:
# kill -15 PPID





