当前位置: 首页 > 网络技术 > 网络应用 > 正文

结合Ansible技术监控Storm集群

时间:2016-12-29

1、我的hosts配置

# vim /etc/hosts

192.168.1.100 storm_zk1

192.168.1.101 storm_zk2

192.168.1.102 storm_zk3

192.168.1.103 storm_nimbus

192.168.1.104 storm_supervisor1

192.168.1.105 storm_supervisor2

192.168.1.106 storm_supervisor3

192.168.1.107 storm_supervisor4

192.168.1.108 storm_supervisor5

192.168.1.109 storm_supervisor6

2、我的storm配置

# vim /usr/local/storm/conf/storm.yaml

# DRPC servers: three of the supervisor hosts defined in /etc/hosts.
drpc.servers:

- "storm_supervisor1"

- "storm_supervisor2"

- "storm_supervisor3"

# ZooKeeper hosts (names resolved through /etc/hosts); the port is set
# separately by storm.zookeeper.port further down.
storm.zookeeper.servers:

- "storm_zk1"

- "storm_zk2"

- "storm_zk3"

storm.local.dir: "/data/storm/workdir"

# --- Nimbus ---
# storm_nimbus maps to 192.168.1.103 in /etc/hosts, the same address the
# monitoring script (monitor_status_for_storm.sh) polls.
nimbus.host: "storm_nimbus"

nimbus.thrift.port: 6627

nimbus.thrift.max_buffer_size: 1048576

nimbus.childopts: "-Xmx1024m"

nimbus.task.timeout.secs: 30

nimbus.supervisor.timeout.secs: 60

nimbus.monitor.freq.secs: 10

nimbus.cleanup.inbox.freq.secs: 600

nimbus.inbox.jar.expiration.secs: 3600

nimbus.task.launch.secs: 240

nimbus.reassign: true

nimbus.file.copy.expiration.secs: 600

nimbus.topology.validator: "backtype.storm.nimbus.DefaultTopologyValidator"

# --- Cluster-wide ZooKeeper / mode settings ---
storm.zookeeper.port: 2181

storm.zookeeper.root: "/data/storm/zkinfo"

storm.cluster.mode: "distributed"

storm.local.mode.zmq: false

# --- Storm UI ---
# Must stay in sync with MON_SRV_PORT (8080) in monitor_status_for_storm.sh,
# which port-scans and scrapes this port.
ui.port: 8080

ui.childopts: "-Xmx768m"

# --- Supervisor ---
# Ten worker slot ports (6700-6709) per supervisor host.
supervisor.slots.ports:

- 6700

- 6701

- 6702

- 6703

- 6704

- 6705

- 6706

- 6707

- 6708

- 6709

supervisor.childopts: "-Xmx2048m"

supervisor.worker.start.timeout.secs: 240

supervisor.worker.timeout.secs: 30

supervisor.monitor.frequency.secs: 3

supervisor.heartbeat.frequency.secs: 5

supervisor.enable: true

# NOTE(review): 10 slots x -Xmx4096m lets the workers on one supervisor host
# claim up to ~40 GB of heap in total - confirm the hosts have that much RAM.
worker.childopts: "-Xmx4096m"

topology.max.spout.pending: 5000

# --- ZooKeeper client tuning ---
# NOTE(review): the timeout/interval values below are presumably milliseconds
# (only the last key says so explicitly) - confirm against the Storm docs.
storm.zookeeper.session.timeout: 5000

storm.zookeeper.connection.timeout: 3000

storm.zookeeper.retry.times: 6

storm.zookeeper.retry.interval: 2000

storm.zookeeper.retry.intervalceiling.millis: 30000

# --- Thrift transport and Netty-based messaging ---
storm.thrift.transport: "backtype.storm.security.auth.SimpleTransportPlugin"

storm.messaging.transport: "backtype.storm.messaging.netty.Context"

storm.messaging.netty.server_worker_threads: 50

storm.messaging.netty.client_worker_threads: 50

storm.messaging.netty.buffer_size: 20971520

storm.messaging.netty.max_retries: 100

storm.messaging.netty.max_wait_ms: 1000

storm.messaging.netty.min_wait_ms: 100

3、nimbus节点部署

# vim /data/scripts/monitor_status_for_storm.sh

#!/bin/bash
#
# monitor_status_for_storm.sh - watchdog for a Storm cluster, run on the
# nimbus host. It checks the Storm UI port, restarts nimbus/UI when the UI
# is unreachable or lists no supervisors, and restarts missing supervisors
# remotely through ansible.
#
# The script relies on bash-only syntax ([[ ]], for (( )), the `function`
# keyword), so it must be interpreted by bash rather than a POSIX /bin/sh.

PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

# Pull in the system-wide environment; tolerate hosts without /etc/profile.
if [[ -r /etc/profile ]]; then
  . /etc/profile
fi

## Address and port of the monitoring (Storm UI) page
readonly MON_SRV_IPADDR="192.168.1.103"
readonly MON_SRV_PORT="8080"

## Set to 1 by the port check once the UI port has been seen open
SCAN_FLAG=0

## Base working directory for the list files used by this script
readonly BASE_PATH="/data/scripts"

## Temporary ansible inventory holding the IPs of failed supervisor hosts
readonly FAIL_SUPERVISOR_LIST="${BASE_PATH}/fail_supervisor.txt"

#---------------------------------------------------------------------------------------------------

#######################################
# Restart the Storm nimbus and UI daemons on this host.
# Sends SIGKILL to every process whose `ps aux` line contains both "java"
# and "storm", launches `storm nimbus` and `storm ui` in the background with
# their output discarded, then sleeps 30 seconds (presumably to give the
# daemons time to come up before the caller continues).
# Arguments: none
# Outputs:   none
#######################################
restart_storm_nimbus_server() {
  local storm_pids

  # Take ONE process snapshot and reuse it for both the check and the kill.
  # Running ps twice lets the processes exit in between, which would leave
  # `kill -9` with no PID arguments.
  storm_pids=$(ps aux | grep java | grep storm | awk '{print $2}')
  if [[ -n "${storm_pids}" ]]; then
    # Intentionally unquoted: the snapshot holds one PID per line and each
    # must become its own argument.
    # shellcheck disable=SC2086
    kill -9 ${storm_pids}
  fi

  nohup /usr/local/storm/bin/storm nimbus >/dev/null 2>&1 &
  nohup /usr/local/storm/bin/storm ui >/dev/null 2>&1 &

  sleep 30
}

#---------------------------------------------------------------------------------------------------

## 1. Check that the monitoring page is reachable (covers the case where
##    port 8080 is not open). Probe up to three times, sleeping 10 seconds
##    after each failed probe.
##
## The previous one-liner `[[ ... ]] && SCAN_FLAG=1;break || sleep 10` ended
## its list at the `;`, so `break` ran unconditionally on the first pass and
## the retries never happened. An explicit if/then makes `break` conditional.
for ((attempt = 0; attempt < 3; attempt++)); do
  if /usr/bin/nmap -n -sS -p "${MON_SRV_PORT}" "${MON_SRV_IPADDR}" | grep -q open; then
    SCAN_FLAG=1
    break
  fi
  sleep 10
done

# The port was never reported open: restart nimbus and the UI.
if [[ ${SCAN_FLAG} -ne 1 ]]; then
  restart_storm_nimbus_server
fi

#---------------------------------------------------------------------------------------------------

## 2. Compare the supervisors shown on the monitoring page with the expected
##    list (supervisor_list.txt) to find supervisors that are missing.

# Scrape the host names shown on the UI page (table cells starting with
# "storm_", nimbus excluded) into supervisor_list_from_page.txt, sorted.
fetch_supervisor_list_from_page() {
  curl -s "http://${MON_SRV_IPADDR}:${MON_SRV_PORT}/" \
    | sed 's/<td>/<td>\n/g' \
    | awk -F '<' '/^storm_/{print $1}' \
    | awk '!/nimbus/{print}' \
    | LC_ALL=C sort -u > "${BASE_PATH}/supervisor_list_from_page.txt"
}

fetch_supervisor_list_from_page

## An empty result means the nimbus/UI service is misbehaving: restart it,
## then read the page again. Without the second fetch the stale empty list
## would be compared below and every supervisor would be flagged as failed.
## If the page is still empty afterwards, all supervisors are treated as
## missing (which also covers the case where every supervisor is down).
if ! grep -q . "${BASE_PATH}/supervisor_list_from_page.txt"; then
  restart_storm_nimbus_server
  fetch_supervisor_list_from_page
fi

# Keep only hosts that are expected but absent from the page. A symmetric
# difference (sort | uniq -u) would also flag hosts that appear on the page
# but are missing from supervisor_list.txt. comm needs both inputs sorted
# with the same collation, hence LC_ALL=C everywhere.
sed '/^$/d' "${BASE_PATH}/supervisor_list.txt" \
  | LC_ALL=C sort -u \
  | LC_ALL=C comm -23 - "${BASE_PATH}/supervisor_list_from_page.txt" \
  > "${BASE_PATH}/supervisor_list_for_failed.txt"

# Nothing missing: clean up and stop here.
if ! grep -q . "${BASE_PATH}/supervisor_list_for_failed.txt"; then
  rm -f "${BASE_PATH}/supervisor_list_for_failed.txt"
  exit 0
fi

#---------------------------------------------------------------------------------------------------

## 3. Resolve the failed supervisor host names to IP addresses and write them
##    into an ansible inventory under the [fail_supervisor] group.

# Truncate instead of appending: an inventory left behind by an aborted run
# would otherwise accumulate stale hosts and duplicate group headers.
printf '%s\n' "[fail_supervisor]" > "${FAIL_SUPERVISOR_LIST}"

while read -r supervisor_name _ || [[ -n "${supervisor_name}" ]]; do
  [[ -n "${supervisor_name}" ]] || continue

  # Look the name up in /etc/hosts: drop comments, require an exact match on
  # a host-name field (so "name.domain" or "name-old" do not match), and let
  # the last matching line win.
  supervisor_ip=$(awk -v host="${supervisor_name}" '
    { sub(/#.*/, "") }
    { for (i = 2; i <= NF; i++) if ($i == host) ip = $1 }
    END { print ip }
  ' /etc/hosts)

  # Never write an empty line into the inventory for an unknown host.
  if [[ -z "${supervisor_ip}" ]]; then
    printf 'warn: no /etc/hosts entry for %s, skipping\n' "${supervisor_name}" >&2
    continue
  fi

  printf '%s\n' "${supervisor_ip}" >> "${FAIL_SUPERVISOR_LIST}"
done < "${BASE_PATH}/supervisor_list_for_failed.txt"

#---------------------------------------------------------------------------------------------------

## 4. Remotely restart the storm supervisor service on every host listed in
##    the [fail_supervisor] inventory group, then discard the inventory.
remote_restart_cmd="/data/scripts/restart_storm_service.sh"

/usr/local/bin/ansible \
  -i "${FAIL_SUPERVISOR_LIST}" \
  fail_supervisor \
  -m shell \
  -a "${remote_restart_cmd}"

rm -f "${FAIL_SUPERVISOR_LIST}"

# vim /data/scripts/supervisor_list.txt

storm_supervisor1

storm_supervisor2

storm_supervisor3

storm_supervisor4

storm_supervisor5

storm_supervisor6

# touch /var/run/check_storm.lock

# crontab -e

*/2 * * * * (flock --timeout=0 /var/run/check_storm.lock /data/scripts/monitor_status_for_storm.sh >/dev/null 2>&1)

4、supervisor节点部署

# vim /data/scripts/restart_storm_service.sh

#!/bin/bash
#
# restart_storm_service.sh - force-restart the Storm supervisor on this host.
# Run remotely through the ansible shell module by
# monitor_status_for_storm.sh on the nimbus node.
#
# Uses the bash-only [[ ]] test, so it must run under bash, not POSIX sh.

PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

# Pull in the system-wide environment; tolerate hosts without /etc/profile.
if [[ -r /etc/profile ]]; then
  . /etc/profile
fi

# Take one process snapshot and reuse it for the check and the kill; running
# ps twice could leave `kill -9` without arguments if the processes exit in
# between. Matches every process whose ps line contains "java" and "storm".
storm_pids=$(ps aux | grep java | grep storm | awk '{print $2}')
if [[ -n "${storm_pids}" ]]; then
  # Intentionally unquoted: one PID per word.
  # shellcheck disable=SC2086
  kill -9 ${storm_pids}
fi

# Start the supervisor detached from this shell, discarding its output.
nohup /usr/local/storm/bin/storm supervisor >/dev/null 2>&1 &

本文出自 “人生理想在于坚持不懈” 博客,请务必保留此出处http://sofar.blog.51cto.com/353572/1579897