/mysql-mha

Primary LanguagePerlOtherNOASSERTION

高可用环境准备

准备两台centos6.9系统的虚拟机
IP地址规划为
192.168.56.200 node1 master
192.168.56.201 node2 slave
192.168.56.111 vip


[mysqld]

####symi replication settings##
plugin_dir=/usr/local/mysql/lib/plugin
plugin_load = "rpl_semi_sync_master=semisync_master.so;rpl_semi_sync_slave=semisync_slave.so"
rpl_semi_sync_master_enabled=1
rpl_semi_sync_master_timeout=10000 #10s
loose_rpl_semi_sync_slave_enabled = 1

关闭防火墙
service iptables stop
chkconfig --del iptables

设置hosts
cat >/etc/hosts <<EOF
127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.56.200 node1
192.168.56.201 node2
EOF

在192.168.56.200上执行
ssh-keygen -t rsa
 
cd ~/.ssh 
cat id_rsa.pub >>authorized_keys
scp ./* root@node2:/root/.ssh/
ssh node2
ssh node1

创建同步账号
grant replication slave on *.* to 'repl'@'192.168.56.%' identified by 'lrepl';
flush privileges;

在node2上
执行change master语句
change master to master_host='192.168.56.200',master_user='rep',master_password='repl',master_auto_position=1;
start slave;
show slave status\G
设置从库只读
mysql -uroot -pletian318 -e "set global read_only=1"
====================================================================================
创建监控用户(在master上执行,也就是mha120 192.168.56.120):
grant all privileges on *.* to 'monitor'@'192.168.56.%' identified  by 'letian318'; 
flush  privileges;


在node1  node2 上执行
配置好base 和epel源

yum localinstall -y  mha4mysql-node-0.57-0.el7.noarch.rpm

在node2上执行
yum localinstall -y mha4mysql-manager-0.57-0.el7.noarch.rpm
mkdir -p  /var/log/masterha/app1

运行
解压在/etc下masterha.zip
bash  check_ssh.sh
[root@node2 masterha]# bash check_ssh.sh 
Mon Nov 27 20:42:33 2017 - [info] Reading default configuration from /etc/masterha/masterha_default.conf..
Mon Nov 27 20:42:33 2017 - [info] Reading application default configuration from /etc/masterha/app1.conf..
Mon Nov 27 20:42:33 2017 - [info] Reading server configuration from /etc/masterha/app1.conf..
Mon Nov 27 20:42:33 2017 - [info] Starting SSH connection tests..
Mon Nov 27 20:42:34 2017 - [debug] 
Mon Nov 27 20:42:33 2017 - [debug]  Connecting via SSH from root@node1(192.168.56.200:22) to root@node2(192.168.56.201:22)..
Mon Nov 27 20:42:33 2017 - [debug]   ok.
Mon Nov 27 20:42:34 2017 - [debug] 
Mon Nov 27 20:42:34 2017 - [debug]  Connecting via SSH from root@node2(192.168.56.201:22) to root@node1(192.168.56.200:22)..
Mon Nov 27 20:42:34 2017 - [debug]   ok.
Mon Nov 27 20:42:34 2017 - [info] All SSH connection tests passed successfully.
[root@node2 masterha]# 

[root@node1 masterha]# bash check_repl.sh 
Mon Nov 27 20:42:50 2017 - [info] Reading default configuration from /etc/masterha/masterha_default.conf..
Mon Nov 27 20:42:50 2017 - [info] Reading application default configuration from /etc/masterha/app1.conf..
Mon Nov 27 20:42:50 2017 - [info] Reading server configuration from /etc/masterha/app1.conf..
Mon Nov 27 20:42:50 2017 - [info] MHA::MasterMonitor version 0.57.
Mon Nov 27 20:42:50 2017 - [info] GTID failover mode = 1
Mon Nov 27 20:42:50 2017 - [info] Dead Servers:
Mon Nov 27 20:42:50 2017 - [info] Alive Servers:
Mon Nov 27 20:42:50 2017 - [info]   node1(192.168.56.200:3306)
Mon Nov 27 20:42:50 2017 - [info]   node2(192.168.56.201:3306)
Mon Nov 27 20:42:50 2017 - [info] Alive Slaves:
Mon Nov 27 20:42:50 2017 - [info]   node2(192.168.56.201:3306)  Version=5.7.20-log (oldest major version between slaves) log-bin:enabled
Mon Nov 27 20:42:50 2017 - [info]     GTID ON
Mon Nov 27 20:42:50 2017 - [info]     Replicating from 192.168.56.200(192.168.56.200:3306)
Mon Nov 27 20:42:50 2017 - [info]     Primary candidate for the new Master (candidate_master is set)
Mon Nov 27 20:42:50 2017 - [info] Current Alive Master: node1(192.168.56.200:3306)
Mon Nov 27 20:42:50 2017 - [info] Checking slave configurations..
Mon Nov 27 20:42:50 2017 - [info]  read_only=1 is not set on slave node2(192.168.56.201:3306).
Mon Nov 27 20:42:50 2017 - [info] Checking replication filtering settings..
Mon Nov 27 20:42:50 2017 - [info]  binlog_do_db= , binlog_ignore_db= 
Mon Nov 27 20:42:50 2017 - [info]  Replication filtering check ok.
Mon Nov 27 20:42:50 2017 - [info] GTID (with auto-pos) is supported. Skipping all SSH and Node package checking.
Mon Nov 27 20:42:50 2017 - [info] Checking SSH publickey authentication settings on the current master..
Mon Nov 27 20:42:50 2017 - [info] HealthCheck: SSH to node1 is reachable.
Mon Nov 27 20:42:50 2017 - [info] 
node1(192.168.56.200:3306) (current master)
 +--node2(192.168.56.201:3306)

Mon Nov 27 20:42:50 2017 - [info] Checking replication health on node2..
Mon Nov 27 20:42:50 2017 - [info]  ok.
Mon Nov 27 20:42:50 2017 - [info] Checking master_ip_failover_script status:
Mon Nov 27 20:42:50 2017 - [info]   /etc/masterha/master_ip_failover --command=status --ssh_user=root --orig_master_host=node1 --orig_master_ip=192.168.56.200 --orig_master_port=3306 


IN SCRIPT TEST====/sbin/ifconfig eth0:1 down==/sbin/ifconfig eth0:1 192.168.56.111/24===

Checking the Status of the script.. OK 
Mon Nov 27 20:42:50 2017 - [info]  OK.
Mon Nov 27 20:42:50 2017 - [warning] shutdown_script is not defined.
Mon Nov 27 20:42:50 2017 - [info] Got exit code 0 (Not master dead).

MySQL Replication Health is OK.
[root@node2 masterha]# 
发现主库挂上了我们定义好的vip 
[root@node1 masterha]# ip add
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN 
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
    inet6 ::1/128 scope host 
       valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000
    link/ether 00:0c:29:6f:b6:53 brd ff:ff:ff:ff:ff:ff
    inet 192.168.56.200/24 brd 192.168.56.255 scope global eth0
    inet 192.168.56.111/24 brd 192.168.56.255 scope global secondary eth0:1
    inet6 fe80::20c:29ff:fe6f:b653/64 scope link 
       valid_lft forever preferred_lft forever
[root@node1 masterha]# 

运行管理监控脚本
bash    start-manager.sh


模拟主库挂掉
在node1上
/etc/init.d/mysqld stop

查看mannager  日志
Mon Nov 27 18:59:15 2017 - [info] Ping(SELECT) succeeded, waiting until MySQL doesn't respond..
Mon Nov 27 19:00:24 2017 - [warning] Got error on MySQL select ping: 2006 (MySQL server has gone away)
Mon Nov 27 19:00:24 2017 - [info] Executing SSH check script: exit 0
Mon Nov 27 19:00:24 2017 - [info] HealthCheck: SSH to node1 is reachable.
Mon Nov 27 19:00:25 2017 - [warning] Got error on MySQL connect: 2013 (Lost connection to MySQL server at 'reading initial communication packet', system error: 111)
Mon Nov 27 19:00:25 2017 - [warning] Connection failed 2 time(s)..
Mon Nov 27 19:00:26 2017 - [warning] Got error on MySQL connect: 2013 (Lost connection to MySQL server at 'reading initial communication packet', system error: 111)
Mon Nov 27 19:00:26 2017 - [warning] Connection failed 3 time(s)..
Mon Nov 27 19:00:27 2017 - [warning] Got error on MySQL connect: 2013 (Lost connection to MySQL server at 'reading initial communication packet', system error: 111)
Mon Nov 27 19:00:27 2017 - [warning] Connection failed 4 time(s)..
Mon Nov 27 19:00:27 2017 - [warning] Master is not reachable from health checker!
Mon Nov 27 19:00:27 2017 - [warning] Master node1(192.168.56.200:3306) is not reachable!
Mon Nov 27 19:00:27 2017 - [warning] SSH is reachable.
Mon Nov 27 19:00:27 2017 - [info] Connecting to a master server failed. Reading configuration file /etc/masterha/masterha_default.conf and /etc/masterha/app1.conf again, and trying to connect to all servers to check server status..
Mon Nov 27 19:00:27 2017 - [info] Reading default configuration from /etc/masterha/masterha_default.conf..
Mon Nov 27 19:00:27 2017 - [info] Reading application default configuration from /etc/masterha/app1.conf..
Mon Nov 27 19:00:27 2017 - [info] Reading server configuration from /etc/masterha/app1.conf..
Mon Nov 27 19:00:27 2017 - [info] GTID failover mode = 1
Mon Nov 27 19:00:27 2017 - [info] Dead Servers:
Mon Nov 27 19:00:27 2017 - [info]   node1(192.168.56.200:3306)
Mon Nov 27 19:00:27 2017 - [info] Alive Servers:
Mon Nov 27 19:00:27 2017 - [info]   node2(192.168.56.201:3306)
Mon Nov 27 19:00:27 2017 - [info] Alive Slaves:
Mon Nov 27 19:00:27 2017 - [info]   node2(192.168.56.201:3306)  Version=5.7.20-log (oldest major version between slaves) log-bin:enabled
Mon Nov 27 19:00:27 2017 - [info]     GTID ON
Mon Nov 27 19:00:27 2017 - [info]     Replicating from 192.168.56.200(192.168.56.200:3306)
Mon Nov 27 19:00:27 2017 - [info]     Primary candidate for the new Master (candidate_master is set)
Mon Nov 27 19:00:27 2017 - [info] Checking slave configurations..
Mon Nov 27 19:00:27 2017 - [info]  read_only=1 is not set on slave node2(192.168.56.201:3306).
Mon Nov 27 19:00:27 2017 - [info] Checking replication filtering settings..
Mon Nov 27 19:00:27 2017 - [info]  Replication filtering check ok.
Mon Nov 27 19:00:27 2017 - [info] Master is down!
Mon Nov 27 19:00:27 2017 - [info] Terminating monitoring script.
Mon Nov 27 19:00:27 2017 - [info] Got exit code 20 (Master dead).
Mon Nov 27 19:00:27 2017 - [info] MHA::MasterFailover version 0.57.
Mon Nov 27 19:00:27 2017 - [info] Starting master failover.
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 1: Configuration Check Phase..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] GTID failover mode = 1
Mon Nov 27 19:00:27 2017 - [info] Dead Servers:
Mon Nov 27 19:00:27 2017 - [info]   node1(192.168.56.200:3306)
Mon Nov 27 19:00:27 2017 - [info] Checking master reachability via MySQL(double check)...
Mon Nov 27 19:00:27 2017 - [info]  ok.
Mon Nov 27 19:00:27 2017 - [info] Alive Servers:
Mon Nov 27 19:00:27 2017 - [info]   node2(192.168.56.201:3306)
Mon Nov 27 19:00:27 2017 - [info] Alive Slaves:
Mon Nov 27 19:00:27 2017 - [info]   node2(192.168.56.201:3306)  Version=5.7.20-log (oldest major version between slaves) log-bin:enabled
Mon Nov 27 19:00:27 2017 - [info]     GTID ON
Mon Nov 27 19:00:27 2017 - [info]     Replicating from 192.168.56.200(192.168.56.200:3306)
Mon Nov 27 19:00:27 2017 - [info]     Primary candidate for the new Master (candidate_master is set)
Mon Nov 27 19:00:27 2017 - [info] Starting GTID based failover.
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] ** Phase 1: Configuration Check Phase completed.
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 2: Dead Master Shutdown Phase..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] Forcing shutdown so that applications never connect to the current master..
Mon Nov 27 19:00:27 2017 - [info] Executing master IP deactivation script:
Mon Nov 27 19:00:27 2017 - [info]   /etc/masterha/master_ip_failover --orig_master_host=node1 --orig_master_ip=192.168.56.200 --orig_master_port=3306 --command=stopssh --ssh_user=root  


IN SCRIPT TEST====/sbin/ifconfig eth0:1 down==/sbin/ifconfig eth0:1 192.168.56.111/24===

Disabling the VIP on old master: node1 
Mon Nov 27 19:00:27 2017 - [info]  done.
Mon Nov 27 19:00:27 2017 - [warning] shutdown_script is not set. Skipping explicit shutting down of the dead master.
Mon Nov 27 19:00:27 2017 - [info] * Phase 2: Dead Master Shutdown Phase completed.
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 3: Master Recovery Phase..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 3.1: Getting Latest Slaves Phase..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] The latest binary log file/position on all slaves is mysql-bin.000009:194
Mon Nov 27 19:00:27 2017 - [info] Latest slaves (Slaves that received relay log files to the latest):
Mon Nov 27 19:00:27 2017 - [info]   node2(192.168.56.201:3306)  Version=5.7.20-log (oldest major version between slaves) log-bin:enabled
Mon Nov 27 19:00:27 2017 - [info]     GTID ON
Mon Nov 27 19:00:27 2017 - [info]     Replicating from 192.168.56.200(192.168.56.200:3306)
Mon Nov 27 19:00:27 2017 - [info]     Primary candidate for the new Master (candidate_master is set)
Mon Nov 27 19:00:27 2017 - [info] The oldest binary log file/position on all slaves is mysql-bin.000009:194
Mon Nov 27 19:00:27 2017 - [info] Oldest slaves:
Mon Nov 27 19:00:27 2017 - [info]   node2(192.168.56.201:3306)  Version=5.7.20-log (oldest major version between slaves) log-bin:enabled
Mon Nov 27 19:00:27 2017 - [info]     GTID ON
Mon Nov 27 19:00:27 2017 - [info]     Replicating from 192.168.56.200(192.168.56.200:3306)
Mon Nov 27 19:00:27 2017 - [info]     Primary candidate for the new Master (candidate_master is set)
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 3.3: Determining New Master Phase..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] Searching new master from slaves..
Mon Nov 27 19:00:27 2017 - [info]  Candidate masters from the configuration file:
Mon Nov 27 19:00:27 2017 - [info]   node2(192.168.56.201:3306)  Version=5.7.20-log (oldest major version between slaves) log-bin:enabled
Mon Nov 27 19:00:27 2017 - [info]     GTID ON
Mon Nov 27 19:00:27 2017 - [info]     Replicating from 192.168.56.200(192.168.56.200:3306)
Mon Nov 27 19:00:27 2017 - [info]     Primary candidate for the new Master (candidate_master is set)
Mon Nov 27 19:00:27 2017 - [info]  Non-candidate masters:
Mon Nov 27 19:00:27 2017 - [info]  Searching from candidate_master slaves which have received the latest relay log events..
Mon Nov 27 19:00:27 2017 - [info] New master is node2(192.168.56.201:3306)
Mon Nov 27 19:00:27 2017 - [info] Starting master failover..
Mon Nov 27 19:00:27 2017 - [info] 
From:
node1(192.168.56.200:3306) (current master)
 +--node2(192.168.56.201:3306)

To:
node2(192.168.56.201:3306) (new master)
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 3.3: New Master Recovery Phase..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info]  Waiting all logs to be applied.. 
Mon Nov 27 19:00:27 2017 - [info]   done.
Mon Nov 27 19:00:27 2017 - [info] Getting new master's binlog name and position..
Mon Nov 27 19:00:27 2017 - [info]  mysql-bin.000003:234
Mon Nov 27 19:00:27 2017 - [info]  All other slaves should start replication from here. Statement should be: CHANGE MASTER TO MASTER_HOST='node2 or 192.168.56.201', MASTER_PORT=3306, MASTER_AUTO_POSITION=1, MASTER_USER='repl', MASTER_PASSWORD='xxx';
Mon Nov 27 19:00:27 2017 - [info] Master Recovery succeeded. File:Pos:Exec_Gtid_Set: mysql-bin.000003, 234, c9a69d41-d3ba-11e7-94ff-000c296fb653:1-9,
f33d56b8-d3ba-11e7-8643-000c292e5673:1-7
Mon Nov 27 19:00:27 2017 - [info] Executing master IP activate script:
Mon Nov 27 19:00:27 2017 - [info]   /etc/masterha/master_ip_failover --command=start --ssh_user=root --orig_master_host=node1 --orig_master_ip=192.168.56.200 --orig_master_port=3306 --new_master_host=node2 --new_master_ip=192.168.56.201 --new_master_port=3306 --new_master_user='monitor'   --new_master_password=xxx
Unknown option: new_master_user
Unknown option: new_master_password


IN SCRIPT TEST====/sbin/ifconfig eth0:1 down==/sbin/ifconfig eth0:1 192.168.56.111/24===

Enabling the VIP - 192.168.56.111/24 on the new master - node2 
Mon Nov 27 19:00:27 2017 - [info]  OK.
Mon Nov 27 19:00:27 2017 - [info] ** Finished master recovery successfully.
Mon Nov 27 19:00:27 2017 - [info] * Phase 3: Master Recovery Phase completed.
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 4: Slaves Recovery Phase..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 4.1: Starting Slaves in parallel..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] All new slave servers recovered successfully.
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] * Phase 5: New master cleanup phase..
Mon Nov 27 19:00:27 2017 - [info] 
Mon Nov 27 19:00:27 2017 - [info] Resetting slave info on the new master..
Mon Nov 27 19:00:27 2017 - [info]  node2: Resetting slave info succeeded.
Mon Nov 27 19:00:27 2017 - [info] Master failover to node2(192.168.56.201:3306) completed successfully.
Mon Nov 27 19:00:27 2017 - [info] 

----- Failover Report -----

app1: MySQL Master failover node1(192.168.56.200:3306) to node2(192.168.56.201:3306) succeeded

Master node1(192.168.56.200:3306) is down!

Check MHA Manager logs at node1:/var/log/masterha/app1/app1.log for details.

Started automated(non-interactive) failover.
Invalidated master IP address on node1(192.168.56.200:3306)
Selected node2(192.168.56.201:3306) as a new master.
node2(192.168.56.201:3306): OK: Applying all logs succeeded.
node2(192.168.56.201:3306): OK: Activated master IP address.
node2(192.168.56.201:3306): Resetting slave info succeeded.
Master failover to node2(192.168.56.201:3306) completed successfully.


发现vip 已经挂在了node2节点上,

1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN 
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
    inet6 ::1/128 scope host 
       valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000
    link/ether 00:0c:29:2e:56:73 brd ff:ff:ff:ff:ff:ff
    inet 192.168.56.201/24 brd 192.168.56.255 scope global eth0
    inet 192.168.56.111/24 brd 192.168.56.255 scope global secondary eth0:1
    inet6 fe80::20c:29ff:fe6f:b653/64 scope link 
       valid_lft forever preferred_lft forever

===============================================================
配置文件
===============================================================
[root@node1 masterha]# cat app1.conf 
[server default]
#mha manager工作目录
manager_workdir = /var/log/masterha/app1
manager_log = /var/log/masterha/app1/app1.log
remote_workdir = /var/log/masterha/app1

[server1]
hostname=node1
master_binlog_dir = /data/mysql/mysql3306/logs
candidate_master = 1
check_repl_delay = 0     #用防止master故障时,切换时slave有延迟,卡在那里切不过来。

[server2]
hostname=node2
master_binlog_dir=/data/mysql/mysql3306/logs
candidate_master=1
check_repl_delay=0
==========================================================
[root@node1 masterha]# cat masterha_default.conf 
[server default]
#MySQL的用户和密码
user=monitor
password=letian318

#系统ssh用户
ssh_user=root

#复制用户
repl_user=repl
repl_password= repl


#监控
ping_interval=1
#shutdown_script=""

#切换调用的脚本
master_ip_failover_script= /etc/masterha/master_ip_failover
master_ip_online_change_script= /etc/masterha/master_ip_online_change
===============================================================
[root@node1 masterha]# cat check_ssh.sh 
/usr/bin/masterha_check_ssh --global_conf=/etc/masterha/masterha_default.conf --conf=/etc/masterha/app1.conf


=========================================================================
[root@node1 masterha]# cat check_repl.sh 
/usr/bin/masterha_check_repl --global_conf=/etc/masterha/masterha_default.conf --conf=/etc/masterha/app1.conf
[root@node1 masterha]# 

=============================================================================
[root@node1 masterha]# cat start-manager.sh 
nohup /usr/bin/masterha_manager --global_conf=/etc/masterha/masterha_default.conf --conf=/etc/masterha/app1.conf  > /tmp/mha_manager.log 2>&1 &
[root@node1 masterha]#