forked from digoal/PostgreSQL_HA_with_primary_standby_2vip
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathINSTALL.txt
582 lines (496 loc) · 20.1 KB
/
INSTALL.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
已测试环境 :
OS : CentOS 6.x x64
DB : PostgreSQL 9.3.x
Server : HP DL360
HOST1 : 192.168.111.37
HOST2 : 192.168.111.42
VIPM : 192.168.111.130
VIPS : 192.168.111.131
HOST1 FENCE DEVICE : fence_ilo
HOST2 FENCE DEVICE : fence_ilo
GateWay IP : 192.168.111.1
配置, 测试fence设备
FENCE DEVICE配置 (HOST1, HOST2) :
IP : 192.168.111.37 ilo : 192.168.112.56 USER : digoal PWD : digoal_pwd
IP : 192.168.111.42 ilo : 192.168.112.51 USER : digoal PWD : digoal_pwd
开启ipmi功能, 用户赋予ipmi可开关机的角色(OPERATOR 或 ADMINISTRATOR, 本例使用的是OPERATOR).
OS配置(HOST1, HOST2) :
关闭acpi服务, 避免fence慢或者fence不成功的可能.
chkconfig acpid off
增加yum源, 安装需要的包. (HOST1, HOST2)
视OS版本环境配置, 可能与以下不一致
/etc/yum.repos.d/rhel-sky.repo
[Cluster]
name=Cluster Directory
baseurl=http://192.168.164.38/rhel-server-5.4-i386/Cluster
enabled=1
gpgcheck=0
[ClusterStorage]
name=ClusterStorage Directory
baseurl=http://192.168.164.38/rhel-server-5.4-i386/ClusterStorage
enabled=1
gpgcheck=0
[Server]
name=Server Directory
baseurl=http://192.168.164.38/rhel-server-5.4-i386/Server
enabled=1
gpgcheck=0
[VT]
name=VT Directory
baseurl=http://192.168.164.38/rhel-server-5.4-i386/VT
enabled=1
gpgcheck=0
安装需要的包.(HOST1, HOST2)
yum -y install rsync coreutils glib2 lrzsz sysstat e4fsprogs xfsprogs ntp readline-devel zlib zlib-devel openssl openssl-devel pam-devel libxml2-devel libxslt-devel python-devel tcl-devel gcc make smartmontools flex bison perl perl-devel perl-ExtUtils* OpenIPMI-tools openldap openldap-devel cman logrotate
系统配置
vi /etc/sysctl.conf (HOST1, HOST2)
# add by digoal.zhou
kernel.shmmax=135497418752
kernel.shmmni = 4096
kernel.sem = 50100 64128000 50100 1280
fs.file-max = 7672460
fs.aio-max-nr = 1048576
net.ipv4.ip_local_port_range = 9000 65000
net.core.rmem_default = 262144
net.core.rmem_max = 4194304
net.core.wmem_default = 262144
net.core.wmem_max = 4194304
net.ipv4.tcp_max_syn_backlog = 4096
net.core.netdev_max_backlog = 10000
net.ipv4.netfilter.ip_conntrack_max = 655360
net.ipv4.tcp_timestamps = 0
net.ipv4.tcp_tw_recycle=1
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_keepalive_time = 72
net.ipv4.tcp_keepalive_probes = 9
net.ipv4.tcp_keepalive_intvl = 7
vm.zone_reclaim_mode=0
vm.dirty_background_bytes = 102400000
vm.dirty_ratio = 80
vm.dirty_expire_centisecs = 6000
vm.dirty_writeback_centisecs = 50
vm.swappiness=0
vm.overcommit_memory = 0
vm.overcommit_ratio = 90
# sysctl -p
vi /etc/security/limits.conf (HOST1, HOST2)
# add by digoal.zhou
* soft nofile 131072
* hard nofile 131072
* soft nproc 131072
* hard nproc 131072
* soft core unlimited
* hard core unlimited
* soft memlock 500000000
* hard memlock 500000000
如果是centos 6.x的话还需要修改 /etc/security/limits.d/90-nproc.conf
vi /etc/security/limits.d/90-nproc.conf
#* soft nproc 1024
#root soft nproc unlimited
* soft nproc 131072
* hard nproc 131072
同步时钟 (HOST1, HOST2)
crontab -e
8 * * * * /usr/sbin/ntpdate asia.pool.ntp.org && /sbin/hwclock --systohc
配置DNS : (HOST1, HOST2)
vi /etc/resolv.conf
nameserver xxx.xxx.xxx.xxx
配置网络, 新增vipm和vips的接口配置 : (HOST1, HOST2)
注意子接口使用ONBOOT=no, ONPARENT=no
eth0:1对应VIPM
eth0:2对应VIPS
cd /etc/sysconfig/network-scripts/
cp ifcfg-eth0 ifcfg-eth0:1
cp ifcfg-eth0 ifcfg-eth0:2
vi ifcfg-eth0:1
cat /etc/sysconfig/network-scripts/ifcfg-eth0:1
DEVICE=eth0:1
ONBOOT=no
ONPARENT=no
BOOTPROTO=static
HWADDR=D4:BE:D9:AD:9A:B6
IPADDR=192.168.111.130
NETMASK=255.255.255.0
cat /etc/sysconfig/network-scripts/ifcfg-eth0:2
DEVICE=eth0:2
ONBOOT=no
ONPARENT=no
BOOTPROTO=static
HWADDR=D4:BE:D9:AD:9A:B6
IPADDR=192.168.111.131
NETMASK=255.255.255.0
按需配置iptables (HOST1, HOST2)
允许node1,node2 相互访问PostgreSQL监听端口,
vi /etc/sysconfig/iptables
# 私有网段
-A INPUT -s 192.168.0.0/16 -j ACCEPT
-A INPUT -s 10.0.0.0/8 -j ACCEPT
-A INPUT -s 172.16.0.0/16 -j ACCEPT
编译port_probe : (HOST1, HOST2):
gcc -O3 -Wall -Wextra -Werror -g -o port_probe ./port_probe.c
chmod 555 port_probe
mv port_probe /usr/local/bin
测试port_probe是否正常:
port_probe $node_ip $port
新增postgres用户 (HOST1, HOST2)
useradd postgres
配置sudo命令 : (HOST1, HOST2)
visudo -f /etc/sudoers
# 注释 requiretty
# Defaults requiretty
# 末尾添加
# add by digoal
postgres ALL=(ALL) NOPASSWD: /sbin/ifup
postgres ALL=(ALL) NOPASSWD: /sbin/ifdown
postgres ALL=(ALL) NOPASSWD: /sbin/arping
postgres ALL=(ALL) NOPASSWD: /bin/mount
postgres ALL=(ALL) NOPASSWD: /bin/umount
配置postgres用户的ssh无密钥认证 (HOST1, HOST2)
# vi /etc/ssh/sshd_config
PubkeyAuthentication yes
# service sshd restart
# su - postgres
$ ssh-keygen -t rsa
不要输入passphrase
$ cd ~/.ssh
$ 将 id_rsa.pub 内容拷贝到对方主机的postgres用户下的 ~/.ssh/authorized_keys
$ chmod 600 ~/.ssh/authorized_keys
# 验证无密码配置是否正确
ssh 192.168.111.42 date
Sun Jan 4 15:54:26 CST 2015
ssh 192.168.111.37 date
Sun Jan 4 15:54:26 CST 2015
# PostgreSQL 安装 : (HOST1, HOST2)
vi /home/postgres/.bash_profile
# add by digoal
export PS1="$USER@`/bin/hostname -s`-> "
export PGPORT=1921
export PGDATA=/opt/pg_root
export LANG=en_US.utf8
export PGHOME=/opt/pgsql
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGHOST=$PGDATA
export PGDATABASE=postgres
alias rm='rm -i'
alias ll='ls -lh'
unalias vi
# wget https://ftp.postgresql.org/pub/source/v9.3.5/postgresql-9.3.5.tar.bz2
# tar -jxvf postgresql-9.3.5.tar.bz2
# cd postgresql-9.3.5
# ./configure --prefix=/opt/pgsql9.3.5 --with-pgport=1921 --with-perl --with-python --with-tcl --with-openssl --with-pam --with-ldap --with-libxml --with-libxslt --enable-thread-safety
# gmake world
# gmake install-world
# ln -s /opt/pgsql9.3.5 /opt/pgsql
初始化数据库 : (HOST1, HOST2)
创建数据目录, 归档目录, 挂载对端归档目录的目录.
mkdir /opt/pg_root
mkdir /opt/arch
mkdir /opt/peer_arch
chown postgres:postgres /opt/pg_root
chown postgres:postgres /opt/arch
chown postgres:postgres /opt/peer_arch
注意:
在$PGDATA中用到的软链接必须一致, 例如 以下是不行的, 会导致数据库rsync后异常.
主机A /opt/pg_root/pg_xlog -> /pg_xlog
主机B /opt/pg_root/pg_xlog -> /data01/pg_xlog
主机A /opt/pg_root/pg_tblspc/12345 -> /pg_tbs1
主机B /opt/pg_root/pg_tblspc/12345 -> /data01/pg_tbs1
配置 NFS : (HOST1, HOST2)
# vi /etc/exports
/opt/arch 192.168.111.37/32(ro,no_root_squash,sync)
/opt/arch 192.168.111.42/32(ro,no_root_squash,sync)
# service nfs start
# chkconfig nfs on
加载peer目录 (HOST1, HOST2)
on HOST1:
mount -t nfs -o tcp 192.168.111.42:/opt/arch /opt/peer_arch
on HOST2:
mount -t nfs -o tcp 192.168.111.37:/opt/arch /opt/peer_arch
添加到/etc/rc.local
HOST1
vi /etc/rc.local
/bin/mount -t nfs -o tcp 192.168.111.42:/opt/arch /opt/peer_arch
HOST2
/bin/mount -t nfs -o tcp 192.168.111.37:/opt/arch /opt/peer_arch
初始化数据库
su - postgres
initdb -D $PGDATA -E UTF8 --locale=C -U postgres -W
配置流复制 : (HOST1)
配置 流复制HBA, 心跳HBA :
务必同时配置虚拟IP的连接心跳, 除非固定出口IP.
cd $PGDATA
vi pg_hba.conf
host replication replica 192.168.111.37/32 md5
host replication replica 192.168.111.42/32 md5
host replication replica 192.168.111.130/32 md5
host replication replica 192.168.111.131/32 md5
host sky_pg_cluster sky_pg_cluster 192.168.111.37/32 md5
host sky_pg_cluster sky_pg_cluster 192.168.111.42/32 md5
host sky_pg_cluster sky_pg_cluster 192.168.111.130/32 md5
host sky_pg_cluster sky_pg_cluster 192.168.111.131/32 md5
host sky_pg_cluster sky_pg_cluster 127.0.0.1/32 md5
# 其他
host all all 0.0.0.0/0 md5
配置归档, hot_standby, sender
必须配置的项 :
listen_addresses = '0.0.0.0' # what IP address(es) to listen on;
port = 1921 # (change requires restart)
tcp_keepalives_idle = 60 # TCP_KEEPIDLE, in seconds;
tcp_keepalives_interval = 10 # TCP_KEEPINTVL, in seconds;
tcp_keepalives_count = 10 # TCP_KEEPCNT;
wal_level = hot_standby
archive_mode = on
archive_command = 'DIR=/opt/arch/`date +%F`; test ! -d $DIR && mkdir -p $DIR; chmod 755 $DIR; test ! -f $DIR/%f && cp %p $DIR/%f; chmod 755 $DIR/%f' # command to use to archive a logfile segment
max_wal_senders = 10 # max number of walsender processes
hot_standby = on # "on" allows queries during recovery
wal_receiver_status_interval = 1s # send replies at least this often
hot_standby_feedback = on # send info from standby to prevent
模板
cd $PGDATA
vi postgresql.conf
listen_addresses = '0.0.0.0' # what IP address(es) to listen on;
port = 1921 # (change requires restart)
max_connections = 1000 # (change requires restart)
superuser_reserved_connections = 13 # (change requires restart)
unix_socket_directories = '.' # comma-separated list of directories
unix_socket_permissions = 0700 # begin with 0 to use octal notation
tcp_keepalives_idle = 60 # TCP_KEEPIDLE, in seconds;
tcp_keepalives_interval = 10 # TCP_KEEPINTVL, in seconds;
tcp_keepalives_count = 10 # TCP_KEEPCNT;
shared_buffers = 2048MB # min 128kB
maintenance_work_mem = 512MB # min 1MB
vacuum_cost_delay = 10 # 0-100 milliseconds
bgwriter_delay = 10ms # 10-10000ms between rounds
wal_level = hot_standby # minimal, archive, or hot_standby
synchronous_commit = off # synchronization level;
wal_writer_delay = 10ms # 1-10000 milliseconds
checkpoint_segments = 128 # in logfile segments, min 1, 16MB each
archive_mode = on # allows archiving to be done
archive_command = 'DIR=/opt/arch/`date +%F`; test ! -d $DIR && mkdir -p $DIR; chmod 755 $DIR; test ! -f $DIR/%f && cp %p $DIR/%f; chmod 755 $DIR/%f' # command to use to archive a logfile segment
max_wal_senders = 10 # max number of walsender processes
hot_standby = on # "on" allows queries during recovery
wal_receiver_status_interval = 1s # send replies at least this often
hot_standby_feedback = on # send info from standby to prevent
effective_cache_size = 8192MB
log_destination = 'csvlog' # Valid values are combinations of
logging_collector = on # Enable capturing of stderr and csvlog
log_directory = 'pg_log' # directory where log files are written,
log_truncate_on_rotation = on # If on, an existing log file with the
log_rotation_age = 1d # Automatic rotation of logfiles will
log_rotation_size = 10MB # Automatic rotation of logfiles will
log_checkpoints = on
log_connections = on
log_disconnections = on
log_error_verbosity = verbose # terse, default, or verbose messages
log_timezone = 'PRC'
datestyle = 'iso, mdy'
timezone = 'PRC'
lc_messages = 'C' # locale for system error message
lc_monetary = 'C' # locale for monetary formatting
lc_numeric = 'C' # locale for number formatting
lc_time = 'C' # locale for time formatting
default_text_search_config = 'pg_catalog.english'
配置恢复文件, primary_conninfo中使用vipm连接 :
$ cp /opt/pgsql/share/recovery.conf.sample $PGDATA/recovery.done
$ chmod 700 $PGDATA/recovery.done
$ vi $PGDATA/recovery.done
restore_command = 'PEER_DIR=/opt/peer_arch; cp $PEER_DIR/*/%f %p' # e.g. 'cp /mnt/server/archivedir/%f %p'
recovery_target_timeline = 'latest'
standby_mode = on
primary_conninfo = 'host=192.168.111.130 port=1921 user=replica keepalives_idle=60' # e.g. 'host=localhost port=5432'
配置密码文件 : (HOST1, HOST2)
流复制1条, 连接到VIPM
心跳5条, 连接到VIPM, VIPS, LOCAL, PEER_IP
# su - postgres
$ vi ~/.pgpass
192.168.111.130:1921:replication:replica:REPLICA321
192.168.111.130:1921:sky_pg_cluster:sky_pg_cluster:SKY_PG_cluster_321
192.168.111.131:1921:sky_pg_cluster:sky_pg_cluster:SKY_PG_cluster_321
192.168.111.37:1921:sky_pg_cluster:sky_pg_cluster:SKY_PG_cluster_321
192.168.111.42:1921:sky_pg_cluster:sky_pg_cluster:SKY_PG_cluster_321
127.0.0.1:1921:sky_pg_cluster:sky_pg_cluster:SKY_PG_cluster_321
$ chmod 400 .pgpass
启动数据库, 添加replication数据库角色 : (HOST1)
pg_ctl start
psql postgres postgres
create extension pg_stat_statements;
create role replica nosuperuser nocreatedb nocreaterole noinherit replication connection limit 32 login encrypted password 'REPLICA321';
启动VIPM接口 : (HOST1)
sudo /sbin/ifup eth0:1
启动数据库 : (HOST1)
pg_ctl start
配置心跳用户, 数据库, 表, 函数(HOST1) :
将记录时间延迟, 流复制延迟的信息到心跳表 :
-- 建议使用superuser, 原因见http://blog.163.com/digoal@126/blog/static/163877040201331995623214/
create role sky_pg_cluster superuser nocreatedb nocreaterole noinherit login encrypted password 'SKY_PG_cluster_321';
create database sky_pg_cluster with template template0 encoding 'UTF8' owner sky_pg_cluster;
\c sky_pg_cluster sky_pg_cluster
create schema sky_pg_cluster authorization sky_pg_cluster;
create table cluster_status (id int unique default 1, last_alive timestamp(0) without time zone, rep_lag int8);
-- 限制cluster_status表有且只有一行 :
CREATE FUNCTION cannt_delete ()
RETURNS trigger
LANGUAGE plpgsql AS $$
BEGIN
RAISE EXCEPTION 'You can not delete!';
END; $$;
CREATE TRIGGER cannt_delete
BEFORE DELETE ON cluster_status
FOR EACH ROW EXECUTE PROCEDURE cannt_delete();
CREATE TRIGGER cannt_truncate
BEFORE TRUNCATE ON cluster_status
FOR STATEMENT EXECUTE PROCEDURE cannt_delete();
-- 插入初始数据
insert into cluster_status values (1, now(), 9999999999);
-- 创建测试函数, 用于测试数据库是否正常, 包括所有表空间的测试
-- (注意原来的函数使用alter table set tablespace来做测试, 产生了较多的xlog, 同时需要排他锁, 现在改成update).
-- 使用update不同的表空间中的数据, 并不能立刻反应表空间的问题. 因为大多数数据在shared_buffer中.
-- 如果表空间对应的文件系统io有问题, 那么在checkpoint时会产生58类的错误.
-- 使用pg_stat_file函数可以立刻暴露io的问题.
create or replace function cluster_keepalive_test(i_peer_ip inet) returns void as $$
declare
v_spcname text;
v_spcoid oid;
v_nspname name := 'sky_pg_cluster';
v_rep_lag int8;
v_t timestamp without time zone;
begin
if ( pg_is_in_recovery() ) then
raise notice 'this is standby node.';
return;
end if;
select pg_xlog_location_diff(pg_current_xlog_insert_location(),sent_location) into v_rep_lag from pg_stat_replication where client_addr=i_peer_ip;
if found then
-- standby 已启动
update cluster_status set last_alive=now(), rep_lag=v_rep_lag;
else
-- standby 未启动
update cluster_status set last_alive=now();
end if;
-- 临时禁止检测表空间, return
return;
-- 表空间相关心跳检测1分钟一次, 减轻更新压力
FOR v_spcname,v_spcoid IN
select spcname,oid from pg_tablespace where spcname <> 'pg_global'
LOOP
perform 1 from pg_class where
( reltablespace=v_spcoid or reltablespace=0 )
and relname='t_'||v_spcname
and relkind='r'
and relnamespace=(select oid from pg_namespace where nspname=v_nspname)
limit 1;
if not found then
execute 'create table '||v_nspname||'.t_'||v_spcname||' (crt_time timestamp) tablespace '||v_spcname;
execute 'insert into '||v_nspname||'.t_'||v_spcname||' values ('''||now()||''')';
perform pg_stat_file(pg_relation_filepath(v_nspname||'.t_'||v_spcname));
else
execute 'update '||v_nspname||'.t_'||v_spcname||' set crt_time='||''''||now()||''' where now()-crt_time> interval ''1 min'' returning crt_time' into v_t;
if v_t is not null then
perform pg_stat_file(pg_relation_filepath(v_nspname||'.t_'||v_spcname));
end if;
end if;
END LOOP;
end;
$$ language plpgsql strict;
-- 在创建测试函数后, 最好测试一下是否正常, 因为某些版本的系统表可能不通用, 需要调整.
-- 9.2和9.3是没有问题的.
复制主库, 创建peer节点 standby : (HOST2)
su - postgres
pg_basebackup -D $PGDATA -F p -h 192.168.111.130 -p 1921 -U replica
cd $PGDATA
mv recovery.done recovery.conf
启动standby : (HOST2)
su - postgres
pg_ctl start
正常的话, node1上面会多一个sender进程, node2上面会多一个receiver进程 :
postgres: wal sender process replica 192.168.111.130(45020) streaming 0/4047700
postgres: wal receiver process streaming 0/4047700
配置HA脚本, nagios脚本 (HOST1, HOST2) :
1. /usr/local/bin/sky_pg_cluster.sh
2. /usr/local/nagios/libexec/check_sky_pg_cluster_alive.sh
3. /usr/local/nagios/libexec/check_standby_lag.sh
chmod 555 /usr/local/bin/sky_pg_cluster.sh
chmod 555 /usr/local/nagios/libexec/check_sky_pg_cluster_alive.sh
chmod 555 /usr/local/nagios/libexec/check_standby_lag.sh
配置日志清理crontab (HOST1, HOST2) :
# vi /etc/logrotate.d/sky_pg_cluster
/tmp/sky_pg_clusterd.log
{
size 10M
create
start 10
rotate 4
compress
copytruncate
}
# which logrotate
/usr/sbin/logrotate
配置为每小时执行一次rotate, 当日志文件大于10M时压缩rotate
# crontab -e
1 * * * * /usr/sbin/logrotate /etc/logrotate.d/sky_pg_cluster
配置HA脚本自启动
chmod +x /etc/rc.d/rc.local
vi /etc/rc.d/rc.local
su - postgres -c "/usr/local/bin/sky_pg_cluster.sh start"
启动HA(按步骤) : (HOST1, HOST2)
1. HOST1 HOST2 同时执行 :
su - postgres -c "/usr/local/bin/sky_pg_cluster.sh stop"
为了测试方便, 初次启动前, 可以手工先启动VIP和postgresql. 然后再启动以下脚本.
HOST1, primary
ifup eth0:1
pg_ctl start
HOST2, standby
ifup eth0:2
pg_ctl start
实际生产时, 可以不用以上步骤, 只是启动时间可能较长, 因为要执行keepalived
2. HOST1 HOST2 同时执行 :
su - postgres -c "/usr/local/bin/sky_pg_cluster.sh start"
3. 检查sky_pg_cluster.sh是否正常启动
ps -ewf|grep sky_pg_cluster.sh
配置nagios : (HOST1, HOST2)
监控如下项 :
1. 端口的监控在nagios服务端配置
HOST1 IP+PORT
HOST2 IP+PORT
VIPM IP+port
VIPS IP+port
2. 以下脚本的调用配置在 /usr/local/nagios/etc/nrpe.cfg
sky_pg_clusterd是否存活 (/usr/local/nagios/libexec/check_sky_pg_cluster_alive.sh)
standby同步延时监控 (/usr/local/nagios/libexec/check_standby_lag.sh)
vi /usr/local/nagios/etc/nrpe.cfg
command[check_sky_pg_clusterd]=/usr/local/nagios/libexec/check_sky_pg_cluster_alive.sh
command[check_standby_lag]=/usr/local/nagios/libexec/check_standby_lag.sh
service xinetd restart
注意, 这些nagios监控脚本务必使用postgres数据库启动用户调用. (修改nrpe user, group)
vi /etc/xinetd.d/nrpe
# default: on
# description: NRPE (Nagios Remote Plugin Executor)
service nrpe
{
flags = REUSE
socket_type = stream
port = 5666
wait = no
user = postgres
group = postgres
server = /usr/local/nagios/bin/nrpe
server_args = -c /usr/local/nagios/etc/nrpe.cfg --inetd
log_on_failure += USERID
disable = no
only_from = 127.0.0.1 192.168.1.111 其他略
log_type = FILE /dev/null
}
service xinetd restart
测试failover :
1. 模拟网络故障
1.1 单一主机网络故障
1.2 双主机同时发生网络故障
2. 模拟数据库故障
2.1 单主机数据库故障
2.2 双主机同时发生数据库故障
# Author : Digoal zhou
# Email : [email protected]
# Blog : http://blog.163.com/digoal@126/