Node Type: OLDNFSServer
Serves home directories and experiment's software directories to
t3wn*
and
t3ui*
servers.
Firewall requirements
Emergency Measures
- check nagios
- If
t3fs05
fails, the t3ui1*
and the t3wn*
servers that mount t3fs05:/swshare
will be affected immediately; if you can't quickly recover t3fs05:/swshare
(e.g. due to a failed motherboard) you'll have to umount /swshare
from those servers and mount t3fs06:/swshare
which is supposed to be an identical copy of t3fs05:/swshare
- On
t3fs06
stop the cron job that sends /shome by rsync
to t3fs05
.
- On
t3dcachedb03
you'll have to change /var/postgresql-backups/pgsql_backups/dcache-db-backup.sh
to send the backups to t3fs06
instead of t3fs05
- Tweak
t3nagios
to forget about t3fs05
Regular Maintenance work
Nagios
check nagios
crontab -l root
#ident "@(#)root 1.21 04/03/23 SMI"
#
# The root crontab should be used to perform accounting data collection.
#
#
10 3 * * * /usr/sbin/logadm
15 3 * * 0 /usr/lib/fs/nfs/nfsfind
30 3 * * * [ -x /usr/lib/gss/gsscred_clean ] && /usr/lib/gss/gsscred_clean
#
# The rtc command is run to adjust the real time clock if and when
# daylight savings time changes.
#
1 2 * * * [ -x /usr/sbin/rtc ] && /usr/sbin/rtc -c > /dev/null 2>&1
#10 3 * * * /usr/lib/krb5/kprop_script ___slave_kdcs___
# Added by cswcrontab for CSWlogwatch
02 4 * * * /opt/csw/bin/logwatch
# cron to replicate fs /swshare vs t3fs06; please note that there is an equivalent cron on t3fs06, it must to be a delay between these 2 crons because they heavily use the network and there is a command 'kill mbuffer' that can crash the other cron
00 06 * * * /root/psit3-tools/regular-snapshot-new -f swshare -v -s t3fs06 -r swshare2/swsharebup 2>&1 | /usr/bin/tee /var/cron/lastsnap.txt 2>&1 ; [[ $? -ne 0 ]] && /usr/bin/mail cms-tier3@lists.psi.ch < /var/cron/lastsnap.txt
43 3 * * * [ -x /opt/csw/bin/gupdatedb ] && /opt/csw/bin/gupdatedb --prunepaths="/shome2 /swshare /dev /devices /proc /tmp /var/tmp" 1>/dev/null 2>&1 # Added by CSWfindutils
#
# for ganglia monitoring of swshare space
59 * * * * /root/gmetric/gmetric_partition_space-cron.sh
#
# 5th June 2013 - F.Martinelli - to get the EMI2 WN tarball CRLs updated
#39 12 * * * /opt/fetch-crl/fetch-crl -c /opt/fetch-crl/fetch-crl.cnf -v 2>&1 | /usr/bin/tee /var/cron/fetch-crl.log 2>&1
# 13th Feb 2014 - F.Martinelli - to get the EMI3 WN tarball CRLs updated
#39 22 * * * /opt/fetch-crl/fetch-crl -c /opt/fetch-crl/fetch-crl.emi3.cnf -v 2>&1 | /usr/bin/tee /var/cron/fetch-crl.emi3.log 2>&1
#
# 09/03/2015 - F.Martinelli
22 02 * * * /opt/zfsnap/zfssnap -v shome2 2>&1 | /usr/bin/tee /var/cron/zfssnap.shome2.log 2>&1 <-- since Dec 2014 the ZFS send/receive mechanism for t3fs06:/shome has stopped working; so we copy with a plain rsync from t3fs06 to t3fs05, and on t3fs05 we regularly take snapshots with zfssnap
There are also cron jobs run by the
cmssgm
user to create backups of external resources (make sure the user has an
NP
entry in
/etc/shadow
and not an
*LK*
entry, otherwise the cron job will not run)
crontab -l cmssgm
https://wiki.chipp.ch/twiki/bin/view/CmsTier3/RemoteRepositoryBackupAdmin
# fetch backups of remote repositories
# by Daniel Meister
37 2 * * * PATH=/opt/csw/sbin:/opt/csw/bin:/usr/sbin:/usr/bin /swshare/rrbackup/bin/run_rrb.py > /dev/null 2>&1
Installation
SGE dir used during a new WN installation
This dir is not strictly required by
t3fs05
itself but it's going to be used during a
t3wn*
installation ( so
very very seldom ! ):
[martinelli_f@t3wn10 execution-host-configuration]$ pwd
/swshare/sge/execution-host-configuration
[martinelli_f@t3wn10 execution-host-configuration]$ find .
.
./default
./default/readme_2011.05.04_16:05:09.html
./default/spool
./default/spool/t3wnexample
./default/spool/t3wnexample/messages
./default/spool/t3wnexample/execd.pid
./default/spool/t3wnexample/active_jobs
./default/spool/t3wnexample/jobs
./default/spool/t3wnexample/job_scripts
./default/common
./default/common/sgeexecd
./default/common/cluster_name
./default/common/sgemaster
./default/common/schedule
./default/common/settings.sh
./default/common/act_qmaster
./default/common/sge_aliases
./default/common/sge_qstat
./default/common/bootstrap
./default/common/accounting
./default/common/install_logs
./default/common/install_logs/qmaster_install_t3ce.psi.ch_2011-05-04_16:04:42.log
./default/common/sgedbwriter
./default/common/settings.csh
./default/common/sge_request
./default/common/qtask
./default/common/schedd_runlog
./etc
./etc/init.d
./etc/init.d/sgeexecd.p6444
Services
Just NFSv3
Backups
zfs list -t snapshot
NAME USED AVAIL REFER MOUNTPOINT
rpool/ROOT/s10x_u8wos_08a@before_perl_CPAN 173M - 5.19G -
rpool/ROOT/s10x_u8wos_08a@09-Apr-2013 71.0M - 5.48G -
rpool/ROOT/s10x_u8wos_08a@05-Jun-2013 46.3M - 5.54G -
rpool/ROOT/s10x_u8wos_08a@24-Jul-2013 50.0M - 5.71G -
rpool/ROOT/s10x_u8wos_08a@28-Nov-2013 30.3M - 5.75G -
rpool/ROOT/s10x_u8wos_08a@21-03-2014 28.3M - 5.78G -
rpool/ROOT/s10x_u8wos_08a@24-Jun-2014 27.9M - 5.88G -
rpool/ROOT/s10x_u8wos_08a@11-Sep-2014 28.0M - 6.61G -
rpool/ROOT/s10x_u8wos_08a@20-01-2015 29.2M - 6.91G -
rpool/ROOT/s10x_u8wos_08a@30-01-2015 28.5M - 6.91G -
rpool/ROOT/s10x_u8wos_08a@03-06-2015 0 - 6.91G - <-- last one
shome2@zfssnap_2015-05-25_02.22.00--10d 1.51G - 5.00T - <-- shome2 is updated by rsync/ssh coming from t3fs06 ; the local root cron job calling the zfssnap tool creates and destroys these zfssnap_YYYY-MM-DD_02.22.00 snapshots
shome2@zfssnap_2015-05-26_02.22.00--10d 1.57G - 5.00T -
shome2@zfssnap_2015-05-27_02.22.00--10d 1.69G - 5.01T -
shome2@zfssnap_2015-05-28_02.22.00--10d 5.29G - 4.97T -
shome2@zfssnap_2015-05-29_02.22.00--10d 6.29G - 4.97T -
shome2@zfssnap_2015-05-30_02.22.00--10d 22.1G - 4.97T -
shome2@zfssnap_2015-05-31_02.22.01--10d 1.44G - 4.96T -
shome2@zfssnap_2015-06-01_02.22.00--10d 1.44G - 4.96T -
shome2@zfssnap_2015-06-02_02.22.00--10d 5.09G - 4.97T -
shome2@zfssnap_2015-06-03_02.22.00--10d 2.29G - 4.98T -
swshare@auto2015-06-01_06:00:00 2.30G - 561G - <-- the original ZFS send/receive mechanism still works for t3fs05:/swshare instead
swshare@auto2015-06-02_06:00:00 1.06M - 561G -
swshare@auto2015-06-03_06:00:00 53.9K - 561G -
Nightly ZFS send/receive toward t3fs06
The
/swshare
snapshots are nightly taken on
t3fs05
by a cron and 'ZFS sent' to
t3fs06
where they are going to be kept for 10 days.
Nagios will check on t3fs05 if a new snapshot has been taken
GitHub daily backups
Please check
RemoteRepositoryBackupAdmin for details.
root@t3fs05 $ crontab -l cmssgm
# fetch backups of remote repositories
# for more details see https://wiki.chipp.ch/twiki/bin/view/CmsTier3/RemoteRepositoryBackupAdmin
37 2 * * * /swshare/rrbackup/bin/run_rrb.py
/swshare/rrbackup/store/
/swshare/rrbackup/store/
/swshare/rrbackup/store/peruzzi_ASAnalysis
/swshare/rrbackup/store/peruzzi_ASAnalysis/2013-07-22.tar.gz
/swshare/rrbackup/store/peruzzi_SCFootprintRemoval
/swshare/rrbackup/store/peruzzi_SCFootprintRemoval/2013-07-22.tar.gz
/swshare/rrbackup/store/dmeister_datareplica
/swshare/rrbackup/store/dmeister_datareplica/2013-07-22.tar.gz
/swshare/rrbackup/store/peruzzi_UserCode
/swshare/rrbackup/store/peruzzi_UserCode/2013-07-22.tar.gz
/swshare/rrbackup/store/peruzzi_PatchesOldReleases
/swshare/rrbackup/store/peruzzi_PatchesOldReleases/2013-07-22.tar.gz
/swshare/rrbackup/store/peruzzi_ASCore
/swshare/rrbackup/store/peruzzi_ASCore/2013-07-22.tar.gz
Postgres backups from t3dcachedb
to t3fs05
A cron job runs twice a day on
t3dcachedb
which takes a full backup of our dCache DBs and copies it into
t3fs05:/swshare/dcache-postgres-backups/t3dcachedb
using
rsync --delete
/swshare/dcache-postgres-backups
/swshare/dcache-postgres-backups
/swshare/dcache-postgres-backups/t3dcachedb03
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-postgres-20150601-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/latest-pg_dumpall-roles
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dumpall_roles-20150528-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-dcache-20150602-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dumpall_roles-20150603-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-dcache-20150529-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-alarms-20150603-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-dcache-20150530-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/latest-pg_dump-alarms
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dumpall_roles-20150531-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-alarms-20150531-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-dcache-20150531-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dumpall_roles-20150530-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-alarms-20150530-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/dcache-db-backup.sh
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dumpall_roles-20150529-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-billing-20150601-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-dcache-20150603-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-alarms-20150529-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dumpall_roles-20150602-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-alarms-20150602-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-dcache-20150528-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-chimera-20150601-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/latest-pg_dump-dcache
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-billing-20150531-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-postgres-20150530-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-chimera-20150531-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-chimera-20150528-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-dcache-20150601-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-postgres-20150602-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-billing-20150603-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-chimera-20150603-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-postgres-20150529-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-chimera-20150529-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-postgres-20150603-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-billing-20150602-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/latest-pg_dump-postgres
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-chimera-20150602-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-alarms-20150601-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dumpall_roles-20150601-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-billing-20150529-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/latest-pg_dump-chimera
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-billing-20150530-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/latest-pg_dump-billing
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-postgres-20150531-0410.gz
/swshare/dcache-postgres-backups/t3dcachedb03/t3dcachedb03-pg_dump-chimera-20150530-0410.gz
/swshare/dcache-postgres-backups/.ssh
/swshare/dcache-postgres-backups/.ssh/id_rsa
/swshare/dcache-postgres-backups/.ssh/id_rsa.pub
/swshare/dcache-postgres-backups/.ssh/authorized_keys
/swshare/dcache-postgres-backups/t3dcachedb04
/swshare/dcache-postgres-backups/.Xauthority
/swshare/dcache-postgres-backups/.bash_history
/swshare/dcache-postgres-backups/t3dcachedb
/swshare/dcache-postgres-backups/t3dcachedb02
/swshare/dcache-postgres-backups/t3dcachedb03-last-PG9.3
/swshare/dcache-postgres-backups/t3dcachedb03-last-PG9.3/pg_dumpall.9.3.psql.9.4
/swshare/dcache-postgres-backups/t3dcachedb03-last-PG9.3/pg_dumpall.9.3
/swshare/dcache-postgres-backups/.ssh/authorized_keys
ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAIEAyMhmLdWCE/ZoEJ6ooj+fPNUl3NrqadlutwGE7Vbc+GCTGKClAB6Cop710s9DFgSUSCr6t0utHtdbfZ51XCtcg5fM1+ual3bZXXQOaQFQ1aOP2dwbPM8ZHk6IGRGvrKbeT4Jxq3MxhvP61oYZhK4iwdcAGMlS627Z+B/pp2XpTIM= SSH keys to be used by postgres@t3dcachedb to rsync the DB backups
/swshare/dcache-postgres-backups/t3dcachedb03/dcache-db-backup.sh
This script runs on t3dcachedb
but it involves t3fs05
during the last step
#!/bin/bash -x
#
# This script is started by cron, so PATH is set explicitly here; the
# PostgreSQL 9.4 bin directory is listed first so that pg_dumpall is found.
PATH='/usr/pgsql-9.4/bin/:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin'

############################################
# CONFIGURATION
# Directory the compressed dumps are written to.
backupdir='/var/lib/pgsql_backups'
# Syslog client used for progress and error messages.
mylogger='/usr/bin/logger'
# Retention in days: dumps older than this are deleted.
# Previous setting, kept for reference: $((30*2)), i.e. 2 months.
remove_older_than_DAYS=5
############################################

# Debug switch: when uncommented, the commands prefixed with $DBG are echoed
# instead of being executed.
#DBG="echo"
usage() {
cat <m%d)
TIME=$(date +M)
EPOCH=$(date -d "$DATE $TIME" +%s)
HOSTNAME=`hostname`
# CLEANING THE HUGE BILLING TABLES #
$DBG $mylogger -p local6.notice "$progname: cleaning the usual huge billing tables."
psql --echo-queries -U postgres -d billing < $DB_BACKUP_FILE
[ $? -ne 0 ] && $DBG $mylogger -p local6.error "$progname: FAILED the backup of $DB in $DB_BACKUP_FILE" && exit 1
LATEST_DUMP="latest-pg_dump-$DB"
[ -f $LATEST_DUMP ] && rm -f $LATEST_DUMP
ln -s $HOSTNAME-pg_dump-$DB-${DATE}-${TIME}.gz $LATEST_DUMP
done
# Dump the cluster roles into a dated, gzip-compressed file and re-point the
# "latest-pg_dumpall-roles" symlink at it.
# Reads:  $backupdir, $HOSTNAME, $DATE, $TIME, $progname, $DBG, $mylogger
# Exits:  1 if either pg_dumpall or gzip fails.
$DBG $mylogger -p local6.notice "$progname: we also backup the DBs roles."
LATEST_DUMP="latest-pg_dumpall-roles"
DB_BACKUP_FILE="$backupdir/$HOSTNAME-pg_dumpall_roles-${DATE}-${TIME}.gz"
pg_dumpall -U postgres --roles-only | gzip > "$DB_BACKUP_FILE"
# $? alone reports only the last pipeline stage (gzip); PIPESTATUS is copied
# immediately so that a failing pg_dumpall is detected as well.
roles_dump_status=("${PIPESTATUS[@]}")
if [ "${roles_dump_status[0]}" -ne 0 ] || [ "${roles_dump_status[1]}" -ne 0 ]; then
  $DBG $mylogger -p local6.error "$progname: FAILED the backup of DBs roles."
  exit 1
fi
# rm -f also clears a dangling symlink, which a "-f" test would miss and which
# would make the following ln -s fail.
# NOTE(review): the link is created relative to the current directory;
# presumably the script is already inside $backupdir at this point - confirm.
rm -f "$LATEST_DUMP"
ln -s "$HOSTNAME-pg_dumpall_roles-${DATE}-${TIME}.gz" "$LATEST_DUMP"
#######################################
### Removing old BACKUPs
# Deletes the local dumps of this host that are older than
# $remove_older_than_DAYS days, logging each removal through $mylogger.
# Reads:  $backupdir, $HOSTNAME, $remove_older_than_DAYS, $progname, $DBG, $mylogger
# Exits:  1 if $backupdir cannot be entered or the candidate list is not created.

# Scratch file holding the space-separated list of dumps to delete.
BCKs_TO_BE_REMOVED=/tmp/remove_pg_dump_backups_older_than_$remove_older_than_DAYS
# A list left behind by an earlier, interrupted run is reported and discarded.
if [ -f "$BCKs_TO_BE_REMOVED" ]; then
  $DBG $mylogger -p local6.notice "$progname: Warning, found a previous file: $BCKs_TO_BE_REMOVED , I'll remove it."
  rm -f "$BCKs_TO_BE_REMOVED"
fi
# Never run the find/rm sequence from an unexpected directory.
if ! cd "$backupdir"; then
  $DBG $mylogger -p local6.error "$progname: FAILED to enter the backup directory $backupdir"
  exit 1
fi
find . -type f -mtime +"$remove_older_than_DAYS" -name "$HOSTNAME-pg_dump*gz" -exec echo -n "{} " \; > "$BCKs_TO_BE_REMOVED"
if [ ! -f "$BCKs_TO_BE_REMOVED" ]; then
  $DBG $mylogger -p local6.error "$progname: FAILED the search of old backups to be removed"
  exit 1
fi
# The list is deliberately word-split: dump file names contain no whitespace.
for old_pg_dump in $(cat "$BCKs_TO_BE_REMOVED") ; do
  $DBG $mylogger -p local6.notice "$progname: Removing old pg_dump: $old_pg_dump" && rm -f "$old_pg_dump"
done
# Drop the scratch list so the "previous file" warning only fires after an
# interrupted run.
rm -f "$BCKs_TO_BE_REMOVED"
cd
######################################
########## RSYNC STEP ################
# Mirrors $backupdir to the per-host directory on the NFS server over SSH.
# Reads:  $backupdir, $HOSTNAME, $progname, $DBG, $mylogger
# Exits:  1 if the SSH key is unreadable, the SSH port is unreachable or
#         rsync fails; 0 on success.
SSHSERVER=t3fs05.psi.ch
SSHKEY=/var/lib/pgsql/.ssh/id_rsa_t3fs05
# Test the same path that is later handed to ssh, not a second hard-coded copy.
if [ ! -r "$SSHKEY" ]; then
  echo "Can't read SSH private key $SSHKEY"
  exit 1
fi
if ! nc -z "$SSHSERVER" 22; then
  echo "Can't reach SSH server $SSHSERVER"
  exit 1
fi
# --delete makes the destination an exact mirror, so an empty $backupdir or
# $HOSTNAME would sync or prune the wrong tree; ${var:?} aborts in that case.
if ! /usr/bin/rsync --delete -a -e "/usr/bin/ssh -i $SSHKEY -o StrictHostKeyChecking=no" "${backupdir:?}/" "postgres@$SSHSERVER:/swshare/dcache-postgres-backups/${HOSTNAME:?}/"; then
  echo "Error during: /usr/bin/rsync --delete -a -e \"/usr/bin/ssh -i $SSHKEY -o StrictHostKeyChecking=no \" $backupdir/ postgres@$SSHSERVER:/swshare/dcache-postgres-backups/$HOSTNAME/"
  exit 1
fi
#####################################
$DBG $mylogger -p local6.notice "$progname: Successfully rsynced $backupdir to postgres@$SSHSERVER:/swshare/dcache-postgres-backups/$HOSTNAME/"
exit 0