/var/spool/pbs/mom_logs/
/var/log/glexec/lcas_lcmaps.log
nfs:/experiment_software/atlas on /experiment_software/atlas
nfs:/experiment_software/cms on /experiment_software/cms
nfs:/experiment_software/lhcb on /experiment_software/lhcb
nfs:/experiment_software/others/dech on /experiment_software/dech
nfs:/experiment_software/others/dteam on /experiment_software/dteam
nfs:/experiment_software/others/gear on /experiment_software/gear
nfs:/experiment_software/others/ops on /experiment_software/ops
nfs:/experiment_software/others/hone on /experiment_software/hone
/tmpdir_slurm and /home/wlcg.
Apr 18 15:29 [root@wn101:~]# ls -ld /tmpdir_slurm
lrwxrwxrwx 1 root root 16 Mar 29 10:36 /tmpdir_slurm -> /gpfs/tmpdir_slurm
/gpfs is mounted via NFS from ppnfs. Information about GPFS on Phoenix can be found in ServiceGPFS.
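For reference, an equivalent static NFS mount entry might look like the sketch below. This is only an illustration built from the options visible in the mount output further down; the real mount may be managed by cfengine or autofs rather than /etc/fstab, so verify locally before relying on it.

# hypothetical /etc/fstab entry for the read-only GPFS export from ppnfs
ppnfs:/gpfs  /gpfs  nfs  ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3  0 0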
# cfagent -q
umount /lcg.cscs.ch/packages/rpms
echo "touch /var/lock/subsys/local" > /etc/rc.d/rc.local
rm -fv /etc/yum.repos.d/sl-security.repo   # old SL59 security repo
yum update -y   # Update all possible from sl-security but NOT any IB or kernel related package.
yum install ca-policy-egi-core -y
yum install libtorque-2.4.16-1.cri $(ssh lrms01 'rpm -qa |grep torque-client') torque --disableexcludes=main --enablerepo=cscs -y
yum install emi-torque-client --enablerepo=epel -y
yum install cvmfs cvmfs-keys cvmfs-init-scripts emi-wn emi-glexec_wn --enablerepo=epel -y
chkconfig autofs on
service autofs start
scp ppnfs:/var/mmfs/gen/mmsdrfs /var/mmfs/gen/
mmrefresh -f
mmstartup
mmgetstate
reboot
mmstartup ; sleep 5s; ls /gpfs; df -h |grep gpfs
cvmfs_config probe
Probing /cvmfs/atlas.cern.ch... OK
Probing /cvmfs/atlas-condb.cern.ch... OK
Probing /cvmfs/lhcb.cern.ch... OK
Probing /cvmfs/hone.cern.ch... Failed!
Probing /cvmfs/cms.cern.ch... OK
mount |grep 'gpfs\|experiment'
ppnfs:/gpfs on /gpfs type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.64.227)
ppnfs:/gpfs/preproduction on /gpfs_pp type nfs (rw,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.64.227)
nfs:/experiment_software/atlas on /experiment_software/atlas type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.67.100)
nfs:/experiment_software/cms on /experiment_software/cms type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.67.100)
nfs:/experiment_software/lhcb on /experiment_software/lhcb type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.67.100)
nfs:/experiment_software/others/dech on /experiment_software/dech type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.67.100)
nfs:/experiment_software/others/dteam on /experiment_software/dteam type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.67.100)
nfs:/experiment_software/others/gear on /experiment_software/gear type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.67.100)
nfs:/experiment_software/others/ops on /experiment_software/ops type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.67.100)
nfs:/experiment_software/others/hone on /experiment_software/hone type nfs (ro,bg,proto=tcp,rsize=32768,wsize=32768,soft,intr,nfsvers=3,addr=148.187.67.100)
## /opt/glite/yaim/bin/yaim -c -s /opt/cscs/siteinfo/site-info.def -n WN -n TORQUE_client -n GLEXEC_wn
nohup /opt/glite/yaim/bin/yaim -c -s /opt/cscs/siteinfo/site-info.def -n WN -n TORQUE_client -n GLEXEC_wn 2>&1 | tee /root/yaim.log &
Run cfengine and grid-service2 restart:
cfagent -q; grid-service2 restart
# yum install emi-slurm-client emi-wn emi-glexec_wn globus-proxy-utils globus-gass-copy-progs --enablerepo=epel
Update /opt/cscs/siteinfo/wn-list.conf, which is under the SLURM group in cfengine. Also, do not forget to make sure that /etc/ssh/shosts.equiv properly reflects the values in wn-list.conf.
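A minimal sanity check, assuming both wn-list.conf and shosts.equiv list one hostname per line (adjust if the actual format differs), could be:

# report hosts present in wn-list.conf but missing from shosts.equiv
for h in $(grep -v '^#' /opt/cscs/siteinfo/wn-list.conf); do
    grep -q "^${h}" /etc/ssh/shosts.equiv || echo "missing from shosts.equiv: ${h}"
done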
cd /srv/cfengine/DSHGROUPS
touch INPUT/groups/ALL/WN_SLURM/wn01.lcg.cscs.ch
make all
/opt/glite/yaim/bin/yaim -c -s /opt/cscs/siteinfo/site-info.def -n WN -n GLEXEC_wn -n SLURM_utils -n SLURM_client
# service munge status
munged (pid 1651) is running...
# service slurm status
slurmd (pid 25551) is running...
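To double-check that the node has actually registered with the SLURM controller, the standard SLURM commands below can be used (this assumes the node name in SLURM is the short hostname):

# controller's view of this node; the state should not be DOWN or DRAINED
scontrol show node $(hostname -s)
sinfo -N -n $(hostname -s)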
service autofs start   # autofs is chkconfig'd by cfengine but we need to start it as the machine hasn't rebooted
cvmfs_config probe
Apr 18 15:13 [root@wn101:~]# cat /var/spool/pbs/mom_priv/config
$logevent 255
# MOM interval in seconds. Should be <= servers job_stat_rate
$check_poll_time 90
# Interval of information update to server. Should be <= scheduling interval
$status_update_time 90
$timeout 30
# Moab takes care about killing jobs. This allows jobs to overrun walltime by some time
$ignwalltime true
$usecp arc01.lcg.cscs.ch:/home/nordugrid-atlas /home/nordugrid-atlas
$usecp arc02.lcg.cscs.ch:/home/nordugrid-atlas /home/nordugrid-atlas
$usecp ce01.lcg.cscs.ch:/home /home
$usecp ce02.lcg.cscs.ch:/home /home
# gLite 3.2 CREAM
$usecp cream01.lcg.cscs.ch:/opt/glite/var/cream_sandbox /lustre/scratch/CREAM_CE/cream01/cream_sandbox
# For EMI 1 (UMD 1.0.0 & UMD 1.1.0 releases) this line must be the following:
$usecp cream02.lcg.cscs.ch:/var/cream_sandbox /lustre/scratch/CREAM_CE/cream02/cream_sandbox
$tmpdir /tmpdir_pbs
# Torque's default connection timeout is 10ms instead of 10s... should be fixed in a later release, but for now:
# 4s works fine in production at Cyfronet (should be fine for Phoenix too)
$max_conn_timeout_micro_sec 4000000
# scale cputime and walltime to average HEP-SPEC06 published
# Average HEP-SPEC06/core (C+D): 9.69
# PhaseC: 10 --> 1.03
# PhaseD: 8.2 --> 0.85
$cputmult 1.03
$wallmult 1.03
# in case Lustre is slow we want to prevent the job from being requeued
$prologalarm 600
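If this config is changed, pbs_mom has to re-read it. A simple approach (a suggestion, not necessarily the exact procedure used on Phoenix) is to restart the MOM and then inspect its diagnostics:

# restart pbs_mom so it picks up mom_priv/config, then dump diagnostics
service pbs_mom restart
momctl -d 3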
/usr/local/bin/yum-with-glite groupupdate --enablerepo=cscs glite-WN
Update only emi-wn and emi-glexec_wn. Do not attempt to do it with all packages, as there is a newer version of libtorque in the CSCS repo that wants to be installed.
yum update --enablerepo=cscs --enablerepo=epel emi-wn emi-glexec_wn
Make sure that the torque packages are taken from the CSCS repo!
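One way to confirm where the installed torque packages actually came from is to list them with yum, which shows the originating repository in the right-hand column:

# the repo column should read @cscs for the torque packages
yum list installed 'torque*' 'libtorque*'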
Check /var/log/messages of the node:
Sep 7 17:37:37 ppwn04 pbs_mom: LOG_ERROR::sys_copy, command '/usr/bin/scp -rpB dteam001@ppcream02.lcg.cscs.ch:/var/local_cream_sandbox/dteam/_DC_com_DC_quovadisglobal_DC_grid_DC_switch_DC_users_C_CH_O_ETH_Zuerich_CN_Miguel_Angel_Gila_Arrondo_dteam_Role_NULL_Capability_NULL_dteam001/proxy/005d0b069e96cba166a0f1caf82a7ad25cc7b77612719093722029 crpp2_788806913.proxy' failed with status=1, giving up after 4 attempts
Sep 7 17:37:37 ppwn04 pbs_mom: LOG_ERROR::req_cpyfile, Unable to copy file dteam001@ppcream02.lcg.cscs.ch:/var/local_cream_sandbox/dteam/_DC_com_DC_quovadisglobal_DC_grid_DC_switch_DC_users_C_CH_O_ETH_Zuerich_CN_Miguel_Angel_Gila_Arrondo_dteam_Role_NULL_Capability_NULL_dteam001/proxy/005d0b069e96cba166a0f1caf82a7ad25cc7b77612719093722029 to crpp2_788806913.proxy
Then, make sure that the ssh_known_hosts file has been generated recently and contains the new keys by running the following command on the cfengine server:
/srv/cfengine/scripts/new_known_hosts
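Afterwards, you can confirm that the regenerated file really contains a key for the host the copy failed against (ppcream02 in the example above); ssh-keygen -F searches a known_hosts file for a given hostname:

# on the WN, look up the host key in the distributed ssh_known_hosts
ssh-keygen -F ppcream02.lcg.cscs.ch -f /etc/ssh/ssh_known_hosts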
# dmraid -s -d
DEBUG: _find_set: searching isw_bgidcceegc
DEBUG: _find_set: not found isw_bgidcceegc
DEBUG: _find_set: searching isw_bgidcceegc_Volume0
DEBUG: _find_set: searching isw_bgidcceegc_Volume0
DEBUG: _find_set: not found isw_bgidcceegc_Volume0
DEBUG: _find_set: not found isw_bgidcceegc_Volume0
DEBUG: _find_set: searching isw_bgidcceegc
DEBUG: _find_set: found isw_bgidcceegc
DEBUG: _find_set: searching isw_bgidcceegc_Volume0
DEBUG: _find_set: searching isw_bgidcceegc_Volume0
DEBUG: _find_set: found isw_bgidcceegc_Volume0
DEBUG: _find_set: found isw_bgidcceegc_Volume0
DEBUG: set status of set "isw_bgidcceegc_Volume0" to 8
*** Group superset isw_bgidcceegc
--> Active Subset
name   : isw_bgidcceegc_Volume0
size   : 927985664
stride : 128
type   : mirror
status : nosync
subsets: 0
devs   : 2
spares : 0
DEBUG: freeing devices of RAID set "isw_bgidcceegc_Volume0"
DEBUG: freeing device "isw_bgidcceegc_Volume0", path "/dev/sda"
DEBUG: freeing device "isw_bgidcceegc_Volume0", path "/dev/sdb"
DEBUG: freeing devices of RAID set "isw_bgidcceegc"
DEBUG: freeing device "isw_bgidcceegc", path "/dev/sda"
DEBUG: freeing device "isw_bgidcceegc", path "/dev/sdb"
# dmraid -ay
# dmraid -s
*** Group superset isw_bgidcceegc
--> Active Subset
name   : isw_bgidcceegc_Volume0
size   : 927985664
stride : 128
type   : mirror
status : ok
subsets: 0
devs   : 2
spares : 0
| ServiceCardForm | |
|---|---|
| Service name | WN |
| Machines this service is installed in | wn[01-79] |
| Is Grid service | Yes |
| Depends on the following services | cvmfs, gpfs, nfs, lrms |
| Expert | |