:D 获取中...

1 Installation

1.1 Install Torque PBS On Server

su root
# Download torque-6.1.2.tar.gz and maui-3.3.1.tar.gz
yum install libxml2-devel openssl-devel gcc gcc-c++ boost-devel libtool 
tar -xvf torque-6.1.2.tar.gz
cd torque-6.1.2
./configure --prefix=/opt/torque --with-scp --with-default-server=node01

make install
make packages   # for compute nodes
libtool --finish /opt/torque/lib

# Copy the job-scheduler service scripts into /etc/init.d/ and enable them to start at boot
cp contrib/init.d/{pbs_{server,sched,mom},trqauthd} /etc/init.d/
systemctl enable pbs_server  
systemctl enable pbs_mom 
systemctl enable trqauthd

# Load the environment variables
source /etc/profile.d/torque.sh  

# Initialize and start the torque service
./torque.setup root

## Set node01 also as compute node
vi /var/spool/torque/mom_priv/config
## Add
$pbsserver node01
$logevent 255
$spool_as_final_name true

vi /var/spool/torque/server_priv/nodes
## Add, feature public for scheduling
node01 np=16 public 

chmod -Rf 1777 /var/spool/torque/undelivered/ 
chmod -Rf 1777 /var/spool/torque/spool/ 

systemctl start pbs_server  
systemctl start pbs_mom 
systemctl start trqauthd

1.2 Install maui On Server

su root
tar -xvf maui-3.3.1.tar.gz
cd maui-3.3.1
./configure --prefix=/opt/maui --with-pbs=/opt/torque
make 
make install

vi contrib/service-scripts/redhat.maui.d
## Modify 
MAUI_PREFIX=/opt/maui
daemon --user maui => daemon --user root

cp contrib/service-scripts/redhat.maui.d /etc/init.d/maui.d
chmod a+x /etc/init.d/maui.d
systemctl enable maui.d 
systemctl start maui.d

1.3 Add queue on Server

su root
## Add Firewall ports
for port in $(seq 15001 15009 ) ; do firewall-cmd --permanent --add-port=$port/tcp ; done
for port in $(seq 15001 15009 ) ; do firewall-cmd --permanent --add-port=$port/udp ; done
firewall-cmd --reload
firewall-cmd --list-all


## create queue public 
quename=public
qmgr -c "create queue $quename queue_type=execution"
qmgr -c "set queue $quename started=true"
qmgr -c "set queue $quename enabled=true"
qmgr -c "set queue $quename resources_default.nodes=1"
qmgr -c "set queue $quename resources_default.ncpus = 16"
qmgr -c "set queue $quename resources_default.walltime=2400:00:00"
qmgr -c "set queue $quename max_running=5"
qmgr -c "set queue $quename max_queuable=20"
qmgr -c "set queue $quename keep_completed=360000"
qmgr -c "set queue $quename resources_default.neednodes=$quename"
qmgr -c "set queue $quename acl_user_enable = True"
qmgr -c "set queue $quename acl_users=user1@node01"
qmgr -c "list queue $quename"
qmgr -c "set queue $quename acl_users+=user2@node01"

## for server
qmgr -c "set server scheduling=true"
qmgr -c "set server default_queue=public" 
qmgr -c "delete queue batch" 
#qmgr -c "set server query_other_jobs = true"

1.4 Install Torque PBS on Computing Nodes (e.g. node11)

## On server
## Move the line "172.xx.xx.xx node01" above the "127.0.0.1 ..." line in /etc/hosts
## and copy it to computing nodes 

## On computing nodes
torquehome=/path/to/torque-6.1.2   # replace with the build directory on node01 where "make packages" was run
scp node01:$torquehome/torque-package-mom-linux-x86_64.sh .
scp node01:$torquehome/torque-package-clients-linux-x86_64.sh .
./torque-package-mom-linux-x86_64.sh --install 
./torque-package-clients-linux-x86_64.sh --install 
systemctl enable pbs_mom
systemctl start pbs_mom
systemctl enable trqauthd
systemctl start trqauthd

vi /var/spool/torque/mom_priv/config
# Add
$pbsserver node01
$logevent 255
$spool_as_final_name true
## Back on the server
vi /var/spool/torque/server_priv/nodes
## Add
node01 np=16 public 
node11 np=16 public 

systemctl restart pbs_server

2 Errors

  1. Job stays queued and does not run, with the comment “draining system to allow starving job to run”. Solution: vi /var/spool/torque/sched_priv/sched_config and change "help_starving_jobs true ALL" to "help_starving_jobs false ALL".