===== Emulating a Grid'5000 OAR setup =====
This setup fakes the node boot stages (reboot after a deploy job, or wake-up from standby) with delayed ''at'' commands.
It uses [[oar-docker]].
==== On the host ====
Start the oar-docker cluster from within an OAR git clone (a fuller command sequence is sketched below):
* cd .../oar
* oardocker start -v $PWD:/mnt -n 20
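If the oar-docker images have not been built yet, the usual workflow looks roughly as follows (subcommands from the oar-docker documentation; adapt the paths and node count to your setup):
$ cd .../oar
$ oardocker init             # generate the docker build context
$ oardocker build            # build the base images
$ oardocker install $PWD     # install OAR from the local sources
$ oardocker start -v $PWD:/mnt -n 20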
==== On the frontend container ====
* connect from the host to the frontend container:
$ oardocker connect frontend
* set up the ''atd'' service:
$ apt-get install -y at
$ /etc/init.d/atd start
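A quick sanity check that ''atd'' accepts delayed jobs (hypothetical test command):
$ echo "touch /tmp/at-works" | at now +1 minute
$ atq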
* set up the epilogue in file /etc/oar/epilogue:
#!/bin/bash
# Usage:
# Script is run under uid of oar who is sudo
# argv[1] is the jobid
# argv[2] is the user's name
# argv[3] is the file which contains the list of nodes used
# argv[4] is the job walltime in seconds
exec 1> /tmp/epilogue.$1.log
exec 2>&1
set -x
OARNODESETTINGCMD=/usr/local/sbin/oarnodesetting
# Simulate the boot of the nodes with an at command: set the nodes back
# to Alive asynchronously, about 1 minute later.
for n in $(sort -u $3); do  # one entry per node (the node file has one line per core)
  nodes="$nodes -h $n"
  echo "bash -xc '$OARNODESETTINGCMD -h $n -s Alive -p available_upto=2147483646' > /tmp/oarnodesetting.$1.$n 2>&1" | at now +1 minute
done
#$OARNODESETTINGCMD $nodes -s Absent -p available_upto=0
# set the job's nodes Absent, using the same command as used on g5k:
$OARNODESETTINGCMD -n -s Absent -p available_upto=0 --sql "resource_id IN (select assigned_resources.resource_id from jobs,assigned_resources,resources where assigned_resource_index = 'CURRENT' AND jobs.state = 'Running' AND jobs.job_id = $1 and moldable_job_id = jobs.assigned_moldable_job AND (resources.resource_id = assigned_resources.resource_id AND resources.type='default'))"
exit 0
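The epilogue must be executable by the oar user (chmod +x /etc/oar/epilogue). Once the server-side configuration below is in place, it can be exercised with a short job: right after the job terminates, its nodes should turn Absent, then come back Alive about a minute later (hypothetical session; node names depend on the cluster size):
$ oarsub -t cosystem -l nodes=2,walltime=0:10:0 "sleep 10"
$ oarnodes -s    # the job's nodes go Absent when it terminates
$ atq            # one pending at job per node, to set it back Alive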
* apply the following changes to the drawgantt-svg configuration (the label regex must match oardocker's ''nodeN'' hostnames, which have no domain nor dash):
--- /mnt/sources/visualization_interfaces/DrawGantt-SVG/drawgantt-config.inc.php	2015-10-22 00:26:56.132034018 +0200
+++ /etc/oar/drawgantt-config.inc.php 2015-10-21 23:43:25.228911673 +0200
@@ -71,10 +71,10 @@
'network_address' => '/^([^.]+)\..*$/',
);
$CONF['label_cmp_regex'] = array( // substring selection regex for comparing and sorting labels (resources)
- 'network_address' => '/^([^-]+)-(\d+)\..*$/',
+ 'network_address' => '/^([^-]+)(\d+)$/',
);
$CONF['resource_properties'] = array( // properties to display in the pop-up on top of the resources labels (on the left)
- 'deploy', 'cpuset', 'besteffort', 'network_address', 'type', 'drain');
+ 'deploy', 'cpuset', 'besteffort', 'network_address', 'type', 'drain', 'state', 'available_upto');
$CONF['resource_hierarchy'] = array( // properties to use to build the resource hierarchy drawing
'network_address','cpuset',
);
@@ -156,7 +156,7 @@
EOT;
// Standby job display options for the part shown in the future
-$CONF['standby_truncate_state_to_now'] = 1; // default: 1
+$CONF['standby_truncate_state_to_now'] = 0; // default: 1
// Besteffort job display options for the part shown in the future
$CONF['besteffort_truncate_job_to_now'] = 1; // default: 1
$CONF['besteffort_pattern'] = <<<EOT
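These hunks can be applied by hand in /etc/oar/drawgantt-config.inc.php on the frontend, or with patch if they are saved to a file first (hypothetical file name):
$ patch /etc/oar/drawgantt-config.inc.php < /tmp/drawgantt-config.diff
With standby_truncate_state_to_now set to 0, the standby periods (Absent with available_upto in the future) remain visible in the future part of the Gantt chart.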
==== On the server container ====
* connect from the host to the OAR server container:
$ oardocker connect server
* apply the following changes to oar.conf: run cosystem jobs on the frontend, enable the prologue/epilogue, and enable energy saving
* we use the cosystem job type instead of deploy (otherwise a ''deploy'' property would have to be added to the resources)
--- /root/oar.conf 2015-10-21 22:45:52.973104493 +0200
+++ /etc/oar/oar.conf 2015-10-21 22:44:05.752902069 +0200
@@ -63,10 +63,10 @@
LOG_FILE="/var/log/oar.log"
# Specify where we are connected with a job of the deploy type
-DEPLOY_HOSTNAME="127.0.0.1"
+DEPLOY_HOSTNAME="frontend"
# Specify where we are connected with a job of the cosystem type
-COSYSTEM_HOSTNAME="127.0.0.1"
+COSYSTEM_HOSTNAME="frontend"
# Specify the database field to use to fill the file on the first node of the
# job in $OAR_NODE_FILE (default is 'network_address'). Only resources with
@@ -197,8 +197,8 @@
# Files to execute before and after each job on the first computing node
# (by default nothing is executed)
-#PROLOGUE_EXEC_FILE="/path/to/prog"
-#EPILOGUE_EXEC_FILE="/path/to/prog"
+PROLOGUE_EXEC_FILE="/etc/oar/prologue"
+EPILOGUE_EXEC_FILE="/etc/oar/epilogue"
# Set the timeout for the prologue and epilogue execution on the OAR server
#SERVER_PROLOGUE_EPILOGUE_TIMEOUT="60"
@@ -374,16 +374,16 @@
#
# Parameter for the scheduler to decide when a node is idle.
# Number of seconds since the last job was terminated on nodes
-#SCHEDULER_NODE_MANAGER_IDLE_TIME="600"
+SCHEDULER_NODE_MANAGER_IDLE_TIME="300"
# Parameter for the scheduler to decide if a node will have enough time to sleep.
# Number of seconds before the next job
-#SCHEDULER_NODE_MANAGER_SLEEP_TIME="600"
+SCHEDULER_NODE_MANAGER_SLEEP_TIME="600"
# Parameter for the scheduler to know when a node has to be woken up before the
# beginning of the job when a reservation is scheduled on a resource on this node
# Number of seconds for a node to wake up
-#SCHEDULER_NODE_MANAGER_WAKEUP_TIME="600"
+SCHEDULER_NODE_MANAGER_WAKEUP_TIME="600"
# When OAR scheduler wants some nodes to wake up then it launches this command
# and puts on its STDIN the list of nodes to wake up (one hostname by line).
@@ -415,26 +415,26 @@
# - the launching of wakeup/shutdown commands can be windowized to prevent
# from electric peaks
# Possible values are "yes" and "no"
-ENERGY_SAVING_INTERNAL="no"
+ENERGY_SAVING_INTERNAL="yes"
# Path to the script used by the energy saving module to wake up nodes.
# This command is executed from the oar server host.
# OAR puts the node list on its STDIN (one hostname by line).
# The scheduler looks at the available_upto field in the resources table to know
# if the node will be started for enough time.
-#ENERGY_SAVING_NODE_MANAGER_WAKE_UP_CMD="/etc/oar/wake_up_nodes.sh"
+ENERGY_SAVING_NODE_MANAGER_WAKE_UP_CMD="/etc/oar/wake_up_nodes.sh"
# Path to the script used by the energy saving module to shut down nodes.
# This command is executed from the oar server host.
# OAR puts the node list on its STDIN (one hostname by line).
-#ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD="/etc/oar/shut_down_nodes.sh"
+ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD="/etc/oar/shut_down_nodes.sh"
# Timeout to consider a node broken (suspected) if it has not woken up
# The value can be an integer of seconds or a set of pairs.
# For example, "1:500 11:1000 21:2000" will produce a timeout of 500
# seconds if 1 to 10 nodes have to wakeup, 1000 seconds if 11 to 20 nodes
# have to wake up and 2000 seconds otherwise.
-#ENERGY_SAVING_NODE_MANAGER_WAKEUP_TIMEOUT="900"
+ENERGY_SAVING_NODE_MANAGER_WAKEUP_TIMEOUT="1:200 81:400 161:600 241:800"
# You can set up a number of nodes that must always be on. You can use the
# syntax in the examples if you want a number of alive nodes of different types
@@ -457,7 +457,7 @@
# Possible values are "yes" and "no"
# When set to "yes", the list of nodes to wake up or shut down is passed to
# ENERGY_SAVING_NODE_MANAGER_*_CMD through stdin.
-#ENERGY_SAVING_WINDOW_FORKER_BYPASS="no"
+ENERGY_SAVING_WINDOW_FORKER_BYPASS="yes"
# Time in second between execution of each window.
# Minimum is 0 to set no delay between each window.
@@ -471,7 +471,7 @@
# The energy saving module can be automatically restarted after reaching
# this number of cycles. This is a workaround for some DBD modules that do
# not always free memory correctly.
-#ENERGY_MAX_CYCLES_UNTIL_REFRESH=5000
+ENERGY_MAX_CYCLES_UNTIL_REFRESH=500
################################################################################
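For the new epilogue and energy-saving settings to be taken into account, restart the OAR server (init script name as found in the oardocker Debian containers):
$ /etc/init.d/oar-server restart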
* set up the ''atd'' service, as on the frontend:
$ apt-get install -y at
$ /etc/init.d/atd start
* add the energy saving wake-up script in file /etc/oar/wake_up_nodes.sh:
#!/bin/bash
# Sample script for energy saving (wake-up)
DATE=$(date +%F_%T)
OARNODESETTINGCMD=/usr/local/sbin/oarnodesetting
# stdout/err goes to oar.log
set -x
# Simulate the boot time with an at command: the nodes will be set Alive
# asynchronously, about 1 minute later.
# OAR gives one hostname per line on stdin.
while read n; do
  for nn in $n; do
    echo "bash -xc 'echo $DATE; $OARNODESETTINGCMD -h $nn -s Alive -p available_upto=2147483646' > /tmp/wake-up.$DATE.$nn 2>&1" | at now +1 minute
  done
done
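The script reads hostnames on stdin (one per line), so it can be tested by hand before OAR ever calls it (hypothetical node name):
$ chmod +x /etc/oar/wake_up_nodes.sh
$ echo node1 | /etc/oar/wake_up_nodes.sh
$ atq    # a pending job that will set node1 Alive in ~1 minute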
* add the energy saving shut-down script in file /etc/oar/shut_down_nodes.sh:
#!/bin/bash
# Sample script for energy saving (shut-down)
# stdout/err goes to oar.log
set -x
# Nothing is really needed here: OAR already handles the resource state
# change, and there is no real hardware to power off.
while read n; do
  for nn in $n; do
    echo "shutting down $nn"
  done
done
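Once everything is in place, idle nodes are set to standby by the scheduler after SCHEDULER_NODE_MANAGER_IDLE_TIME (300 s here) and woken up again when jobs need them. The cycle can be observed from the frontend (hypothetical session), or in drawgantt-svg where standby periods now show up in the future part of the chart:
$ chmod +x /etc/oar/shut_down_nodes.sh
$ watch -n 10 oarnodes -s    # states alternate between Alive and Absent (standby)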