|
-*- tcl-indent-level: 5; indent-tabs-mode: nil; -*-
vi: set autoindent expandtab tabstop=5 shiftwidth=5 :
manager.tcl Version 1.0
Wraps the genericAPI.tcl for use by the manager API.
set ::RCS_ID_managertcl {$Id: manager.tcl,v 1.519 2009/09/30 22:33:39 emaros Exp $}
set ::RCS_ID_managertcl [ string trim $::RCS_ID_managertcl "\$" ]
package provide manager 1.0
namespace eval mgr {
set activejobs 0
}
proc mgr::expandCmd { { cmd "" } } {
;## if its a xxx.meta command, expand and recurse.
if { [ catch {
if { [ llength [ split $cmd "\n" ] ] == 1 } {
set cmd [ mgr::parseMeta $cmd ]
set sequence [ mgr::expandCmd $cmd ]
} else {
set sequence [ mgr::parseBlock $cmd ]
}
} err ] } {
addLogEntry $err
return -code error "[ myName ]: $err"
}
return $sequence
}
proc mgr::commandSanityChecks { cmd } {
if { [ catch {
;## -np must be at least '2' since no search code is run
;## on the wrapper master and we include the wrapper master
;## as one of the nodes requested to avoid underrepresenting
;## the system requirements of large numbers of small jobs.
if { [ regexp -nocase -- {\s+-np\s+[01]\s+} $cmd ] } {
set err "minimum possible value of '-np' option is '2'"
return -code error $err
}
;## when ::REJECT_EXPLICIT_DYNLIB_PATHS is set to 1, we
;## reject al commands that attempt to specify a dynlib.so
;## by absolute path. this is done to prevent the running
;## of potentially hazardous experimental code on production
;## LDAS systems. PR #1825 10-07-2003
if { [ info exists ::REJECT_EXPLICIT_DYNLIB_PATHS ] && \
[ string equal 1 $::REJECT_EXPLICIT_DYNLIB_PATHS ] } {
set dyn_rx {\s+-dynlib\s+(\S+)\s+}
if { [ regexp -nocase -- $dyn_rx $cmd -> dynlib ] } {
regsub -all -- {[\}\{\"\s]} $dynlib {} dynlib
if { [ regexp {[\\\/]+} $dynlib ] } {
set msg "This system has ::REJECT_EXPLICIT_DYNLIB_PATHS"
append msg " enabled to prevent the use of experimental"
append msg " shared objects."
append msg " You may not specify path components as"
append msg " part of your -dynlib option (and you"
append msg " specified '$dynlib')."
return -code error $msg
}
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::cmdParseUserTimeout { cmd } {
if { [ catch {
set timeout 0
;## optional option available to users in order to cause
;## jobs to timeout *more* quickly.
set uto_rx {\s+-usertimeout\s+(\S+)\s+}
if { [ regexp -nocase -- $uto_rx $cmd -> timeout ] } {
regsub -all -- {[\}\{\"\s]} $timeout {} timeout
set standard $::MANAGER_ABORT_AFTER_N_SECONDS_IN_ONE_API
set timeout [ expr { round($timeout) } ]
if { $timeout > 2 * $standard } {
set timeout [ expr { 2 * $standard } ]
}
} else {
set timeout 0
}
} err ] } {
set err "Error in -usertimeout option: '$err'"
return -code error "[ myName ]: $err"
}
return $timeout
}
proc mgr::parseMeta { cmd } {
set procname {}
set usropts [ list ]
set command {}
set macrolist [ mgr::globMacros ]
regexp $::cmd_rx $cmd -> procname usropts
;## first pass expansion from meta file
if { [ lsearch -exact $macrolist ${procname}.meta ] > -1 } {
set fname [ file join $::LDASMACROS meta ${procname}.meta ]
set fid [ open $fname r ]
;## meta macros are constrained to start with a line
;## declaring the "opts" and their defaults.
set command [ string trim [ gets $fid ] ]
;## test it.
if { ! [ regexp {^set opts \{[^\}]+\}} $command ] } {
set msg "the first line of the meta file must be\n"
append msg "the declaration of 'opts', but I found:\n"
append msg "$command\ninstead."
return -code error "[ myName ]: $msg"
}
;## apply the user supplied options
if { [ llength $usropts ] } {
append command "\n metaOpts \{ $usropts \}\n"
}
append command [ read $fid ]
::close $fid
} else {
return -code error "[ myName ]: Bad meta command: '$cmd'"
}
return $command
}
proc mgr::parseBlock { cmd } {
set api {}
set tmp [ list ]
set sequence [ list ]
set api_rx ^([ join $::API_LIST | ])\\s*\$
set macrolist [ mgr::globMacros ]
foreach line [ split $cmd ";\n" ] {
set line [ string trim $line ]
if { [ regexp {(^\#.*$|^$)} $line ] } {
continue
}
if { [ regexp {^set opts \{[^\}]+\}} $line ] } {
set sequence $line
continue
}
if { [ regexp -nocase $api_rx $line api ] } {
if { [ string equal $api $tmp ] } {
continue
} else {
set tmp $api
continue
}
}
if { ! [ regexp $::cmd_rx $line -> procname opts ] } {
set procname [ string trim $line ]
}
if { [ string equal metaOpts $procname ] } {
append sequence "\n metaOpts $opts\n"
continue
}
if { [ lsearch -exact $macrolist ${procname}.$api ] > -1 } {
set fname [ file join $::LDASMACROS $api ${procname}.$api ]
append sequence "\n$api\n"
set fid [ open $fname r ]
set tmp {}
while { [ gets $fid code ] > -1 } {
if { ! [ regexp {(^[ ]*;?\#)|(^[ ]*$)} $code ] } {
append tmp "$code\n"
}
}
::close $fid
append sequence \{[ string trim $tmp ]\}
set procname {}
continue
}
} ;# end of foreach
return $sequence
}
proc mgr::globMacros { } {
set macrolist {}
foreach file [ glob -nocomplain $::LDASMACROS/*/*.* ] {
lappend macrolist [ file tail $file ]
}
if { [ llength $macrolist ] < 3 } {
set msg "*********************************************\n"
append msg "ABORT!! No macro code found under '$::LDASMACROS'\n"
append msg "If this is a new installation of LDAS, you\n"
append msg "must read the documentation for installing\n"
append msg "the manager API.\n"
append msg "If this is not a new installation, then\n"
append msg "either you have deleted the macro files or\n"
append msg "redefined ::LDASMACROS to point to the wrong\n"
append msg "location. You may also being experiencing\n"
append msg "transient NFS problems.\n\n"
append msg " NO JOBS CAN BE PROCESSED\n"
append msg " UNTIL THIS PROBLEM IS SOLVED.\n"
append msg "*********************************************\n"
puts stderr $msg
return -code error "[ myName ]:\n$msg"
}
set macrolist
}
if { [ mgr::bootstrapAPI $apis ] } { ... }
Comments:
proc mgr::bootstrapAPI { { apis "" } { comment "" } } {
set redirect /dev/null
set libmalloc [ list ]
set libstdc [ list ]
set apihost unknown!
if { ! [ string length $apis ] } {
set msg "you must specify an api, list of api's, or 'all'"
return -code error "[ myName ]: $msg"
}
if { [ string equal all $apis ] } {
set apis $::API_LIST
} else {
foreach api $apis {
if { [ lsearch $::API_LIST $api ] < 0 } {
set msg "cannot bootstrap unregistered API: '$api'"
return -code error $msg
}
}
}
;## emergency port requests always succeed
set caller [ uplevel myName ]
if { [ string equal emergency_callback $caller ] } {
mgr::bootstrapLock $apis 1
} else {
set apis [ mgr::bootstrapLock $apis 0 ]
}
set localhost [ set ::${::LDAS_SYSTEM}(manager) ]
foreach api $apis {
set api [ string tolower $api ]
regsub {api$} $api {} api
if { [ string equal $api manager ] } { continue }
set apiPid [ getPid $api ]
set apiPid [ string trim $apiPid ]
addLogEntry "$api pid is '$apiPid'" purple
;## if API is up but appears stuck and user does not want reboot
;## hold off rebooting API
if { [ string length $apiPid ] && ! $::STARTUP_IN_PROGRESS && \
[ info exist ::WAIT_FOR_API ] && [ set ::WAIT_FOR_API ] } {
set subject "hold off bootstraping $api"
set body "set ::WAIT_FOR_API to 0 to resume booting"
addLogEntry "Subject: ${subject}; Body: ${body}" email
continue
}
if { [ catch {
mgr::shutdownAPI $api $comment
mgr::bootstrapPause $api
set postmortem [ mgr::postMortem $api ]
;## if the call to mgr::postMortem returned data
if { [ string length $postmortem ] } {
addLogEntry $postmortem blue mgr::postMortem
}
set apihost [ set ::${api}(host) ]
} err ] } {
addLogEntry "$err (api: $api)" 2
}
if { [ catch {
set libstdc [ mgr::libstdcPreload $api ]
} err ] } {
continue
}
if { [ catch {
set libmalloc [ mgr::libmallocPreload $api ]
} err ] } {
continue
}
;## Ed asked for different vars for different platforms 04/28/04
;## this will return SunOS or Linux
;## Ed asked for this 07/02/07
;## have different env for different APIs to run libumem
set extra_env [ mgr::getExtraEnv $api ]
set extra_env [ subst -nocommands $extra_env ]
set api_cmd "${api}API"
if { ! [ string equal $apihost $localhost ] } {
set api_cmd "$::LDAS/bin/${api_cmd}"
}
set launcher [ mgr::getLauncher $api $api_cmd ]
mgr::perApiRunningStats $api restart [ gpsTime ]
set dir [ mgr::makeApiAtHome $api ]
set dotfile "${dir}/.. "
set fid [ open $dotfile w ]
file attributes $dotfile -permissions 0600
puts $fid $::MGRKEY
::close $fid
set redirect ${dir}/${api}.log
addLogEntry "bootstrapping $api API on $apihost"
set apipath ${::LDASLIB}/${api}API
if { [ catch {
if { [ string equal $apihost $localhost ] } {
set LD_LIBRARY_PATH $::env(LD_LIBRARY_PATH)
set cwd [ pwd ]
cd $dir
set cmd "/bin/env $extra_env "
append cmd "RUNDIR=$::env(RUNDIR) "
append cmd "LD_LIBRARY_PATH=$apipath:$::LD_LIBRARY_PATH "
append cmd "DB2INSTANCE=$::DB2INSTANCE "
append cmd "$launcher >& $redirect &"
eval ::exec $cmd
cd $cwd
} else {
set cmd "alias cd=cd && cd $dir && /bin/env "
append cmd "RUNDIR=$::env(RUNDIR) HOST=$apihost "
append cmd "PATH=$::LDAS/bin:$::env(PATH) "
append cmd "LD_LIBRARY_PATH=$apipath:$::LD_LIBRARY_PATH "
append cmd "DB2INSTANCE=$::DB2INSTANCE $extra_env "
append cmd "$launcher >& $redirect &"
execssh $apihost $cmd
}
addLogEntry "Startup command for ${api}API: '$cmd'" blue
set up [ mgr::bootstrapStatus $api ]
set pid [ getPid $api ]
;## if we succeeded inform martian boxes of
;## FTP and HTTP info for gateway.
if { $up } {
mgr::pushFTPandHTTPinfo $api $::LDAS_VERSION
} else {
set msg "failed connect to operator socket on $api API"
return -code error "[ myName ]: $msg"
}
mgr::bootstrapSuccessEmail $api $apihost $pid
mgr::bootstrapUnlock $api
mgr::bootstrapFatalErrorEmail $api $apihost NULL 1
} err ] } {
addLogEntry $err red
mgr::bootstrapUnlock $api
mgr::bootstrapFatalErrorEmail $api $apihost $err 0
}
} ;## end of foreach on api
after 0 mgr::apiStatusPage
}
proc mgr::bootstrapSuccessEmail { api apihost pid } {
if { [ catch {
set report 1
if { [ info exists ::bootstrapemailpid($api) ] } {
set last_pid $::bootstrapemailpid($api)
if { [ string equal $pid $last_pid ] } {
set report 0
}
}
set ::bootstrapemailpid($api) $pid
if { $report } {
if { [ info exists ::jobid ] && \
[ string equal STARTUP $::jobid ] } {
set msg "done. $api API is now running as pid: $pid"
addLogEntry $msg green
} else {
set logurl $::HTTPURL
regexp {(.+)/jobs} $logurl -> logurl
set logurl ${logurl}/logs
set subject "$::LDAS_SYSTEM ${api}API restarted"
set msg "$::LDAS_SYSTEM ${api}API restarted.\n"
append msg "$api API is now running as pid '${pid}' on "
append msg "$::LDAS_SYSTEM host ${apihost}.\nSee the "
append msg "$::LDAS_SYSTEM LDAS logs for details: "
append msg "$logurl"
addLogEntry "Subject: ${subject}; Body: $msg" email
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::bootstrapFatalErrorEmail { api apihost err { cancel 0 } } {
if { $cancel == 1 } {
if { [ info exists ::fatalerroremailtime($api) ] } {
::unset ::fatalerroremailtime($api)
}
return {}
}
if { [ catch {
set errlvl blue
if { [ info exists ::UNABLE_TO_START_API_EMAIL_INTERVAL_S ] } {
set interval $::UNABLE_TO_START_API_EMAIL_INTERVAL_S
} else {
set interval 3600
}
set now [ clock seconds ]
if { [ info exists ::fatalerroremailtime($api) ] } {
set last_notified $::fatalerroremailtime($api)
if { $now - $last_notified > $interval } {
set ::fatalerroremailtime($api) $now
set errlvl email
} else {
set errlvl red
}
} else {
set last_notified $now
set ::fatalerroremailtime($api) $now
set errlvl email
}
set subject "$::LDAS_SYSTEM bootstrap of $api failed!"
set msg "could not start $api API on $apihost!\n"
append msg "(error: $err)"
if { [ string equal email $errlvl ] } {
addLogEntry "Subject: ${subject}; Body: $msg" $errlvl
} else {
addLogEntry $msg $errlvl
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::bootstrapPause { api } {
if { [ catch {
if { [ info exists ::BOOTSTRAP_PAUSE_TIMEOUT ] } {
if { ! [ regexp {\d+} $::BOOTSTRAP_PAUSE_TIMEOUT ] } {
set ::BOOTSTRAP_PAUSE_TIMEOUT 10000
} elseif { $::BOOTSTRAP_PAUSE_TIMEOUT < 5000 } {
set ::BOOTSTRAP_PAUSE_TIMEOUT 5000
}
} else {
set ::BOOTSTRAP_PAUSE_TIMEOUT 10000
}
set pause $::BOOTSTRAP_PAUSE_TIMEOUT
set first [ expr { $pause / 5 } ]
set second $first
set third [ expr { $first * 3 } ]
set sid [ sock::open $api emergency ]
::close $sid
set start [ clock clicks -milliseconds ]
addLogEntry "giving ${api}API $first ms to shut down..."
after $first
set sid [ sock::open $api emergency ]
::close $sid
addLogEntry "giving ${api}API $second more ms to shut down..."
after $second
set sid [ sock::open $api emergency ]
::close $sid
addLogEntry "giving ${api}API $third more ms to shut down..."
after $third
set sid [ sock::open $api emergency ]
::close $sid
set msg "${api}API failed to shutdown in $pause ms. "
append msg "will try 'genocide' action. "
append msg "if this fails, somebody will have to "
append msg "clean up the ${api}API BY HAND..."
addLogEntry $msg red
} err ] } {
if { [ info exists start ] } {
set done [ clock clicks -milliseconds ]
set dt [ expr { $done - $start } ]
addLogEntry "${api}API shut down in $dt ms"
}
}
}
proc mgr::libstdcPreload { api } {
if { [ catch {
;## force preloading of libstdc++ as required
;## to avoid instant abort when c++ exception
;## is thrown which is not handled by a c program,
;## i.e. Tcl.
;## if getApiOS failed due to ssh issue throw exeption
;## and caller will skip the API.
set libstdc [ list ]
set OS [ getApiOS $api ]
if { [ regexp {Linux} $OS ] } {
if { [ info exists ::STDCPLUSPLUSLIBPAT ] } {
set libstdc [ libstdcPlusPlus $api ]
}
} elseif { [ regexp {Sun} $OS ] } {
if { [ info exists ::SUN_LIBSTDCPP ] } {
set libstdc $::SUN_LIBSTDCPP
}
} elseif { [ regexp {ssh_failed} $OS ] } {
return -code error $OS
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $libstdc
}
proc mgr::libmallocPreload { api } {
if { [ catch {
;## force preloading of libXXmalloc as required
;## to avoid monotonic increasing memory usage due
;## to memory fragmentation.
set libmalloc [ list ]
set OS [ getApiOS $api ]
if { [ regexp {Linux} $OS ] } {
if { [ info exists ::LINUX_MALLOC ] } {
set libmalloc $::LINUX_MALLOC
}
} elseif { [ regexp {Sun} $OS ] } {
if { [ info exists ::SUN_MALLOC ] } {
set libmalloc $::SUN_MALLOC
}
} elseif { [ regexp {ssh_failed} $OS ] } {
return -code error $OS
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $libmalloc
}
proc mgr::makeApiAtHome { api } {
if { [ catch {
set dir $::env(RUNDIR)/${api}API
if { ! [ file exists $dir ] } {
file mkdir $dir
} else {
if { ! [ file isdirectory $dir ] } {
file rename $dir ${dir}.bogus
file mkdir $dir
set msg "file with name reserved for $api API "
append msg "working directory renamed to ${dir}.bogus"
addLogEntry $msg red
}
}
file attributes $dir -permissions 0755
;## move misplaced .rsc and .ini files
foreach ext [ list rsc ini ] {
if { [ file exists $::env(RUNDIR)/LDAS${api}.$ext ] } {
file rename -force \
$::env(RUNDIR)/LDAS${api}.$ext $dir/LDAS${api}.$ext
set msg "moving LDAS${api}.$ext from $::env(RUNDIR) to $dir"
addLogEntry $msg blue
}
}
;## make sure browsers can browse things in here
;## that have the correct permissions
set htaccess ${dir}/.htaccess
if { ! [ file exists $htaccess ] } {
set fid [ open $htaccess w 0644 ]
puts $fid "DefaultType text/html"
::close $fid
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $dir
}
proc mgr::bootstrapStatus { api } {
set flag 0
set i 0
if { ! [ info exist ::API_BOOTSTRAP_MAX_S ] } {
set ::API_BOOTSTRAP_MAX_S 60
addLogEntry "Resource ::API_BOOTSTRAP_MAX_S set to $::API_BOOTSTRAP_MAX_S by default" blue
}
if { ! [ info exist ::API_BOOTSTRAP_DELAY_S ] } {
set ::API_BOOTSTRAP_DELAY_S 5
addLogEntry "Resource ::API_BOOTSTRAP_DELAY_S set to $::API_BOOTSTRAP_DELAY_S by default" blue
}
set maxtime [ expr $::API_BOOTSTRAP_MAX_S * 1000 ]
set delay [ expr $::API_BOOTSTRAP_DELAY_S * 1000 ]
while { $i < $maxtime } {
if { [ catch {
if { [ string equal manager $api ] } {
set flag 1
} else {
set sid [ sock::open $api operator ]
::close $sid
set flag 1
}
} err ] } {
catch { ::close $sid }
if { [ info exists ::DEBUG_BOOTSTRAP_LOCK ] && \
[ string equal 1 $::DEBUG_BOOTSTRAP_LOCK ] } {
addLogEntry "${api}API $err" purple
}
set flag 0
}
if { ! $flag } {
incr i $delay
after $delay
} else {
break
}
}
return $flag
}
proc mgr::bootstrapLock { apis { reset 0 } } {
if { [ catch {
set temp [ list ]
set now [ clock seconds ]
foreach api $apis {
if { ! [ info exists ::__bootstraplock_$api ] } {
set ::__bootstraplock_$api $now
lappend temp $api
} elseif { $reset } {
set ::__bootstraplock_$api $now
} else {
set lastlock [ lindex [ set ::__bootstraplock_$api ] end ]
if { $now > $lastlock } {
lappend ::__bootstraplock_$api $now
}
}
}
if { [ info exists ::DEBUG_BOOTSTRAP_LOCK ] && \
[ string equal 1 $::DEBUG_BOOTSTRAP_LOCK ] } {
set report [ list ]
set locks [ info vars ::__boot* ]
foreach lock $locks {
lappend report "$lock [ set $lock ] "
}
addLogEntry $report purple
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $temp
}
proc mgr::bootstrapUnlock { api } {
if { [ catch {
if { [ info exists ::__bootstraplock_$api ] } {
set locks [ set ::__bootstraplock_$api ]
if { [ llength $locks ] > 1 } {
set locks [ lrange $locks 1 end ]
set ::__bootstraplock_$api $locks
set locked 1
} else {
::unset ::__bootstraplock_$api
set locked 0
}
} else {
set locked 0
}
if { [ info exists ::DEBUG_BOOTSTRAP_LOCK ] && \
[ string equal 1 $::DEBUG_BOOTSTRAP_LOCK ] } {
set report [ list ]
set locks [ info vars ::__boot* ]
foreach lock $locks {
lappend report "$lock [ set $lock ] "
}
addLogEntry $report purple
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $locked
}
proc bootstrapLockTimeout { } {
if { [ catch {
set locks [ list ]
set api oops!
set now [ clock seconds ]
set notice "The resource variable ::MGR_BOOTSTRAPLOCK_TIMEOUT "
append notice "has been\nset to \$::MGR_BOOTSTRAPLOCK_TIMEOUT."
append notice " If this value is less than "
append notice "15 * number_of_beowulf_nodes\n"
append notice "or the number_of_mpi_search_users, "
append notice "whichever is GREATER for this system, then\n"
append notice "the value should be adjusted, and made to be\n"
append notice "at least 15 * (number of nodes or users)."
;## rationalise resource variable
if { ! [ info exists ::MGR_BOOTSTRAPLOCK_TIMEOUT ] } {
set ::MGR_BOOTSTRAPLOCK_TIMEOUT 300
set notice [ subst -nocommands $notice ]
addLogEntry $notice blue
} else {
;## constrain timeout to an integer number of
;## seconds between 0 and 9999
if { ! [ regexp {^\d{1,4}$} $::MGR_BOOTSTRAPLOCK_TIMEOUT ] } {
set ::MGR_BOOTSTRAPLOCK_TIMEOUT 300
set notice [ subst -nocommands $notice ]
addLogEntry $notice blue
}
}
set to $::MGR_BOOTSTRAPLOCK_TIMEOUT
;## iterate over the locks, unset the expired ones
;## and send email to the responsible party
set locks [ info vars ::__bootstraplock_* ]
foreach lock $locks {
regexp {__bootstraplock_(\S+)} $lock -> api
if { [ string equal manager $api ] } { continue }
set then [ set $lock ]
if { $now >= [ lindex $then end ] + $to } {
::unset $lock
set msg "$lock expired after $to seconds. "
append msg "something is wrong with the $::LDAS_SYSTEM "
append msg "LDAS system $api API!"
set subject "$::LDAS_SYSTEM $lock expired!"
set notice [ subst -nocommands $notice ]
set msg "${msg}\n\n$notice"
addLogEntry "Subject: ${subject}; Body: $msg" mail
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::pushFTPandHTTPinfo { api { version "" } } {
set errs [ list ]
if { [ catch {
if { [ string length $version ] } {
mgr::apiVersionCheck $api $version
} else {
mgr::apiVersionCheck $api UNKNOWN
}
set vars [ list $::FTPURL $::FTPDIR $::HTTPURL \
$::HTTPDIR $::GRIDFTPURL $::GRIDFTPDIR \
[ lindex $::LDAS_GATEWAY 0 ] \
[ lindex $::LDAS_GATEWAY 1 ] $::LDAS_SYSTEM ]
set sid [ sock::open $api emergency ]
fconfigure $sid -blocking off
puts $sid "$::MGRKEY setFTPandHTTPinfo $vars"
::close $sid
} err ] } {
catch { ::close $sid }
return -code error "[ myName ]: $err"
}
}
mgr::shutdownAPI $apinameComments:
proc mgr::shutdownAPI { { apis "" } { comment "" } } {
if { ! [ string length $apis ] } {
set apis $::API_LIST
}
foreach api $apis {
if { [ string equal $api manager ] } {
continue
}
;## abort sequence for an assistant manager
set assistants [ namespace children :: ${::RUNCODE}* ]
foreach assistant $assistants {
if { [ catch {
set name [ string trim $assistant : ]
if { [ info exists ::${name}::api ] && \
[ string equal $api [ set ::${name}::api ] ] } {
set msg "$api API shutting down NOW!\n$name aborting!"
set ::${name}::errorapi [ set ::${name}::api ]
if { [ llength [ namespace children :: ::$name ] ] } {
::${name}::predelete $msg $msg
if { [ llength [ namespace children :: ::$name ] ] } {
::${name}::seq
}
}
}
} err ] } {
addLogEntry $err red
}
}
if { [ catch {
;## memory usage and other API process info
set statefile [ file join $::LDASLOG ${api}.status ]
set arc $::LDASARC/${api}API/${api}_status.[ gpsTime ]
if { [ file exists $statefile ] } {
set fid1 [ open $statefile r ]
set fid2 [ open $arc w ]
puts $fid2 "<html>\n<pre>"
fcopy $fid1 $fid2
::close $fid2
::close $fid1
file delete -force -- $statefile
}
} err ] } {
}
;## take a gcore of the API before shutdown
if { ! [ string equal nocore $comment ] } {
set apihost [ set ::${api}(host) ]
set now [ clock seconds ]
set pid ""
if { [ info exists ::bootstrapemailpid($api) ] } {
set pid $::bootstrapemailpid($api)
}
if { ![ string length $pid ] } {
set lockfile [ glob -nocomplain $::TOPDIR/${api}API/.${api}.*.lock ]
regexp {(\d+)} $lockfile -> pid
addLogEntry "obtained $api pid $pid from lock file" purple
}
if { [ string length $pid ] && [ info exist ::FORCE_GCORE ] && $::FORCE_GCORE} {
set cmd "alias cd=cd && cd $::TOPDIR/savedCores && gcore -o ${api}_${now}_gcore $pid"
addLogEntry "executing $cmd on $apihost" purple
execssh $apihost $cmd
}
}
if { [ catch {
set sid [ sock::open $api emergency ]
puts $sid "$::MGRKEY ${api}::sHuTdOwN \"$comment\""
after 100
cmd::receive $sid ${api}_$sid mgr::logReply
addLogEntry "$api API Shut Down" red
} err ] } {
}
if { [ catch {
;## I think we get here and delete the lock file
;## under all circumstances...
set globpat [ file join [ apiDirectory $api ] .$api.*.lock ]
set locks [ glob -nocomplain $globpat ]
foreach lock $locks {
set pid [ lindex [ split [ file tail $lock ] . ] end-1 ]
file delete -force -- $lock
addLogEntry "deleted lock file: $lock" blue
}
} err ] } {
}
}
}
proc mgr::postMortem { api } {
set data [ list ]
set url [ list ]
if { [ catch {
set url [ file dirname $::HTTPURL ]
regsub / $url // url
;## we may need an ::LDAS_LOG_ARCHIVE_URL
set url ${url}/logs/archive/${api}API/
set time [ gpsTime ]
set home [ apiDirectory $api ]
set stderrlog ${home}/${api}.log
if { [ file exists $stderrlog ] } {
set url $url${api}.log.$time
set url "stdout/stderr for previous run of $api API: $url"
set arcname $::LDASARC/${api}API/${api}.log.$time
set dir $::LDASARC/${api}API
if { ! [ file isdirectory $dir ] } {
file mkdir $dir
file attributes $dir -permissions 0755
gifBalls $dir
}
set fid1 [ open $arcname w ]
puts $fid1 "<html>\n<pre>"
set fid2 [ open $stderrlog r ]
fcopy $fid2 $fid1
::close $fid2
::close $fid1
} else {
set url [ list ]
}
} err ] } {
if { [ info exists fid ] } {
catch { ::close $fid }
}
if { [ info exists fid1 ] } {
catch { ::close $fid1 }
}
if { [ info exists fid2 ] } {
catch { ::close $fid2 }
}
return -code error "[ myName ]: $err"
}
return $url
}
proc mgr::dontStepOnJob { api timeout } {
set i 0
set vnames [ infoVars ]
;## try not to disturb a running job
foreach vname $vnames {
if { [ regexp $::RUNCODE $vname ] } {
while { [ regexp $api [ set $vname ] ] && $i < $timeout } {
incr i
sleep 1000
}
}
}
}
proc mgr::preValidate { cmd } {
if { [ catch {
set seqpt {}
set user [ list ]
set soname [ list ]
regexp -nocase -- {-name\s+\{?\s*(\S+)} [ lindex $cmd 1 ] -> user
set user [ string trim $user "{} " ]
;## see if dso is temporarily blocked!
regexp -nocase -- {-dynlib\s+\{?\s*(\S+)} $cmd -> soname
set soname [ string trim $soname "{} " ]
set cmdname [ lindex [ lindex $cmd 2 ] 0 ]
;## see if user is temporarily blocked!
if { [ lsearch $::mgr::blocked_users $user ] != -1 } {
set seqpt "ADMIN NOTICE: "
set err "user '$user' has been temporarily blocked!"
after 0 \
[ list mgr::logRejectedJob $user $cmdname blocked_user ]
return -code error $err
}
;## see if dso is temporarily blocked!
if { [ lsearch $::mgr::blocked_dsos $soname ] != -1 } {
set seqpt "ADMIN NOTICE: "
set err "DSO '$soname' has been temporarily blocked!"
after 0 \
[ list mgr::logRejectedJob $user $cmdname blocked_dso ]
return -code error $err
}
;## see if the user is exceeding his quota!
set seqpt "QUOTA EXCEEDED: "
mgr::userQuota $cmd
} err ] } {
return -code error "$seqpt$err"
}
}
proc mgr::userInfo { args } {
set return_info [ list ]
set scratch [ list ]
set now [ clock seconds ]
if { [ catch {
if { [ catch {
array set input $args
} ] } {
eval array set input $args
}
foreach user $::QUEUE(USERS) {
if { ! [ llength $user ] } { continue }
if { [ llength $user ] != 13 } {
mgr::reportCorruptedQueue $user
}
;## each user entry contains the three user info item
;## pairs and a list of the last N jobs processed for
;## that user.
set info [ lrange $user 0 end-1 ]
set jobs [ lindex $user end ]
array set q $info
if { [ string equal $q(-name) $input(-name) ] } {
;## Peter Shawhan's challenge/response code added
;## 05/20/02
if { [ string equal md5protocol $input(-password) ] } {
set salt [ uplevel set salt ]
set digest [ uplevel set digest ]
set test [ key::md5 $q(-password)$salt ]
if { [ string equal $digest $test ] } {
set hash $q(-password)
} else {
set hash invalid
}
} else {
set hash [ key::md5 $input(-password) ]
}
if { ! [ string equal $q(-password) $hash ] } {
set msg "Incorrect password given for user $q(-name)"
return -code error $msg
}
if { ! [ regexp {^\d+$} $q(-expires) ] } {
set q(expires) [ clock scan $q(-expires) ]
}
if { $now > $q(-expires) } {
set edate [ clock format $q(-expires) -format %D ]
set msg "MOU for user $q(-name) expires on ${edate}. "
append msg "Please contact LDAS admin to apply for "
append msg "an extension."
return -code error $msg
}
;## here we mangle the args so that the user password
;## is not updated when the other user data is updated!
regsub -- $input(-password) $args $q(-password) args
set info [ expandOpts info ]
set return_info $info
if { [ llength $return_info ] < 12 } {
set msg "QUEUE(USERS) has been corrupted!\n"
append msg "Please check the user info file for bad\n"
append msg "entries, missing \"'s, etc."
return -code error $msg
}
}
lappend scratch [ concat $info [ list $jobs ] ]
} ;## end of foreach
if { [ llength $return_info ] } {
set ::QUEUE(USERS) $scratch
if { ! [ regexp "\[a-z\]+" $::QUEUE(USERS) ] } {
set msg "The QUEUE(USERS) queue has been\n"
append msg "initialised, but is empty."
return -code error $msg
}
} else {
return -code error "unknown user: '$input(-name)'"
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $return_info
}
proc mgr::reportCorruptedQueue { args } {
set baddata [ list $args ]
set subject "$::LDAS_SYSTEM users queue has become corrupted!"
set msg "${subject}\n"
append msg "The manager API choked on this user entry,\n"
append msg "which does not appear to have the correct\n"
append msg "number of fields:\n\n'$baddata'"
addLogEntry "Subject: ${subject}; Body: $msg" mail
return -code error "[ myName ]: $subject"
}
proc mgr::assignJobid { args } {
if { [ catch {
set return_info [ list ]
set scratch [ list ]
set now [ clock seconds ]
if { [ catch {
array set input $args
} ] } {
eval array set input $args
}
foreach user $::QUEUE(USERS) {
if { ! [ llength $user ] } { continue }
if { [ llength $user ] != 13 } {
set msg "QUEUE(USERS) has been corrupted!"
addLogEntry $msg 2
return -code error "[ myName ]: $msg"
}
;## each user entry contains the three user info item
;## pairs and a list of the last N jobs processed for
;## that user.
set info [ lrange $user 0 end-1 ]
set jobs [ lindex $user end ]
array set q $info
if { [ string equal $q(-name) $input(-name) ] } {
set jobid [ key::increment $::RUNCODE ]
set jobs \
[ linsert $jobs 0 [ list $jobid $input(-email) $now ] ]
set info [ expandOpts info ]
set return_info $info
if { [ llength $return_info ] < 12 } {
set msg "QUEUE(USERS) has been corrupted!\n"
append msg "Please check the user info file for bad\n"
append msg "entries, missing \"'s, etc."
return -code error "[ myName ]: $msg"
}
}
lappend scratch [ concat $info [ list $jobs ] ]
} ;## end of foreach
if { [ llength $return_info ] } {
set ::QUEUE(USERS) $scratch
if { ! [ regexp "\[a-z\]+" $::QUEUE(USERS) ] } {
set msg "The QUEUE(USERS) queue has been\n"
append msg "initialised, but is empty."
return -code error "[ myName ]: $msg"
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $jobid
}
proc mgr::deleteJobidFromUsersQueue { jobid } {
if { [ catch {
set name [ list ]
regexp {\d+} $jobid jobid
set jobid $::RUNCODE$jobid
set done 0
set i 0
set j 0
foreach user $::QUEUE(USERS) {
set jobs [ lindex $user end ]
foreach job $jobs {
if { [ string equal $jobid [ lindex $job 0 ] ] } {
set done 1
break
}
incr i
}
if { $done } { break }
set i 0
incr j
} ;## end of foreach on users
if { $done } {
set jobs [ lreplace $jobs $i $i ]
set user [ lreplace $user end end $jobs ]
set ::QUEUE(USERS) [ lreplace $::QUEUE(USERS) $j $j $user ]
} else {
set msg "jobid: '$jobid' did not match any user!!"
return -code error $msg
}
} err ] } {
addLogEntry $err red
}
}
proc mgr::job2email { jobid } {
if { [ catch {
set email [ list ]
set jobemail [ list ]
regexp {\d+} $jobid jobid
set jobid $::RUNCODE$jobid
set name [ job2name $jobid ]
foreach user $::QUEUE(USERS) {
if { [ string equal $name [ lindex $user 1 ] ] } {
set email [ lindex $user 5 ]
set jobs [ lindex $user end ]
foreach job $jobs {
if { [ lsearch $job $jobid ] == 0 } {
set jobemail [ lindex $job 1 ]
break
}
}
break
}
} ;## end of foreach on users
if { ! [ string length $email ] && \
! [ string length $jobemail ] } {
return -code error "failed to resolve email for '$jobid'"
}
} err ] } {
return -code error "[ myName ]: $err"
}
if { [ string length $jobemail ] } {
return $jobemail
} else {
return $email
}
}
proc mgr::job2name { jobid } {
if { [ catch {
set name [ list ]
regexp {\d+} $jobid jobid
set jobid $::RUNCODE$jobid
foreach user $::QUEUE(USERS) {
set jobs [ join [ lindex $user end ] ]
if { [ lsearch $jobs $jobid ] > -1 } {
set name [ lindex $user 1 ]
break
}
} ;## end of foreach on users
} err ] } {
return -code error "[ myName ]: $err"
}
return $name
}
set url [ mgr::jobStatus $jobid ]Comments:
proc mgr::jobStatus { jobid } {
set data "
<!DOCTYPE LDAS-help>
<!-- created by mgr::JobStatus -->
<title>LIGO LDAS Job Queue Contents</title>
<a name=\"top\">
<p>
<img src=\"help/LDAS.gif\">
<br>\n"
if { [ catch {
set limit [ expr { $::NUMBER_OF_ASSISTANT_MANAGERS * \
($::MANAGER_QUEUE_SIZE_FACTOR + 1) } ]
;## get all the commands running and queued
foreach name [ array names ::QUEUE *,cmd ] {
set job [ lindex [ split $name , ] 0 ]
if { [ info exists ::${job}::cmd ] } {
set id($job) [ set ::${job}::cmd ]
} else {
set id($job) $::QUEUE($name)
}
}
;## get all the user info
set raw(USERS) [ lindex [ queue::dump USERS ] 0 ]
foreach user $raw(USERS) {
foreach { 1 user 2 3 4 5 6 7 8 9 10 11 jobs } $user {
if { [ llength $jobs ] } {
append data "<font color=red><b>$user</b></font><p>\n"
append data "<ol>\n"
foreach job [ lrange $jobs 0 $limit ] {
set job [ lindex $job 0 ]
append data "<li><b>$job</b><ul>\n"
if { [ catch { set cmd [ set id($job) ] } ] } {
append data "<li><tt>finished!</tt>\n"
} else {
set steps [ expr { [ llength $cmd ] / 2 } ]
incr steps -2
if { $steps < 0 } { set steps 0 }
if { [ catch {
set api [ set ::${job}::api ]
set starttime [ set ::${job}::starttime(job) ]
} err ] } {
set api {}
set starttime {}
}
set now [ clock clicks -milliseconds ]
if { [ catch {
set dt [ expr { ($now - $starttime) / 1000.0 } ]
} err ] } {
set dt 0
}
if { ! [ string length $api ] } {
set api "queue in manager"
}
append data "<li><tt><font color=red>$steps</font> steps remaining<br>"
append data "after <font color=red>$dt</font> seconds runtime<br>"
append data "(currently in <font color=red>$api</font> API)</tt>\n"
}
append data "</ul>\n"
}
append data "</ol>\n"
}
}
}
} err ] } {
return -code error "[ myName]: $err"
}
append data "
<center>
<h4>
<a href=\"#top\">Back to Top</a>
</h4>
</center>\n"
return [ publicFile $jobid status_report.html $data ]
}
proc mgr::activeJobStatusSummary { { format list } } {
if { [ catch {
set data [ list ]
set current [ list ]
set waiting [ list ]
set retval [ list ]
set running [ namespace children :: ${::RUNCODE}* ]
set running [ lsort $running ]
set queued $::assistant_queue
set queuelen [ llength $queued ]
set queuelen_msg [ list ]
set Nasst $::NUMBER_OF_ASSISTANT_MANAGERS
set now [ clock clicks -milliseconds ]
set time [ clock format [ clock seconds ] -format "%x-%X %Z" ]
set dom $::LDAS_SYSTEM
;## RUNNING
foreach job $running {
set user [ mgr::job2name $job ]
set api [ set ${job}::api ]
set command [ set ${job}::usrcmdname ]
set so_rx {-dynlib[\s\{]+([^\s\}\{]+)}
set fullcommand [ set ${job}::usrcommand ]
if { [ regexp -nocase -- $so_rx $fullcommand -> soname ] } {
set soname [ file tail $soname ]
set command "${command}($soname)"
}
set start [ set ${job}::starttime(job) ]
;## possibly less than a millisecond has elapsed and
;## we are at in-between states in the array
if { [ catch {
set apit0 [ set ${job}::starttime($api) ]
} err ] } {
set apit0 $now
}
set job [ string trim $job : ]
set runtime [ expr { ($now - $start) / 1000.0 } ]
set runtime [ format "%.2f" $runtime ]
set apitime [ expr { ($now - $apit0) / 1000.0 } ]
set apitime [ format "%.2f" $apitime ]
set data [ list $job $command $api $user $runtime:$apitime ]
lappend current $data
}
;## QUEUED
if { $queuelen > $Nasst } {
set queued [ lrange $queued 0 [ expr { $Nasst - 1 } ] ]
set queuelen_msg "$queuelen total jobs currently in queue"
}
set ::assistant_queue_stats(wait) 0.00
foreach job $queued {
foreach { job command cmdtext } $job { break }
set job [ string trim $job : ]
set user [ mgr::job2name $job ]
if { [ catch {
;## we don't use the throttle mark anymore...
#set waited [ ::__t::mark $job ]
set waited \
[ expr { ($now - $::queuetime($job)) / 1000.0 } ]
set waited [ format "%.2f" $waited ]
} err ] } {
set waited 0.00
}
if { $waited > $::assistant_queue_stats(wait) } {
set ::assistant_queue_stats(wait) $waited
}
set data [ list $job $command queued $user $waited ]
lappend waiting $data
}
;## text format header
set text "====================================================================\n"
append text "Summary of running jobs at $dom at $time:\n\n"
append text " job_i.d. command current_API username RUN_TIME(s)\n"
append text "====================================================================\n"
set text_fs "%-18s %-10s %-10s %-10s %-10s\n"
;## html format header
set head "<html>
<head>
<!-- summary report of running job status -->
<title>Summary of Running Job Status at $dom at $time</title>
</head>
<BODY BGCOLOR='#DDDDDD' TEXT='#000000'>
<h2>Summary of Running Job Status at $dom at $time</h2>
<p>
<ul>\n"
regsub -all {[ ]+} $head { } head
;## html format footer
set foot "</ol></body></html>"
switch -exact [ string tolower $format ] {
text {
set retval $text
foreach item [ concat $current $waiting ] {
foreach [ list 1 2 3 4 5 ] $item { break }
append retval [ format $text_fs $1 $2 $3 $4 $5 ]
}
}
html {
set body "<font color=red>\n"
append body "<b>Running:</b><p>\n</ul>\n"
append body "<font color=black>\ \ \ \ "
append body "guide:\ \ </font>\n"
append body "<font color=brown>jobid\ \ </font>\n"
append body "<font color=green>command\ \ </font>\n"
append body "<font color=brown>current_API\ \ </font>\n"
append body "<font color=green>user\ </font>\n"
append body "<font color=brown>total_runtime\ </font>\n"
append body "<font color=red>current_API_runtime\ </font>\n"
append body "<font color=green>\n<ul>\n"
foreach item $current {
foreach [ list 1 2 3 4 5 ] $item { break }
set 1 "<font color=brown>${1}</font>"
set 3 "<font color=brown>${3}</font>"
set 5s [ split $5 : ]
set 5 "<font color=brown>[ lindex $5s 0 ]</font>"
append 5 "\ "
append 5 "<font color=red>[ lindex $5s 1 ]</font>"
append body "<li>$1 $2 $3 $4 $5<br>\n"
}
append body "</ul>\n"
append body "</ol>\n"
append body "<font color=red>\n"
append body "<b>Queued:</b><p>\n"
append body "<font color=black>\n"
foreach item $waiting {
append body "<li>${item}<br>\n"
}
;## truncation message if queue is large
if { [ string length $queuelen_msg ] } {
append body "<font color=red>\n"
append body "<b>...<p>$queuelen_msg</b>\n"
append body "<font color=black>\n"
}
append body "</font>\n"
set retval $head$body$foot
}
list {
set retval [ list $current $waiting $queuelen_msg ]
}
default {
set msg "invalid format: '$format'"
return -code error $msg
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $retval
}
proc mgr::activeJobStatusLockReport { args } {
if { [ catch {
set msg [ list ]
if { [ llength [ info vars ::__bootstraplock_* ] ] > 1 } {
set apis [ list ]
foreach item [ info vars ::__bootstraplock_* ] {
lappend apis [ lindex [ split $item _ ] end ]
}
regsub {\s*manager\s*} $apis { } apis
set msg "JOB PROCESSING SUSPENDED WHILE $apis API(s) "
append msg "STABILIZE!"
}
if { [ info exists ::STOP_PROCESSING_JOBS_NO_DISK_SPACE ] && \
$::STOP_PROCESSING_JOBS_NO_DISK_SPACE == 1 } {
set msg "JOB PROCESSING HALTED. NO DISK SPACE!"
}
if { [ string length $msg ] } {
set msg "<center><h2><font color=red>$msg</font></h2></center>"
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $msg
}
proc mgr::writeActiveJobStatus { } {
if { [ catch {
set time [ clock format [ clock seconds ] -format "%x-%X %Z" ]
set dom $::LDAS_SYSTEM
set refresh $::RUNNING_JOB_STAT_UPDATE_RATE
;## text format header
set text "====================================================================\n"
append text "Summary of running jobs at $dom at $time:\n\n"
append text " job_i.d. command current_API username RUN_TIME(s)\n"
append text "====================================================================\n"
set text_fs "%-18s %-10s %-10s %-10s %-10s\n"
;## html format header
set head "<html>
<head>
<!-- summary report of running job status -->
<title>Summary of Running Job Status at $dom at $time</title>
</head>
<BODY BGCOLOR='#DDDDDD' TEXT='#000000'>
<center>
<h2>Summary of Running Job Status at $dom at $time</h2>
<b><i>(The content of this page is updated every $refresh seconds)</i></b>
</center>
<p>
<ul>\n"
regsub -all {[ ]+} $head { } head
;## BIG RED WARNING IF JOB PROCESSING IS CURRENTLY SUSPENDED!
set stopped [ mgr::activeJobStatusLockReport ]
if { [ string length $stopped ] } {
append head "${stopped}<p>\n\n"
}
;## html format footer
set foot "</ol></body></html>"
set textfile ${::PUBDIR}/activejobs.txt
set tclfile ${::PUBDIR}/activejobs.tcl
set htmlfile ${::PUBDIR}/activejobs.html
set data [ mgr::activeJobStatusSummary list ]
;## tcl list version
set fid [ open $tclfile w ]
puts $fid $data
::close $fid
foreach [ list running waiting queuelen_msg ] $data { break }
;## formatted text version
foreach item [ concat $running $waiting ] {
foreach [ list 1 2 3 4 5 ] $item { break }
append text [ format $text_fs $1 $2 $3 $4 $5 ]
}
set fid [ open $textfile w ]
puts $fid $text
::close $fid
;## running jobs in html version
set body "<table border=1 columns=6 width=90%>\n"
append body "<tr><th colspan=6 align=left><font color=red>\n"
append body "<b>Running:</b></tr>\n"
append body "<tr>\n<font color=brown><th>jobid</font>\n"
append body "<font color=green><th>command</font>\n"
append body "<font color=brown><th>current API</font>\n"
append body "<font color=green><th>user</font>\n"
append body "<font color=brown><th>total runtime</font>\n"
append body "<font color=red><th>current API runtime</font>\n"
foreach item $running {
foreach [ list 1 2 3 4 5 ] $item { break }
set 1 [ mgr::jobidToHref $1 ]
set 1 "<td>${1}"
set 2 "<td>$2"
set 3 "<td><font color=brown>${3}</font>"
set 4 "<td>$4"
set 5s [ split $5 : ]
set 5 "<td><font color=brown>[ lindex $5s 0 ]</font>"
append 5 "\ "
append 5 "<td><font color=red>[ lindex $5s 1 ]</font>"
append body "</tr><tr>$1 $2 $3 $4 $5<br>\n"
}
;## queued jobs in html version
append body "</table>\n"
append body "<table border=0 columns=6 width=90%>\n"
append body "<tr><th colspan=6 align=left>\n"
append body "<font color=red>\n"
append body "<br>Queued:</font></th></tr>\n"
set i 0
foreach item $waiting {
set href [ mgr::jobidToHref [ lindex $item 0 ] ]
set item [ lrange $item 1 end ]
regsub -all {\s+} $item "<td>" item
set item "${href}<td>$item"
append body "<tr><td>[ incr i ].<td>${item}</tr>\n"
}
append body "</table>\n"
;## truncation message if queue is large
if { [ string length $queuelen_msg ] } {
append body "<font color=red>\n"
append body "<b>...<p>$queuelen_msg</b>\n"
}
set fid [ open $htmlfile w ]
puts $fid $head$body$foot
::close $fid
} err ] } {
catch { ::close $fid }
addLogEntry $err red
}
}
proc mgr::jobidToHref { jobid } {
if { [ catch {
set jobdir [ jobDirectory $jobid ]
set base $::HTTPURL
regsub $::PUBDIR $base {} base
set full $base${jobdir}/user_command.html
set href "<a href=$full>$jobid</a>"
} err ] } {
return -code error "[ myName ]: $err"
}
return $href
}
proc mgr::dropPrettyCommand { jobid cmd } {
if { [ catch {
;## make a prettified verison of the command for dropping
set tmp [ mgr::prettifyCmdHtml $jobid $cmd ]
;## use side-effect of jobDirectory to create one
set jobdir [ jobDirectory $jobid ]
;## drop a nice html formatted user command
set fid [ open $jobdir/user_command.html w 0644 ]
puts $fid $tmp
close $fid
} err ] } {
catch { ::close $fid }
addLogEntry $err orange
}
}
proc mgr::prettifyCmdHtml { jobid cmd } {
if { [ catch {
set user [ mgr::job2name $jobid ]
set start [ clock format [ clock seconds ] ]
set head "<html><h1>$jobid user: $user</h1>\n"
append head "<h2>started at: $start</h2>\n"
append head "<p><p><tt>"
regsub -all -- { -} $cmd "<p>\n -" cmd
regsub -all -- {\|} $cmd ";<br>\n" cmd
regsub -all -- { (-\S+)} $cmd \
{<font color=red> \1</font>} cmd
regsub -all -- {([\}\{]+)} $cmd \
{<font color=blue>\1</font>} cmd
regsub -all -- {(file:/[^\s\n;]+)} $cmd \
{<font color=purple>\1</font>} cmd
set cmd "$head\n$cmd\n</html>"
} err ] } {
return -code error "[ myName ]: $err"
}
return $cmd
}
proc mgr::getCommandInputText { args } {
if { [ catch {
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::jobTimingReport { jobid } {
regexp {\d+} $jobid job
set jobid $::RUNCODE$job
set errorapi [ list ]
set runtimedt 0
set report "=========================================\n"
append report "'LDAS API' 'CLOCK TIME(seconds)'\n"
append report "=========================================\n"
if { [ catch {
;## if job ended with an error
if { [ info exists ::${jobid}::errorapi ] } {
set errorapi [ set ::${jobid}::errorapi ]
set status fail
} else {
set status pass
}
set command [ set ::${jobid}::usrcmdname ]
set so_rx {-dynlib[\s\{]+([^\s\}\{]+)}
set fullcommand [ set ::${jobid}::usrcommand ]
if { [ regexp -nocase -- $so_rx $fullcommand -> soname ] } {
set soname [ file tail $soname ]
set command "${command}($soname)"
}
;## time job spent waiting in manager queue
set queuetime [ set ::${jobid}::queuetime ]
mgr::oneLineJobTimingReport $jobid queue $queuetime cm $command
set line [ format "%-27s %10.2f" "Wait Time(queue):" $queuetime ]
append report "$line\n"
set runtimedt $queuetime
set jobstart [ set ::${jobid}::starttime(job) ]
foreach api [ array names ::${jobid}::deltat ] {
set dt [ set ::${jobid}::deltat($api) ]
set runtimedt [ expr { $runtimedt + $dt } ]
set dt [ expr { $dt / 1000.0 } ]
mgr::oneLineJobTimingReport $jobid $api $dt
mgr::perApiRunningStats $api runtime $dt
set line [ format "%-27s %10.2f" ${api}API: $dt ]
if { [ string equal $api $errorapi ] } {
append line " (ERROR)"
mgr::perApiRunningStats $api failed 1
} else {
mgr::perApiRunningStats $api succeeded 1
}
append report "$line\n"
}
set runtimedt [ expr { $runtimedt / 1000.0 } ]
append report "-----------------------------------------\n"
set line [ format "%-27s %10.2f" "managerAPI(total):" $runtimedt ]
append report "$line\n"
append report "=========================================\n"
append report "\n"
append report "$::LDAS_SYSTEM running LDAS version "
append report "$::LDAS_VERSION\n"
} err ] } {
addLogEntry $err red
set report [ list ]
}
catch { mgr::runningJobStatsFile $jobid $runtimedt $status }
return $report
}
Column: Content:Comments:
0 jobid 1 gps start time 2 username 3 user command name 4 pass or fail 5 seconds in queue 6 seconds in managerAPI 7 seconds in diskcacheAPI 8 seconds in frameAPI 9 seconds in metadataAPI 10 seconds in datacondAPI 11 seconds in mpiAPI 12 seconds in wrapperAPI 13 seconds in eventmonAPI 14 seconds in lightweightAPI 15 total seconds in all APIs 16 total of column 5 plus 15 (all time)
17 current queue size 18 seconds in cntlmonAPI
proc mgr::oneLineJobTimingReport { jobid args } {
if { [ catch {
if { [ llength $args ] == 1 } {
set args [ lindex $args 0 ]
}
if { ! [ info exists ::onelinereport($jobid) ] } {
set ::onelinereport($jobid) \
[ list $jobid 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ]
}
;## list of two letter keys corresponding to columns in order
set keys \
[ list id st nm cm pf qu ma di fr me da mp wr ev li tt cn ]
foreach [ list key value ] $args {
set key [ string tolower [ string range $key 0 1 ] ]
set i [ lsearch $keys $key ]
if { $i > -1 } {
set ::onelinereport($jobid) \
[ lreplace $::onelinereport($jobid) $i $i $value ]
} elseif { [ string equal xx $key ] } {
;## calculate and add column 16, the total time for
;## the job including queue time
set QT [ lindex $::onelinereport($jobid) 5 ]
set RT [ lindex $::onelinereport($jobid) 15 ]
set TT [ expr { $QT + $RT } ]
#lappend ::onelinereport($jobid) $TT
set ::onelinereport($jobid) [ linsert $::onelinereport($jobid) end-2 $TT ]
;## get the current job queue size
set QS [ llength $::assistant_queue ]
# lappend ::onelinereport($jobid) $QS
set ::onelinereport($jobid) [ linsert $::onelinereport($jobid) end-1 $QS ]
;## manage the file...
set now [ gpsTime ]
set active $::LDASLOG/job_stats.log
set archive $::LDASARC/jobstats_archive/job_stats.log.$now
set dir $::LDASARC/jobstats_archive
if { ! [ file isdirectory $dir ] } {
file mkdir $dir
file attributes $dir -permissions 0755
gifBalls $dir
}
set mb [ expr { 1024 * 1024 } ]
if { [ file exists $active ] } {
if { [ file size $active ] >= $mb } {
file rename $active $archive
}
}
set fid [ ::open $active a+ 0644 ]
::puts $fid $::onelinereport($jobid)
unset ::onelinereport($jobid)
::close $fid
} else {
set msg "bad key received: '$key' (value: '$value')"
addLogEntry $msg email
}
}
} err ] } {
catch { ::close $fid }
catch { ::unset ::onelinereport($jobid) }
addLogEntry $err email
}
}
proc mgr::perApiRunningStats { api { item "" } { value "" } } {
if { [ catch {
set filename perAPIstats
if { [ string equal read $api ] } {
if { [ file exists $filename ] } {
set fid [ open $filename r ]
set data [ read $fid [ file size $filename ] ]
::close $fid
array set ::mgr::apirunstats $data
set msg "read $filename into ::mgr::apirunstats array"
addLogEntry $msg blue
}
return {}
}
if { [ string equal write $api ] && \
[ array exists ::mgr::apirunstats ] } {
set fid [ open $filename w ]
puts $fid [ array get ::mgr::apirunstats ]
::close $fid
addLogEntry "wrote $filename" blue
return {}
}
if { ! [ string length $item ] } {
set msg "no value given for parameter 'item'"
return -code error $msg
}
if { ! [ string length $value ] } {
set msg "no value given for parameter 'value'"
return -code error $msg
}
set keys [ list runtime succeeded failed ]
foreach key $keys {
if { ! [ info exists ::mgr::apirunstats($api,$key) ] } {
set ::mgr::apirunstats($api,$key) 0
}
}
if { ! [ info exists ::mgr::apirunstats($api,restarted) ] } {
set ::mgr::apirunstats($api,restarted) [ list ]
}
switch -exact $item {
runtime {
set rt $::mgr::apirunstats($api,runtime)
set rt [ expr { $rt + $value } ]
set ::mgr::apirunstats($api,runtime) $rt
}
succeeded {
incr ::mgr::apirunstats($api,succeeded)
}
failed {
incr ::mgr::apirunstats($api,failed)
}
restart {
set gps $value
lappend ::mgr::apirunstats($api,restart) $gps
}
default {
set msg "invalid key: $api --> $item <-- $value"
return -code error $msg
}
} ;## end of switch
} err ] } {
if { [ string length $err ] } {
catch { ::close $fid }
return -code error "[ myName ]: $err"
}
}
}
proc mgr::runStatusSummary { { apis all } { form list } { save 0 } } {
if { [ catch {
if { ! [ array exists ::mgr::apirunstats ] } {
set err "no jobs have run since last system restart"
return -code error $err
}
if { [ string equal all $apis ] } {
set apis [ lsort $::API_LIST ]
}
set dom $::LDAS_SYSTEM
set head "<html>
<head>
<!-- summary report of API run history -->
<title>API Status History at $dom</title>
</head>
<BODY BGCOLOR='#DDDDDD' TEXT='#000000'>
<h2>LDAS API Status History Summary at $dom</h2>
<p>
<ul>\n"
set foot "</ul></body></html>"
regsub -all {[ ]+} $head { } head
set body [ list ]
foreach api $apis {
if { [ string equal manager $api ] } { continue }
;## no exception on failure of a single API to be
;## initialised.
if { [ llength [ array names ::mgr::apirunstats $api,* ] ] } {
set good $::mgr::apirunstats($api,succeeded)
set bad $::mgr::apirunstats($api,failed)
set dt $::mgr::apirunstats($api,runtime)
set restarts $::mgr::apirunstats($api,restart)
} else {
set good 0
set bad 0
set dt 0
set restarts 0
}
set dt [ expr int($dt) ]
set total [ expr { $good + $bad } ]
if { $total } {
set success_rate [ expr { ($good * 100) / ($good + $bad) } ]
set success_rate [ format %.2f $success_rate ]
} else {
set success_rate 100
}
set gps [ lindex $restarts end ]
set time [ clock format [ utcTime $gps ] -format "%x-%X %Z" ]
set last_rs [ lindex $restarts end ]
set last_rs [ expr { [ gpsTime ] - $last_rs } ]
set restarts [ llength $restarts ]
if { [ string equal list $form ] } {
lappend results \
[ list $api $total $success_rate $dt $last_rs ]
} else {
set item "<li><b>${api}API started at
<font color=brown>$time
(GPS: $gps)</font></b><ul>
<li>Total jobs visiting this API:
<font color=red>$total</font>
<li>Job completion success rate:
<font color=red>${success_rate}%</font>
<li>Total processing time for this API:
<font color=red>$dt s</font>
<li>seconds since this API was restarted:
<font color=red>$last_rs</font>
<li>number of times restarted this run:
<font color=red>$restarts</font>
</ul>\n"
regsub -all {[ ]+} $item { } item
append body $item
}
proc mgr::runningJobStatsFile { jobid dt status } {
if { [ catch {
set now [ gpsTime ]
set start [ set ::${jobid}::starttime(gps) ]
set user [ mgr::job2name $jobid ]
set dt [ format "%.2f" $dt ]
;## and complete the OTHER report
mgr::oneLineJobTimingReport \
$jobid st $start nm $user pf $status tt $dt xx 0
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::zombies { api } {
if { [ catch {
set error "$api API fault: UNREACHABLE!"
set children [ namespace children :: ${::RUNCODE}* ]
foreach child $children {
set jobid [ string trim $child : ]
if { [ info exists ${child}::api ] } {
if { [ string equal $api [ set ${child}::api ] ] } {
set subject "$api API error!"
if { [ llength [ namespace children :: $child ] ] } {
set ${child}::errorapi $api
${child}::predelete $subject $error
if { [ llength [ namespace children :: $child ] ] } {
${child}::seq
}
}
}
}
}
if { [ string equal frame $api ] } {
mgr::zombies diskcache
}
} err ] } {
addLogEntry $err red
}
}
proc mgr::ping { { apilist "" } } {
if { [ catch {
if { ! [ string length $apilist ] } {
set apilist [ lsort $::API_LIST ]
}
set interval $::LOGINTERVAL
set me [ myName ]
;## log pings if error is returned OR if ::LOG_PINGS
;## is defined and set to '1'.
if { [ info exists ::LOG_PINGS ] && \
[ string equal 1 $::LOG_PINGS ] } {
} else {
set ::LOG_PINGS 0
}
foreach api $apilist {
incr interval -1
if { [ string equal $api manager ] } {
continue
}
set code "if { ! \[ info exists ::__bootstraplock_$api \] } {
set lvl 0
set msg \[ pingAPI $api \]
if { \[ regexp unreachable \$msg \] } {
set lvl 2
}
proc mgr::killPing { { apilist "" } } {
if { ! [ string length $apilist ] } {
set apilist [ lsort $::API_LIST ]
}
foreach api $apilist {
if { [ string equal $api manager ] } {
continue
}
if { [ catch {
set ::bg::jobs(${api}ping,run) 0
addLogEntry "No longer pinging $api" blue
} err ] } {
addLogEntry $err
}
}
}
proc mgr::manageLogs { } {
if { [ catch {
;## make sure browsers can browse archived logs!
if { ! [ file exists $::LDASARC/.htaccess ] } {
set fid [ open $::LDASARC/.htaccess w 0644 ]
puts $fid "DefaultType text/html"
::close $fid
}
if { ! [ file exists $::LDASLOG/.htaccess ] } {
set fid [ open $::LDASLOG/.htaccess w 0644 ]
puts $fid "DefaultType text/html"
::close $fid
}
foreach api $::API_LIST {
set log LDAS${api}.log.html
set log $::LDASLOG/$log
if { [ file exists $log ] } {
if { [ file size $log ] > 100000 } {
}
}
}
} err ] } {
catch { ::close $fid }
return -code error "[ myName ]: $err"
}
}
mgr::reply $cid $msgComments:
proc mgr::reply { { cid "" } { args "" } } {
set cid [ string trim $cid ]
if { [ regexp {_globus_xio_handle_t$} $cid ] } {
return [ mgrGlobus::reply $cid $args ]
}
if { ! [ regexp {sock[0-9]+$} $cid ] } {
set msg "mgr::reply:\n"
append msg "received: '$cid $args'\n"
append msg "First argument must be channel i.d."
addLogEntry $msg 1
}
if { [ string length $args ] > 4 } {
if { [ catch {
set prevblock [ fconfigure $cid -blocking ]
fconfigure $cid -blocking 0
puts $cid $args
fconfigure $cid -blocking $prevblock
} err ] } {
catch { ::close $cid }
set msg "mgr::reply:\n"
lappend msg "$err\n"
lappend msg "Lost contact with my caller! $cid closed"
addLogEntry $msg 2
}
}
return {}
}
namespace eval manager {}
proc manager::reply { cid args } {
eval mgr::reply $cid $args
}
mgr::sHuTdOwNComments:
proc mgr::sHuTdOwN { } {
set ::jobid SHUTDOWN
set ::RESTART_ON_MEMFLAG 0
set ::SYSTEM_SHUTDOWN_IN_PROGRESS 1
set now [ gpsTime ]
;## disable boostraps
proc ::mgr::bootstrapAPI { args } {}
proc ::createAssistant { args } {}
;## don't take any new commands!!
if { [ catch {
closeListenSock operator
} err ] } {
addLogEntry $err
}
catch { mgr::chopQueueTo 0 }
;## capture state at shutdown for analysis if necessary
set fname1 [ file join $::LDASLOG APIstatus.html ]
set fname2 [ file join $::PUBDIR activejobs.html ]
set arcdir $::LDASARC/shutdown_status
set archive1 ${arcdir}/APIstatus-${now}.html
set archive2 ${arcdir}/activejobs-${now}.html
catch {
if { ! [ file isdirectory $arcdir ] } {
file mkdir $arcdir
file attributes $arcdir -permissions 0755
gifBalls $arcdir
}
file copy -force $fname1 $archive1
file copy -force $fname2 $archive2
}
catch { mgr::killPing } err0
foreach api $::API_LIST {
if { [ string equal $api manager ] } {
continue
}
catch { mgr::shutdownAPI $api "nocore" }
mgr::bootstrapPause $api
}
catch { mgr::perApiRunningStats write } err2
#catch { queue::save } err3
catch { key::increment shutdown } err4
;## force shutdown to be quick
set ::API_SOCKET_TIMEOUT_MS 1000
catch { mgr::apiStatusPage } err5
if { [ catch {
closeListenSock emergency
} err ] } {
addLogEntry $err
}
;## close globus socket
foreach type [ list User Host ] {
if { [ info exist ::mgrGlobusChannel::Globus${type}ListenSocket ] } {
set globus_sock [ set ::mgrGlobusChannel::Globus${type}ListenSocket ]
addLogEntry "close tclglobus $type channel $globus_sock" purple
::close $globus_sock
}
}
closeLog
log::lock
kill -9 [ pid ]
}
proc mgr::validateopts { cmd user } {
if { [ catch {
set procname [ list ]
set usropts [ list ]
set options [ list ]
set metalist [ list ]
set reason [ list ]
set files [ glob -nocomplain ${::LDASMACROS}/meta/*.meta ]
;## if the /ldas directory gets deleted...
;## this has happened!
if { ! [ llength $files ] } {
set reason ldas_system_failed_sanity_check
set subject "$::LDAS_SYSTEM ${::API}API CRITICAL error!"
set msg "the $::LDAS_SYSTEM manager API did not "
append msg "find ANY macro files under the directory "
append msg "${::LDASMACROS}/meta.\nthis is a critical "
append msg "LDAS system error indicating a condition "
append msg "which must be remedied immediately!!\n"
append msg "ALL LDAS user commands will fail until "
append msg "the filesystem is restored."
if { ! [ info exists ::meta_macros_are_missing ] || \
$::meta_macros_are_missing == 0 } {
set ::meta_macros_are_missing 1
after 1967666 set ::meta_macros_are_missing 0
addLogEntry "Subject: ${subject}; Body: $msg" mail
}
return -code error $msg
}
foreach file $files {
lappend metalist [ file tail $file ]
}
regexp $::cmd_rx $cmd -> procname usropts
if { [ catch {
mgr::duplicateOpts $usropts
} err ] } {
set reason duplicate_option
return -code error $err
}
if { [ catch {
mgr::commandSanityChecks $usropts
} err ] } {
set reason invalid_command
return -code error $err
}
if { [ lsearch -exact $metalist ${procname}.meta ] > -1 } {
set fname [ file join $::LDASMACROS meta ${procname}.meta ]
set fid [ open $fname r ]
set options "[ gets $fid ]\n"
;## PR 3087 abort job if it requires an inactive api
set rest [ read -nonewline $fid ]
;## always do this fresh as the list may change
set apipat [ join $::API_LIST \| ]
set allapipat [ list manager diskcache frame metadata ligolw datacond mpi eventmon cntlmon ]
set allapipat [ join $allapipat \| ]
::close $fid
} else {
set msg "The command entered: '$cmd'\n"
append msg "is not a valid user command or is missing\n"
append msg "required options. Please see\n"
append msg "the documentation for the LDAS system\n"
append msg "user interface for information about valid\n"
append msg "commands and command syntax."
set reason invalid_command
return $msg
}
array set opts [ lindex $options 2 ]
array set useroptions $usropts
foreach item [ array names useroptions ] {
set possible {}
set i 0
foreach opt [ array names opts ] {
regsub -- "--" $item "-" item
if { [ regexp -- $item $opt ] } {
lappend possible $opt
incr i
}
}
if { ! $i } {
set msg "User option: '$item'\n"
append msg "is not a valid option to $procname.\n"
append msg "Please see the documentation for\n"
append msg "$procname before resubmitting your\n"
append msg "request."
set reason invalid_option
return $msg
}
if { $i >= 2 } {
set msg "User option: '$item'\n"
append msg "is ambiguous in the context of $procname\n"
append msg "since it matches all of these options:\n"
append msg "$possible\n"
append msg "Please make sure your option list is\n"
append msg "unambiguous in the context of $procname\n"
append msg "before resubmitting your request."
set reason ambiguous_option
return $msg
}
}
;## check if job requires an inactive API
foreach line [ split $rest \n ] {
if { [ regexp {^\#|^[\s\t\n]*$} $line ] } {
continue
}
if { [ string length $line ] > 8 } {
continue
}
set line [ string trim $line ]
if { [ regexp $apipat $line ] } {
continue
} elseif { [ regexp $allapipat $line ] } {
set reason "inactive_api $line"
error "This job requests an inactive api $line"
}
}
} err ] } {
if { [ string length $reason ] } {
after 0 \
[ list mgr::logRejectedJob $user $procname $reason ]
}
return -code error "[ myName ]: $err"
}
}
proc mgr::localiseInputURLs { jobid options } {
if { [ catch {
set url [ list ]
set localfile [ list ]
set errs [ list ]
set localised [ list ]
set files [ list ]
set out_rx {(return|algorithm|output)}
set url_rx {(?:http|gridftp|ftp|file)://?[^/][^ \,\}\]\)]+}
foreach [ list option value ] $options {
if { [ regexp -nocase -- $out_rx $option ] } {
continue
}
set urls [ regexp -nocase -inline -all $url_rx $value ]
;## PR #2705 tarball handling!
set limit $::MAXIMUM_NUMBER_OF_REMOTE_URLS
if { [ llength $urls ] > $limit } {
if { ! [ info exists ::${jobid}::tarball ] } {
set error "Too many remote ftp and http URL's in "
append error "your request. Please use the -tarball "
append error "option q.v."
return -code error $error
}
}
if { [ llength $urls ] } {
foreach url $urls {
if { [ catch {
set localfile [ getUrl $jobid $url ]
lappend localised $url $localfile
} err ] } {
lappend errs "$err (url: '$url')"
}
}
}
}
if { [ llength $errs ] } {
return -code error $errs
}
;## and actually do the substitutions
foreach [ list url localfile ] $localised {
lappend files $localfile
regsub $url $options $localfile options
}
if { [ llength $localised ] } {
debugPuts "URL's substituted: '$options'"
}
} err ] } {
return -code error "[ myName ]: $err"
}
return [ list $files $options ]
}
proc mgr::delocaliseOutputURLs { jobid URLS } {
if { [ catch {
set errs [ list ]
set remotes [ list ]
foreach [ list file url ] [ join $URLS ] {
if { [ catch {
lappend remotes [ putUrl $jobid $file $url ]
} err ] } {
lappend errs "$err (file: '$file' url: '$url')"
}
}
if { [ llength $errs ] } {
return -code error $errs
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $remotes
}
proc mgr::duplicateOpts { usropts } {
if { [ catch {
set dupes [ list ]
foreach { opt val } $usropts {
if { [ info exists opts($opt) ] } {
lappend dupes [ list $opt $opts($opt) $val ]
set opts($opt) [ join $opts($opt) $val ]
} else {
set opts($opt) $val
}
}
if { [ llength $dupes ] } {
return -code error $dupes
}
} err ] } {
set msg "[ myName ]: option(s) duplicated: $err"
return -code error $msg
}
}
proc mgr::apiStatusPage { { api all } } {
if { [ catch {
set body [ list ]
set time [ gpsTime ]
set t_utc [ utcTime $time ]
set t_local [ clock format $t_utc -format "%x-%X %Z" ]
set t_gmt [ clock format $t_utc -format "%x-%X %Z" -gmt 1 ]
set caller [ uplevel myName ]
set dom $::LDAS_SYSTEM
set head "<html><head>\n"
append head "<!-- reload this page every 5 minutes -->\n"
append head "<META HTTP-EQUIV=\"REFRESH\" CONTENT=300>\n"
append head "<title>LDAS API Status Monitor</title>\n"
append head "</head>\n"
append head "<BODY BGCOLOR='#DDDDDD' TEXT='#000000'>\n"
append head "<h2>LDAS API Status at $dom</h2>\n"
append head "<h3><font color='red'>$time</font>\n"
append head "<font color='green'>$t_local</font>\n"
append head "<font color='brown'>$t_gmt</font>\n"
append head "</h3>\n"
foreach api $::API_LIST {
if { [ catch {
set data [ mgr::apiStatus $api ]
foreach [ list properties channels state color ] \
$data { break }
} err ] } {
set color red
;## at shutdown, connection may succeed before the api
;## has completely expired.
if { [ regexp {broken pipe} $err ] } {
set state "\$host is up, but $api API is not running\n"
} else {
set state "\$host unreachable\n"
}
}
foreach { name host port pid pcpu pmem vsz rss uptime } \
$properties { break }
foreach { socks files threads events objects total } \
$channels { break }
;## short circuit on error getting proc info
if { ! [ info exists port ] } {
set color red
set state "runtime data for $api API on \$host "
append state "temporarily unavailable"
}
set gif "<img src=\"ball_$color.gif\">\n"
if { [ string equal $color red ] } {
if { ! [ string equal ok $state ] } {
set host [ set ::[ string toupper $api]_API_HOST ]
set state [ subst -nocommands $state ]
append body $gif
append body "<b><a href=LDAS$api.log.html>$api API</a>\n"
append body $state
append body "</b><p>\n"
}
} else {
;## if the operator ports for API's are undefined at
;## the manager's startup - set the baseport
;## in the state array to a value reflecting that
;## state so that it cannot be mistaken for a valid
;## port assignment.
if { [ regexp {^\d+$} $port ] } {
set port ::BASEPORT+[ expr $port % $::BASEPORT ]
} else {
set port ::BASEPORT+???
}
append body $gif
append body "<b><a href=LDAS$api.log.html>$api API</a>\n"
append body "is running on <i>$host</i> port $port</b>\n"
append body "<ul><tt>\n"
append body "<li><font color='red'>uptime:\n"
append body "<b>$uptime</b></font>\n"
append body "<li>maximum virtual memory allocated by this API:\n"
append body "<font color='red'>$vsz Kb</font>\n"
append body "<ul>\n"
append body "(current resident set size: $rss Kb)\n"
;## if we have a previous vsz value, calculate the
;## integrated rate of change of memory usage.
if { [ info exists ::mgr::${api}_vsz ] } {
set nowmem [ set ::mgr::${api}_vsz ]
if { [ regexp {^[\.\d]+$} $nowmem ] } {
if { [ catch {
set dmem [ expr { $vsz - $nowmem } ]
} err ] } {
# weird bug occurred!!
set dmem 0
}
set dt [ expr { double($::LOGINTERVAL) } ]
set dmdt [ format "%3.2f" [ expr { $dmem / $dt } ] ]
append body "<br>\n"
append body "(heap size change rate: $dmdt Kb/sec.)\n"
}
}
set ::mgr::${api}_vsz $vsz
append body "</ul>\n"
;## character 37 is the percent symbol
append body "<li>current cpu usage ${pcpu}%\n"
append body "<ul>\n"
append body "<li>number of open sockets: $socks\n"
append body "<li>number of open files: $files\n"
append body "<li>number of running threads: $threads\n"
append body "<li>number of pending tcl events: $events\n"
append body "<li>number of objects registered: $objects\n"
append body "<li>total: $total\n"
append body "</ul>\n"
append body "<a href=../${api}API/${api}.packages>"
append body "Package Version Information</a>\n"
append body "</tt></ul>\n\n"
}
}
;## new activity summary as of 04/09
set url1 $::PUBDIR/activejobs.html
set url2 [ mgr::runStatusSummary all link ]
set gif "<img src=\"ball_purple.gif\">\n"
set tail "<font color=brown>\n"
append tail $gif
append tail "<a href=$url1>\n"
append tail "<b>Running Job Status Summary at $dom</b></a>\n"
append tail "<p>\n"
append tail $gif
append tail "<a href=$url2>\n"
append tail "<b>Per API Activity Summary at $dom</b></a>\n"
if { [ string equal $caller mgr::sHuTdOwN ] } {
append tail "<p><font color='red'><b>\n"
append tail "LDAS system at $dom has been shut down.\n"
append tail "</b><p></font>\n"
}
set end "</body></html>"
set fname [ file join $::LDASLOG APIstatus.html ]
set fid [ open $fname w 0644 ]
puts $fid "$head$body$tail$end"
::close $fid
} err ] } {
if { [ string length $err ] } {
set subject "$::LDAS_SYSTEM ${::API}API error!"
set body "[ myName ] error: $err"
puts stderr $body
addLogEntry "Subject: ${subject}; Body: $body" mail
}
}
}
proc mgr::apiStatus { api } {
if { [ catch {
set retval [ list ]
;## set color and state
set color green
set state ok
set mgrhost $::manager(host)
set host [ set ::${api}(host) ]
set port [ set ::${api}(operator) ]
set port [ lindex $port 0 ]
set status [ mgr::apiSocketStatus $host ]
foreach { flag msg } $status { break }
if { $flag } {
set color red
set state $msg
set retval [ list {} {} $state $color ]
return {}
}
set props [ list {} $host $port {} {} {} {} {} {} ]
set chans [ list {} {} {} {} {} ]
;## set props
if { [ catch {
;## don't let the manager announce it is running after
;## shutdown!!
if { [ string equal manager $api ] && \
[ string equal mgr::sHuTdOwN [ uplevel 2 myName ] ] } {
shutdown
}
if { ! [ string equal manager $api ] } {
set sid [ sock::open $api emergency ]
#set sid [ mgr::connect2API $api emergency ]
}
} err ] } {
set color red
set state "$host is up, but $api API is not running"
set retval [ list $props $chans $state $color ]
return {}
} else {
catch { ::close $sid }
}
;## get the asynchornously generated API state file
set filename [ file join $::LDASLOG ${api}.status ]
if { [ file exists $filename ] && [ file size $filename ] } {
foreach [ list props chans ] [ dumpFile $filename ] { break }
}
set props [ linsert $props 1 $host $port ]
set dummy [ list {} {} {} {} {} {} {} {} {} ]
set size [ llength $props ]
eval lappend props [ lrange $dummy $size end ]
set dummy [ list {} {} {} {} {} ]
set size [ llength $chans ]
eval lappend chans [ lrange $dummy $size end ]
foreach { socks files threads events objects } $chans {
break
}
set chans [ llength $socks ]
lappend chans [ llength $files ]
lappend chans [ expr [ llength $threads ] / 3 ]
lappend chans [ llength $events ]
lappend chans [ llength $objects ]
lappend chans [ expr [ join [ split $chans ] + ] ]
set ::api_status_data($api) \
[ list $props $chans $state $color ]
} err ] } {
catch { ::close $sid }
if { ! [ string length $err ] } {
return $retval
}
return -code error "[ myName ]: $err"
}
return [ list $props $chans $state $color ]
}
proc mgr::apiSocketStatus { host } {
if { [ catch {
set now [ clock seconds ]
if { [ info exists ::apisocketstatus($host) ] } {
foreach [ list last status ] \
$::apisocketstatus($host) { break }
if { ($now - $last) > 60 } {
set status [ sock::diagnostic $host ]
set ::apisocketstatus($host) [ list $now $status ]
}
} else {
set status [ sock::diagnostic $host ]
set ::apisocketstatus($host) [ list $now $status ]
}
} err ] } {
return -code error "[ myName ]: $err"
}
return $status
}
proc mgr::removeAPI { api } {
regexp {(.+)API} $api -> api
if { [ catch {
set i [ lsearch $::API_LIST $api ]
if { $i < 0 } {
return -code error "not a registered API: '$api'"
}
set host [ set ::${api}(host) ]
mgr::shutdownAPI $api
mgr::bootstrapPause $api
set ::API_LIST [ lreplace $::API_LIST $i $i ]
unset ::${::LDAS_SYSTEM}($api)
mgr::killPing $api
set ::removedapis($api) [ array get ::$api ]
lappend ::removedapis($api) $i
unset ::$api
foreach _api $::API_LIST {
if { [ string equal $_api manager ] } {
continue
}
set sid [ sock::open $_api emergency ]
fconfigure $sid -blocking off
puts $sid "$::MGRKEY \"set ::API_LIST \[ list $::API_LIST \];
unset ::${::LDAS_SYSTEM}($api)\""
after 10 [ list cmd::receive $sid ${_api}_$sid mgr::logReply ]
}
} err ] } {
return -code error "[ myName ]: $err"
}
set dom $::LDAS_SYSTEM
set subject "$dom ${api}API removed from '$host'"
set body "unregistered '$api' API from '$host' at $dom"
addLogEntry "Subject: ${subject}; Body: $body" email
}
proc mgr::addAPI { api host } {
regexp {(.+)API} $api -> api
if { [ catch {
set i [ lsearch $::API_LIST $api ]
if { $i != -1 } {
return -code error "could not add already registered API: $api"
}
;## for an API which the system has not seen before,
;## ports need to be assigned and all API's need to
;## be made aware of them.
if { [ catch {
if { [ info exists ::removedapis($api) ] } {
set i [ lindex $::removedapis($api) end ]
array set ::$api [ lrange $::removedapis($api) 0 end-1 ]
unset ::removedapis($api)
set ::${api}(host) $host
} else {
set i end
}
if { ! [ array exists ::$api ] } {
set port [ mgr::lastUsedPort ]
foreach service [ list operator emergency data ] {
set ::${api}($service) [ incr port ]
set ::${api}(host) $host
}
}
set dir $::env(RUNDIR)/${api}API
if { ! [ file isdirectory $dir ] } {
set dir [ mgr::makeApiAtHome $api ]
}
set ::API_LIST [ linsert $::API_LIST $i $api ]
set ::${::LDAS_SYSTEM}($api) $host
;## temporarily modify resource file
if { ! [ file exists $dir/LDAS$api.rsc ] } {
file copy -force \
$::LDAS/lib/${api}API/LDAS$api.rsc \
$dir/LDAS$api.rsc \
} else {
file copy -force $dir/LDAS$api.rsc $dir/LDAS$api.tmp
}
set fid [ open $dir/LDAS$api.rsc a+ ]
foreach service [ list operator emergency data ] {
puts $fid "set ::${api}($service) [ set ::${api}($service) ]"
}
puts $fid "set ::API_LIST [ list $::API_LIST ]"
::close $fid
} err ] } {
catch { ::close $fid }
return -code err "problem updating LDAS$api.rsc"
}
foreach _api $::API_LIST {
if { [ regexp "(manager|$api)" $_api ] } {
continue
}
set sid [ sock::open $_api emergency ]
fconfigure $sid -blocking off
puts $sid "$::MGRKEY \"set ::API_LIST [ list $::API_LIST ];
array set ::$api \{[ array get ::$api ]\};
set ::${::LDAS_SYSTEM}($api) $host\""
after 10 [ list cmd::receive $sid ${_api}_$sid mgr::logReply ]
}
addLogEntry "bootstrapAPI $api" purple
mgr::bootstrapAPI $api
;## clean up modified resource file
file delete -force $dir/LDAS$api.rsc
if { [ file exists $dir/LDAS$api.tmp ] } {
file rename -force $dir/LDAS$api.tmp $dir/LDAS$api.rsc
}
} err ] } {
catch { ::close $sid }
return -code error "[ myName ]: $err"
}
set dom $::LDAS_SYSTEM
set subject "$dom ${api}API started on '$host'"
set body "registered new '$api' API on '$host' at $dom"
addLogEntry "Subject: ${subject}; Body: $body" email
}
proc mgr::lastUsedPort { } {
if { [ catch {
set last 0
;## get the current highest port number
foreach api $::API_LIST {
foreach port { operator emergency data } {
if { [ set ::${api}($port) ] > $last } {
set last [ set ::${api}($port) ]
}
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
set last
}
proc mgr::updateUserInfo { name password email fullname phone expires { delete 0 } } {
if { [ catch {
set valid 0
set panic $::QUEUE(USERS)
if { ! $delete } {
;## otherwise expires would be changed to integer seconds
;## via an uplevel call from mgr::validateUserOptions
set temp $expires
mgr::validateUserOptions
set expires $temp
::unset temp
}
if { ! [ regexp {^[01]$} $delete ] } {
set msg "invalid delete flag, must be 0 or 1"
return -code error msg
}
foreach user $::QUEUE(USERS) {
if { ! [ regexp -- "-name\\s+$name\\s+" $user ] } {
lappend tmp $user
} else {
set valid 1
if { [ regexp -- "-password\\s+(\S+)\\s+" $user -> hash ] } {
if { [ string equal $password [ key::md5 $hash ] ] } {
set password $hash
}
}
}
}
if { $valid } {
;## we need to be careful not to expire a user after they
;## have received an extension within the last 86400
;## seconds of their lifetime. PR#1943
if { [ info exists ::user_scheduled_to_expire($name) ] } {
set afterid $::user_scheduled_to_expire($name)
::unset ::user_scheduled_to_expire($name)
after cancel $afterid
}
set ::QUEUE(USERS) $tmp
if { ! $delete } {
mgr::addUser $name $password $email $fullname $phone $expires
} else {
mgr::removeUser $name $password $email $fullname $phone $expires
}
} else {
set msg "user '$name' does not exist, use addUser instead!"
addLogEntry $msg red
return -code error $msg
}
} err ] } {
set ::QUEUE(USERS) $panic
return -code error "[ myName ]: $err"
}
}
proc mgr::updateOrAddUser { args } {
if { [ catch {
set errs [ list ]
if { [ llength $args ] == 1 } {
set args [ lindex $args 0 ]
}
if { [ llength [ lindex $args 0 ] ] == 6 } {
set args [ join $args ]
}
foreach [ list name password email fullname phone expires ] $args {
if { [ catch {
mgr::updateUserInfo $name $password $email $fullname $phone $expires
} err ] } {
set err1 $err
if { [ catch {
mgr::addUser $name $password $email $fullname $phone $expires
} err ] } {
lappend errs $err1 $err
}
}
}
if { [ llength $errs ] } {
return -code error $errs
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::abortJob { jobid1 jobid2 } {
if { [ catch {
set name [ list ]
regexp {\d+} $jobid1 jobid1
regexp {\d+} $jobid2 jobid2
set jobid1 ${::RUNCODE}$jobid1
set jobid2 ${::RUNCODE}$jobid2
set data $::QUEUE(USERS)
foreach user $data {
set jobs [ lindex $user end ]
if { [ regexp -- $jobid1 $jobs ] && \
[ regexp -- $jobid2 $jobs ] } {
set name [ lindex $user 1 ]
if { [ llength [ namespace children :: ::$jobid1 ] ] } {
set msg "$jobid1 aborted at user '$name' request"
set subject "$jobid1 [ set ::${jobid1}::userSubject ] error!"
::${jobid1}::predelete $subject $msg
if { [ llength [ namespace children :: ::$jobid1 ] ] } {
::${jobid1}::seq
}
} else {
set msg "$jobid1 has already been aborted or has finished"
addLogEntry $msg blue
}
break
}
}
if { ! [ string length $name ] } {
set msg "permission denied"
addLogEntry $msg red
}
} err ] } {
addLogEntry $err red
}
return $msg
}
proc mgr::hideDirectories { } {
set errs [ list ]
if { [ catch {
file attributes $::PUBDIR -permissions 0775
if { [ info exists ::MGR_NOHIDE_DIRECTORIES ] } {
foreach { topdir dont_hide } $::MGR_NOHIDE_DIRECTORIES {
set dirs [ glob -nocomplain $topdir/* ]
foreach dir $dirs {
if { [ file isdirectory $dir ] \
&& [ lsearch $dont_hide [ file tail $dir ] ] == -1 } {
set fname [ file join $dir index.html ]
if { ! [ file exists $fname ] } {
if { [ catch {
set fid [ open $fname w 0644 ]
puts $fid "<html>"
::close $fid
debugPuts "wrote $fname"
} err ] } {
lappend errs $err
}
}
if { [ catch {
foreach item [ glob -nocomplain $dir/* ] {
if { [ string equal $item $fname ] } {
continue
}
set perms [ file attributes $item -permissions ]
set world [ string range $perms end end ]
if { $world } {
file attributes $item -permissions o-rwx
debugPuts "attributes of $item modified"
}
}
file attributes $fname -permissions 0444
} err ] } {
lappend errs $err
}
}
}
}
}
if { [ catch {
mgr::forcePerms
} err ] } {
lappend errs $err
}
if { [ llength $errs ] } {
return -code error $errs
}
} err ] } {
set jobid IDLE
addLogEntry $err yellow
}
}
proc mgr::forcePerms { } {
if { [ catch {
;## PR 3047 - diskcache API should push hashfile name
if { [ info exists ::DISKCACHE_HASHFILE_NAME_BINARY ] } {
set diskcache $::DISKCACHE_HASHFILE_NAME_BINARY
} else {
set diskcache .frame.cache
}
if { [ info exists ::DISKCACHE_HASHFILE_NAME_ASCII ] } {
set adiskcache $::DISKCACHE_HASHFILE_NAME_ASCII
} else {
set adiskcache frame_cache_dump
}
set i 0
foreach fname [ glob -nocomplain $::env(RUNDIR)/*.rsc ] {
catch { file attributes $fname -permissions 0640 } err$i
}
set extradirs [ list savedCores tmp test developers ]
foreach dir $extradirs {
if { [ file exists $::env(RUNDIR)/$dir ] } {
catch {
file attributes $::env(RUNDIR)/$dir -permissions 0700
} err[ incr i ]
}
}
foreach fname [ glob -nocomplain $::env(RUNDIR)/* ] {
set filetype [ fileType $fname ]
if { [ lsearch $filetype script ] != -1 || \
[ lsearch $filetype executable ] != -1 } {
catch {
file attributes $fname -permissions 0600
} err[ incr i ]
}
}
catch {
set fid [ open $::env(RUNDIR)/index.html w 0644 ]
puts $fid "<html>"
::close $fid
} err[ incr i ]
foreach api $::API_LIST {
set dir $::env(RUNDIR)/${api}API
catch {
file attributes $dir -permissions 0755
} err[ incr i ]
foreach pattern [ list * .??* ] {
foreach fname [ glob -nocomplain $dir/$pattern ] {
catch {
file attributes $fname -permissions 0640
} err[ incr i ]
}
}
set expected [ list ${api}.packages .htaccess index.html \
queuestats bootschema.lam ]
foreach file $expected {
foreach fname [ glob -nocomplain $dir/$file ] {
catch {
file attributes $fname -permissions 0644
} err[ incr i ]
}
}
if { [ string equal $api diskcache ] } {
;##----------------------------------------------------------
;## Ensure attributes of the frame filename caches
;##----------------------------------------------------------
foreach fname [ glob -nocomplain $dir/$diskcache ] {
catch {
file attributes $fname -permissions 0644
} err[ incr i ]
}
foreach fname [ glob -nocomplain $dir/$adiskcache ] {
catch {
file attributes $fname -permissions 0644
} err[ incr i ]
}
}
}
set errors [ list ]
foreach var [ info vars err* ] {
if { [ string length [ set $var ] ] } {
lappend errors [ set $var ]
}
}
if { [ llength $errors ] } {
set subject "$::LDAS_SYSTEM LDAS SYSTEM file permission error!"
set errors [ join $errors "\n" ]
addLogEntry "Subject: ${subject}; Body: $errors" email
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::addUser { name password email fullname phone expires } {
if { [ catch {
if { [ regexp -- "-name\\s+$name\\s+" $::QUEUE(USERS) ] } {
return -code error "user '$name' already exists!"
}
set caller [ uplevel myName ]
#if { [ string equal emergency_callback $caller ] } {
# mgr::validateUserOptions
#}
set item [ list -name $name -password $password \
-email $email -fullname $fullname \
-phone $phone -expires $expires {} ]
mgr::cryptCheck
;## write new user info into users.queue
if { [ file exists $qname ] } {
if { [ file writable $qname ] } {
set data [ dumpFile $qname ]
if { $crypt } {
set data [ decrypt $data $::MGRKEY ]
}
bak $qname
set fid [ open $qname w ]
foreach user [ split $data "\n" ] {
if { ! [ regexp -- "-name\\s+$name\\s+" $user ] } {
puts $fid $user
}
}
puts $fid $item
::close $fid
if { $crypt } {
cryptFileInPlace $qname $::MGRKEY
}
file attributes $qname -permissions 0600
addLogEntry "new user added to $qname: '$item'"
} else {
addLogEntry "$qname not writable. User '$name' not added."
}
} else {
return -code error "$qname not found!"
}
;## update manager's ::QUEUE(USERS) list in place
lappend ::QUEUE(USERS) $item
addLogEntry "new user added to ::QUEUE(USERS): '$item'"
} err ] } {
catch { ::close $fid }
return -code error "[ myName ]: $err"
}
}
proc mgr::removeUser { name password email fullname phone expires } {
if { [ catch {
set caller [ uplevel myName ]
#if { [ string equal emergency_callback $caller ] } {
# mgr::validateUserOptions
#}
set item [ list \
-name $name \
-password $password \
-email $email \
-fullname $fullname \
-phone $phone \
-expires $expires \
{} ]
mgr::cryptCheck
;## write new user info into users.queue
if { [ file exists $qname ] } {
if { [ file writable $qname ] } {
set data [ dumpFile $qname ]
if { $crypt } {
set data [ decrypt $data $::MGRKEY ]
}
set fid [ open $qname w ]
foreach user [ split $data "\n" ] {
if { ! [ regexp -- "-name\\s+$name\\s+" $user ] } {
puts $fid $user
}
}
::close $fid
if { $crypt } {
cryptFileInPlace $qname $::MGRKEY
}
file attributes $qname -permissions 0600
addLogEntry "user '$name' removed from '$qname'"
} else {
addLogEntry "$qname not writable. User '$name' not removed."
}
} else {
return -code error "$qname not found!"
}
} err ] } {
catch { ::close $fid }
return -code error "[ myName ]: $err"
}
}
proc mgr::validateUserOptions { } {
if { [ catch {
set now [ clock seconds ]
uplevel set now $now
set name [ uplevel set name ]
set password [ uplevel set password ]
set email [ uplevel set email ]
set fullname [ uplevel set fullname ]
set phone [ uplevel set phone ]
set expires [ uplevel set expires ]
if { ! [ string length $name ] || \
! [ string length $password ] || \
! [ string length $email ] || \
! [ string length $fullname ] || \
! [ string length $phone ] || \
! [ string length $expires ] } {
set msg "all fields must be provided when changing "
append msg "user info (name password email full_name"
append msg " phone expiration_date)"
return -code error $msg
}
if { [ regexp {\s+} $name$password$email ] } {
return -code error "embedded spaces forbidden!"
}
if { ! [ regexp {.+@.+\..+} $email ] } {
return -code error "invalid e-mail: '$email'"
}
if { ! [ regexp -nocase {[0-9a-f]{32}} $password ] } {
set password [ key::md5 $password ]
uplevel set password $password
}
if { ! [ regexp {^\d+$} $expires ] } {
set expires [ clock scan $expires ]
uplevel set expires $expires
}
if { $expires <= $now } {
set msg "account expiration given has already passed"
return -code error $msg
}
if { ! [ regexp {\d{3}} $phone ] } {
set msg "invalid phone number: '$phone'"
return -code error $msg
}
if { [ string length $fullname ] < 4 || \
[ llength $fullname ] < 2 } {
set msg "invalid full user name (full name required):"
append msg " '$fullname'"
return -code error $msg
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::cryptCheck { args } {
if { [ catch {
;## name of the file containing the user info
if { [ info exists ::ENCRYPTED_USERS_QUEUE ] } {
if { [ string equal 1 $::ENCRYPTED_USERS_QUEUE ] } {
set qname users.queue.crypt
set crypt 1
} else {
set qname users.queue
set crypt 0
}
} else {
set qname users.queue
set crypt 0
}
if { $crypt && \
[ file exists users.queue ] && \
! [ file exists users.queue.crypt ] } {
file copy -force users.queue users.queue.crypt
cryptFileInPlace users.queue.crypt $::MGRKEY
}
if { ! [ file exist $qname ] } {
file copy \
[ file join $::LDAS lib managerAPI $qname ] .
file attributes $qname -permissions 0600
}
uplevel set qname $qname
uplevel set crypt $crypt
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::killJob { args } {
}
proc kIlLjOb { args } {
}
proc mgr::garbageCollect { jobid } {
if { [ catch {
if { [ info exists ::SYSTEM_SHUTDOWN_IN_PROGRESS ] } {
set worry 0
} else {
set worry 1
}
set bootstraps [ list ]
set errs [ list ]
foreach _api $::API_LIST {
set errmsg ""
if { [ string equal manager $_api ] } { continue }
;## try to empty databucket
if { [ catch {
#set sid [ mgr::connect2API $_api emergency $::API_SOCKET_TIMEOUT_MS ]
set sid [ sock::open $_api emergency ]
fconfigure $sid -blocking off
puts $sid "$::MGRKEY kIlLjOb $jobid"
::close $sid
} err ] } {
addLogEntry $err red
if { $worry } {
lappend bootstraps $_api
}
lappend errs $err
}
}
;## we were having a weird problem with errs being "{}"
if { [ regexp {[a-zA-Z]+} $errs ] } {
return -code error $errs
}
foreach _api $bootstraps {
;## don't do it if it's already locked
if { ! [ info exists ::__bootstraplock_$_api ] } {
addLogEntry "bootstrapAPI $_api" purple
mgr::bootstrapAPI $_api
}
}
} err ] } {
catch { ::close $sid }
return -code error "[ myName ]: $err"
}
}
proc mgr::makeFtpDirectory { jobid dir } {
if { [ catch {
regexp {\d+} $jobid job
set jobid $::RUNCODE$job
set dir [ file join $::PUBDIR $dir ]
if { [ file isdirectory $dir ] } {
return -code error "'$dir' already exists!"
}
if { [ file exists $dir ] } {
return -code error "a FILE named '$dir' exists!"
}
file mkdir $dir
file attributes $dir -permissions 0777
} err ] } {
set err "failed to create directory '$dir': $err"
return -code error "[ myName ]: $err"
}
}
proc mgr::getURLsForJob { jobid { files "" } { i 0 } } {
if { ! [ llength [ namespace children :: $jobid ] ] } {
return {}
}
if { [ catch {
if { ! [ string length [ join $files ] ] } {
set cmd $::QUEUE($jobid,cmd)
;## substitute URL's on the input side with
;## localised filenames visible to all API's
set idx [ lsearch -exact $cmd metaOpts ]
set opts [ lindex $cmd [ incr idx ] ]
foreach [ list files opts ] \
[ mgr::localiseInputURLs $jobid $opts ] { break }
set cmd [ lreplace $cmd $idx $idx $opts ]
set ::QUEUE($jobid,cmd) $cmd
}
set done 1
;## we have seen a number of cases where the file
;## existed but had zero length due to network
;## latency problems.
foreach file $files {
if { [ file exists ${file}.lock ] } {
set done 0
;## PR #2705 handle the -tarball option!!
} elseif { [ regexp {\.tar\.(gz|bz2)$} $file ] } {
set dir [ file dirname $file ]
set unpacked [ unpackTarball $file ]
foreach item $unpacked {
file delete -force ${dir}/${item}.lock
}
}
}
;## wait for up to 1 hour for data to arrive
;## the one second delay code addresses PR# 2415
if { $done == 0 && $i <= 3600 } {
if { $i <= 4 } {
incr i 1
set delay 1000
} else {
incr i 5
set delay 5000
}
after $delay [ list ::mgr::getURLsForJob $jobid $files $i ]
} elseif { $done == 0 && $i > 3600 } {
set msg "timed out waiting for remote files to "
append msg "be transferred. expected '$files' to "
append msg "be available, but they are not all here."
return -code error $msg
} elseif { $done == 1 } {
foreach file $files {
set filetype [ fileType $file ]
;## test for .gwf files that are NOT frames!
if { [ string equal .gwf [ file extension $file ] ] && \
[ lsearch $filetype frame ] < 0 } {
set err "expected file '$file' to be a gravity "
append err "wave frame file, but it is '$filetype'"
lappend errors $err
}
;## test for 404 pages from http retrievals
if { [ regexp {html\s+404} $filetype ] } {
lappend errors [ dumpFile $file ]
}
}
;## if there were 404's, abort
;## and return 404 messages to user
if { [ info exists errors ] } {
return -code error $errors
}
;## ready to re-enter the job processing sequence
after 1 ::${jobid}::seq
}
} err ] } {
set ::${jobid}::errorapi manager
set subject "$jobid error! URL not found."
set msg "user specified URL not found:\n'$err'"
::${jobid}::predelete $subject $msg
if { [ llength [ namespace children :: ::$jobid ] ] } {
::${jobid}::seq
}
}
}
proc mgr::userQuota { cmd } {
if { [ catch {
set reason [ list ]
set user [ list ]
set now [ clock seconds ]
;## parse the important parts from the command
regexp -nocase -- {-name\s+\{?\s*(\S+)} [ lindex $cmd 1 ] -> user
set user [ string trim $user "{} " ]
set cmdname [ lindex [ lindex $cmd 2 ] 0 ]
set rate $::SAME_USER_REQUEST_RATE
set hourly $::SAME_USER_REQUESTS_PER_HOUR
set qf $::MANAGER_QUEUE_SIZE_FACTOR
set queuesize [ llength $::assistant_queue ]
set maxsize [ expr { $::NUMBER_OF_ASSISTANT_MANAGERS * $qf } ]
if { $queuesize >= $maxsize } {
set msg "The $::LDAS_SYSTEM LDAS system has a limited "
append msg "queue size of $maxsize jobs. There are "
append msg "currently $queuesize jobs in the queue. "
append msg "Please retry your job in a few minutes."
set reason queuesize_exceeded
return $msg
}
foreach entry $::QUEUE(USERS) {
set name [ lindex $entry 1 ]
if { ! [ string equal $user $name ] } {
continue
}
set jobs [ lindex $entry end ]
set last_time [ lindex [ join $jobs ] 2 ]
if { ! [ regexp {\d+} $last_time ] } {
break
}
;## calculate the retry dt
set dt [ expr { $now - $last_time } ]
set rdt [ expr { $rate - $dt } ]
if { [ clock seconds ] < ($last_time + $rate) } {
set msg "You must wait at least $rate seconds "
append msg "between LDAS system requests. Try "
append msg "again in $rdt seconds. Request Aborted."
set reason submission_rate
return $msg
}
set old [ lindex [ lindex $jobs end-$hourly ] end ]
if { ! [ regexp {\d+} $old ] } {
break
}
set somany [ expr { $now - $old } ]
if { $now < ($old + 3600) } {
set msg "You have exceeded the hourly per-user "
append msg "job quota of $hourly jobs/hr. You are"
append msg "forbidden from running another job on "
append msg "the $::LDAS_SYSTEM LDAS system for "
append msg "$somany seconds. Request Aborted."
set reason hourly_rate
return $msg
}
}
} err ] } {
if { [ string length $reason ] } {
after 0 \
[ list mgr::logRejectedJob $user $cmdname $reason ]
}
return -code error "[ myName ]: $err"
}
}
proc mgr::logRejectedJob { user command reason } {
if { [ catch {
set time [ gpsTime ]
set active $::LDASLOG/rejected_jobs.log
set archive $::LDASARC/jobstats_archive/rejected_jobs.log.$time
set dir $::LDASARC/jobstats_archive
if { ! [ file isdirectory $dir ] } {
file mkdir $dir
file attributes $dir -permissions 0755
gifBalls $dir
}
set mb [ expr { 1024 * 1024 } ]
if { [ file exists $active ] } {
if { [ file size $active ] >= $mb } {
file rename $active $archive
}
}
set fid [ ::open $active a+ 0644 ]
;## make sure whatever rubbish reaches the cntlmon API
;## is really a list! PR#1944
::puts $fid [ list $time [ list $command ] $user $reason ]
::close $fid
} err ] } {
catch { ::close $fid }
addLogEntry $err red
}
}
proc mgr::detectBlockage { { test 0 } } {
if { [ catch {
set data [ list ]
set sys $::LDAS_SYSTEM
;## force strict boolean evaluation
if { ! [ regexp {^(1|2)$} $test ] } {
set test 0
}
if { [ info exists ::blockage_reported_at ] } {
set last $::blockage_reported_at
} else {
set last 0
}
;## added at Stuart's request so that ALL stuck jobs
;## would unstick even if system was not busy.
if { [ info exists ::ALWAYS_KILL_BLOCKED_JOBS ] } {
set always_kill $::ALWAYS_KILL_BLOCKED_JOBS
if { [ regexp -nocase {^(true|1)$} $always_kill ] } {
set always_kill 1
} else {
set always_kill 0
}
} else {
set always_kill 0
}
;## these will be returned with leading "::"
set running [ namespace children :: ${::RUNCODE}* ]
set running [ lsort $running ]
set now [ clock clicks -milliseconds ]
set N [ llength $running ]
set limit $::NUMBER_OF_ASSISTANT_MANAGERS
if { [ info exists ::MGR_IS_BLOCKED_TIMEOUT ] } {
set timeout $::MGR_IS_BLOCKED_TIMEOUT
} else {
;## default panic timeout is 10 minutes
set timeout 600
}
;## $limit jobs are possible.
;## note that if allways_kill is not set to 1 that
;## no jobs will actually be killed unless the system
;## is running saturated (all assistant managers assigned).
if { $N >= $limit || $test || $always_kill } {
set shortest 100000.0
foreach job $running {
set user [ mgr::job2name $job ]
;## job not actually running right now...
if { ! [ info exists ${job}::api ] } { continue }
set api [ set ${job}::api ]
lappend score($api) $user
set start [ set ${job}::starttime(job) ]
;## possibly less than a millisecond has elapsed and
;## we are at in-between states in the array
if { [ catch {
set apit0 [ set ${job}::starttime($api) ]
} err ] } {
set apit0 $now
}
set job [ string trim $job : ]
set runtime [ expr { ($now - $start) / 1000.0 } ]
set runtime [ format "%.2f" $runtime ]
set apitime [ expr { ($now - $apit0) / 1000.0 } ]
set apitime [ format "%.2f" $apitime ]
if { $apitime < $shortest } { set shortest $apitime }
set command [ set ${job}::usrcmdname ]
lappend data [ list $job $command $api $user $apitime ]
;## *** real kill occurs here ***
mgr::abortBlockedJob $job $api $apitime
}
;## do not report more often than once per hour
set now [ clock seconds ]
if { $shortest >= $timeout && $N > 1 || $test } {
set apis [ array names score ]
;## all currently running jobs are stuck in one API
if { [ llength $apis ] == 1 || $test == 1 } {
set api [ lindex $apis 0 ]
set n [ llength $score($api) ]
set subject "${sys}: $n job(s) stalled in ${api}API"
set body [ mgr::activeJobStatusSummary text ]
set msg "Subject: ${subject}; Body: $body"
if { $last < ($now - 3600) || $test } {
addLogEntry $msg mail
set ::blockage_reported_at $now
}
} elseif { $shortest >= (2 * $timeout) || $test == 2 } {
foreach api $apis {
set n [ llength $score($api) ]
if { $N > 1 && $n >= ($N / 2) || $test } {
set subject "${sys}: $n job(s) stalled in ${api}API"
set body [ mgr::activeJobStatusSummary text ]
set msg "Subject: ${subject}; Body: $body"
if { $last < ($now - 3600) || $test } {
addLogEntry $msg mail
set ::blockage_reported_at $now
}
break
}
}
}
;## end of shortest/timeout comparison
} else {
mgr::reportOversizeQueue
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
if ::MANAGER_ABORT_AFTER_N_SECONDS_IN_ONE_API is defined then the default timeout period for all API's is that value.Comments:
if it is NOT defined, the default timeout period for all API's defaults to 15000 seconds (4 hours).
additionally, if a variable is defined according to the pattern:
::MANAGER_ABORT_AFTER_N_SECONDS_IN_${API}_API NOTE THAT THE API NAME MUST BE ALL CAPS!!
then the timeout period for the API named in the next to the last underscore seperated field in that variable name will be equal to the value of that variable, in seconds.
proc mgr::abortBlockedJob { jobid api runtime } {
if { [ catch {
set Api [ string toupper $api ]
if { [ info exists ::MANAGER_ABORT_AFTER_N_SECONDS_IN_${Api}_API ] } {
set control ::MANAGER_ABORT_AFTER_N_SECONDS_IN_${Api}_API
set timeout [ set ::MANAGER_ABORT_AFTER_N_SECONDS_IN_${Api}_API ]
} elseif { [ info exists ::MANAGER_ABORT_AFTER_N_SECONDS_IN_ONE_API ] } {
set control ::MANAGER_ABORT_AFTER_N_SECONDS_IN_ONE_API
set timeout $::MANAGER_ABORT_AFTER_N_SECONDS_IN_ONE_API
} else {
set timeout 15000
}
;## if the user has provided the -usertimeout option for this
;## job, make use of it.
if { [ info exists ::${jobid}::usertimeout ] } {
set timeout [ set ::${jobid}::usertimeout ]
}
if { [ catch {
set timeout [ expr int($timeout) ]
} err ] } {
set err "non-numerical value given for ${control}: '$timeout'\n"
append err "this value can be defined in the LDASapi.rsc OR\n"
append err "the LDASmanager.rsc, with the manager's resource\n"
append err "file overriding the system resource file if there\n"
append err "is a conflict."
return -code error $err
}
if { [ info exists ::MANAGER_SHOULD_ABORT_STUCK_JOBS ] } {
set test $::MANAGER_SHOULD_ABORT_STUCK_JOBS
if { [ regexp -nocase {(T|TRUE)} $test ] } {
set ::MANAGER_SHOULD_ABORT_STUCK_JOBS 1
} elseif { [ regexp -nocase {(F|FALSE)} $test ] } {
set ::MANAGER_SHOULD_ABORT_STUCK_JOBS 0
} elseif { ! [ string equal 1 $test ] } {
set ::MANAGER_SHOULD_ABORT_STUCK_JOBS 0
}
} else {
set ::MANAGER_SHOULD_ABORT_STUCK_JOBS 0
}
if { [ info exists ::WARN_ABOUT_POSSIBLY_STUCK_JOBS ] } {
set test $::WARN_ABOUT_POSSIBLY_STUCK_JOBS
if { [ regexp -nocase {(T|TRUE)} $test ] } {
set ::WARN_ABOUT_POSSIBLY_STUCK_JOBS 1
} elseif { [ regexp -nocase {(F|FALSE)} $test ] } {
set ::WARN_ABOUT_POSSIBLY_STUCK_JOBS 0
} elseif { ! [ string equal 1 $test ] } {
set ::WARN_ABOUT_POSSIBLY_STUCK_JOBS 0
}
} else {
set ::WARN_ABOUT_POSSIBLY_STUCK_JOBS 0
}
if { ! [ info exists ::stuckjobwarnings ] } {
set ::stuckjobwarnings [ list ]
}
if { [ info exist ::${jobid}::userSubject ] } {
set userSubject [ set ::${jobid}::userSubject ]
} else {
set userSubject ""
}
if { [ string equal 1 $::MANAGER_SHOULD_ABORT_STUCK_JOBS ] &&
( $timeout > 0 ) } {
if { $runtime > $timeout } {
set subject "$jobid $userSubject runtime error!"
set body "$jobid error!\naborted after $runtime "
append body "seconds in ${api}API.\n"
append body "You have encountered an LDAS system bug!\n"
append body "This condition will be automatically\n"
append body "reported to the developer who is\n"
append body "responsible for maintaining the $api API.\n"
append body "User command $jobid aborting!"
} elseif { $::WARN_ABOUT_POSSIBLY_STUCK_JOBS } {
if { [ info exists ::stuckjobwarnings ] && \
[ lsearch $::stuckjobwarnings $jobid ] == -1 && \
$runtime > $timeout / 2 } {
set now \
[ clock format [ clock seconds ] -format "%D-%T %Z" ]
set subj "$jobid $userSubject running in ${api}API for "
append subj "$runtime secs."
set body "At $now $jobid had been running in the\n"
append body "${api}API for $runtime seconds.\n"
append body "If this job is still in the $api API "
append body "after $timeout seconds,\nit will be consid"
append body "ered moribund and will be aborted by the"
append body " system.\nThis will be the only warning "
append body "sent before the job is aborted."
addLogEntry "Subject: ${subj}; Body: $body" email
lappend ::stuckjobwarnings $jobid
set N $::NUMBER_OF_ASSISTANT_MANAGERS
set ::stuckjobwarnings \
[ lrange $::stuckjobwarnings end-$N end ]
}
}
}
;## The full, correct abort sequence
if { [ info exists subject ] } {
if { [ llength [ namespace children :: ::$jobid ] ] } {
set ::${jobid}::errorapi $api
::${jobid}::predelete $subject $body email
if { [ llength [ namespace children :: ::$jobid ] ] } {
::${jobid}::seq
}
}
}
} err ] } {
if { [ string length $err ] } {
set subject "$::LDAS_SYSTEM managerAPI error!"
set body "[ myName ]: $err"
addLogEntry "Subject: ${subject}; Body: $body" email
}
}
}
proc mgr::reportOversizeQueue { { test 0 } } {
if { [ catch {
set now [ clock seconds ]
set qf $::MANAGER_QUEUE_SIZE_FACTOR
;## force strict boolean evaluation
if { ! [ string equal 1 $test ] } {
set test 0
}
if { [ info exists ::oversized_queue_reported_at ] } {
set last $::oversized_queue_reported_at
} else {
set last 0
}
if { $last < ($now - 3600) || $test } {
set size [ llength $::assistant_queue ]
set max [ expr { $::NUMBER_OF_ASSISTANT_MANAGERS * $qf } ]
if { $size >= $max || $test } {
set sys $::LDAS_SYSTEM
set subject "${sys}: $size jobs are currently queued"
set body [ mgr::activeJobStatusSummary text ]
set msg "Subject: ${subject}; Body: $body"
set ::oversized_queue_reported_at $now
addLogEntry $msg mail
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::recryptUsersWithNewKey { old new } {
if { [ catch {
set data [ dumpFile users.queue.crypt ]
set data [ decrypt $data $old ]
set data [ encrypt $data $new ]
set fid [ open users.queue.crypt.NEW w ]
puts $fid $data
::close $fid
} err ] } {
catch { ::close $fid }
return -code error "[ myName ]: $err"
}
}
proc mgr::findExpiredUsers { } {
if { [ catch {
set errs [ list ]
set system $::LDAS_SYSTEM
set now [ clock seconds ]
set oneday [ expr { 24 * 3600 } ]
set tomorrow [ expr { $now + $oneday } ]
set midnight [ clock format $tomorrow -format %D ]
set midnight [ clock scan $midnight ]
set nextweek [ expr { $now + ( 7 * $oneday ) } ]
set nextweek [ clock format $nextweek -format %D ]
set nextweek [ clock scan $nextweek ]
if { ! [ info exists ::LDAS_ACCOUNT_APPLICATION_URL ] || \
! [ string length $::LDAS_ACCOUNT_APPLICATION_URL ] } {
set ::LDAS_ACCOUNT_APPLICATION_URL \
https://www.ldas-sw.ligo.caltech.edu/passwordRequest/ldaspassword.cgi
}
foreach user $::QUEUE(USERS) {
set msg [ list ]
set subject [ list ]
set body [ list ]
set expires [ lindex $user 11 ]
set email [ lindex $user 5 ]
set name [ lindex $user 7 ]
set user [ lindex $user 1 ]
if { [ string equal NULL $user ] } { continue }
if { [ catch {
if { ! [ regexp {^\d+$} $expires ] } {
set expires [ clock scan $expires ]
}
} err ] } {
lappend errs "user: '$user' has bogus expiration: '$expires'"
continue
}
if { [ catch {
if { $expires <= $midnight } {
foreach [ list subject body msg ] \
[ mgr::expiresTomorrow $user $name $email $expires $now ] \
{ break }
} elseif { $expires <= $nextweek } {
foreach [ list subject body ] \
[ mgr::expiresNextWeek $user $expires ] { break }
}
} err ] } {
lappend errs "user: '$user' error: $err"
}
if { [ string length $msg ] } {
addLogEntry "Subject: ${subject}; Body: $msg" email
}
if { [ string length $body ] } {
if { [ catch {
mailTo $email $subject $body
} err ] } {
lappend errs "user: '$user' error: $err"
}
}
}
if { [ string length [ join $errs ] ] } {
return -code error $errs
}
} err ] } {
addLogEntry $err email
}
}
proc mgr::expiresNextWeek { user expires } {
if { [ catch {
set system $::LDAS_SYSTEM
set subject "LDAS $system user account '$user' expires soon!"
set body "Your LDAS $system user account: '$user'\n"
append body "will expire at [ clock format $expires ].\n\n"
append body "You need to apply for an extension using the\n"
append body "web form at $::LDAS_ACCOUNT_APPLICATION_URL"
} err ] } {
return -code error "[ myName ]: $err"
}
return [ list $subject $body ]
}
proc mgr::expiresTomorrow { user name email expires now } {
if { [ catch {
set system $::LDAS_SYSTEM
set subject "LDAS $system user account '$user' about to expire!"
set body "Your LDAS $system user account: '$user'\n"
append body "expires at [ clock format $expires ].\n\n"
append body "You need to apply for an extension using the\n"
append body "web form at $::LDAS_ACCOUNT_APPLICATION_URL"
set delay [ expr { $expires - $now } ]
if { $delay > 60 } {
set delay [ expr { $delay * 1000 } ]
} else {
set delay 0
}
if { $delay } {
set ::user_scheduled_to_expire($user) \
[ after $delay \
[ list mgr::updateUserInfo $user 1 1 1 1 1 1 ] ]
} else {
mgr::updateUserInfo $user 1 1 1 1 1 1
}
set msg "$system user account: '$user' expires "
append msg "at [ clock format $expires ].\n"
append msg "account belongs to '$name'.\n"
append msg "Last known email address was: '$email'\n\n"
append msg "user has been notified via email of\n"
append msg "account status."
} err ] } {
return -code error "[ myName ]: $err"
}
return [ list $subject $body $msg ]
}
proc mgr::throttlePipelines { cmd } {
set F 0
set M 0
set D 0
set W 0
set L 0
if { [ info exists ::STOP_PROCESSING_JOBS_NO_DISK_SPACE ] } {
if { $::STOP_PROCESSING_JOBS_NO_DISK_SPACE == 1 } {
set msg "disk is full, job processing has been suspended."
addLogEntry $msg red
return 1
}
}
foreach [ list PASS Throttles ] \
[ mgr::throttlePipelinesAtApi ] { break }
;## short circuit if no throttles are defined
if { $PASS == [ llength $::API_LIST ] } { return 0 }
array set throttle $Throttles
if { [ catch {
set assistants [ namespace children :: ${::RUNCODE}* ]
;## iterate over RUNNING user commands
foreach assistant $assistants {
set command [ info vars ${assistant}::usrcmdname ]
if { ! [ string length $command ] } { continue }
set command [ set $command ]
set api [ set ${assistant}::api ]
if { [ regexp -nocase {(diskcache|frame)} $api ] } {
incr F
} elseif { [ regexp -nocase {(eventmon|metadata)} $api ] } {
incr M
} elseif { [ regexp -nocase {(datacond)} $api ] } {
incr D
} elseif { [ regexp -nocase {(mpi)} $api ] } {
incr W
} elseif { [ regexp -nocase {(ligolw)} $api ] } {
incr L
}
}
if { [ info exists ::DEBUG_PIPELINE_THROTTLE ] && \
[ string equal 1 $::DEBUG_PIPELINE_THROTTLE ] } {
if { [ info exist throttle(frame) ] && $F >= $throttle(frame) } {
set msg "$F user commands currently in diskcache/frame."
addLogEntry $msg purple
}
if { [ info exist throttle(metadata) ] && $M >= $throttle(metadata) } {
set msg "$M user commands currently in metadata/eventmon."
addLogEntry $msg purple
}
if { [ info exist throttle(datacond) ] && $D >= $throttle(datacond) } {
set msg "$D user commands currently in datacond."
addLogEntry $msg purple
}
if { [ info exist throttle(mpi) ] && $W >= $throttle(mpi) } {
set msg "$W user commands currently in mpi/wrapper."
addLogEntry $msg purple
}
if { [ info exist throttle(ligolw) ] && $L >= $throttle(ligolw) } {
set msg "$L user commands currently in ligolw."
addLogEntry $msg purple
}
}
if { ([ info exist throttle(frame) ] && $F >= $throttle(frame) ) || \
([ info exist throttle(metadata) ] && $M >= $throttle(metadata) ) || \
([ info exist throttle(mpi) ] && $W >= $throttle(mpi) ) || \
([ info exist throttle(ligolw) ] && $L >= $throttle(ligolw) ) } {
set flag 1
} elseif { ([ info exist throttle(datacond) ]) && $D >= $throttle(datacond) } {
set rx {(conditionData|dataStandAlone|dataPipeline)}
if { [ regexp -nocase $rx $cmd ] } {
set flag 1
} else {
set flag 0
}
} else {
set flag 0
}
;## and be a little looser...
;## if frame is busy, and metadata is not
if { ([ info exist throttle(metadata) ] && $M < $throttle(metadata)) && \
([ info exist throttle(frame) ] && $F >= $throttle(frame)) } {
;## when it's a database action
if { [ regexp {MetaData} $cmd ] } {
set flag 0
}
;## when it's definitely gone on ahead of the frame
if { [ regexp {(mpi|datacond|eventmon|metadata|ligolw)} $api ] } {
set flag 0
}
;## if it's the metadata that's jammed, maybe a frame job
;## can get processed
} elseif { ([ info exist throttle(frame) ] && $F < $throttle(frame)) && \
([ info exist throttle(metadata) ] && $M >= $throttle(metadata)) } {
if { [ regexp {(diskcache|frame)} $api ] } {
set flag 0
}
}
if { [ regexp {(getFrameCache|cacheGetFile)} $cmd ] } {
set flag 0
}
} err ] } {
if { [ string length $err ] } {
addLogEntry $err red
}
set flag 0
}
return $flag
}
If a variable is defined in the manager API resource file which matches the pattern:Comments:
::THROTTLE_PIPELINES_AT_XXX_API Where 'XXX' is replaced by the name of an LDAS API with the name rendered entirely in capital letters, i.e. 'FRAME', then the the value of the variable will be used to limit the number of simultaneously running jobs in that API.
proc mgr::throttlePipelinesAtApi { args } {
if { [ catch {
set apis $::API_LIST
foreach api $apis {
set Api [ string toupper $api ]
set PASS 0
if { [ string equal DATACOND $Api ] && \
[ info exists ::DATACOND_MEMORY_LOCK ] && \
$::DATACOND_MEMORY_LOCK == 1 } {
set throttle($api) 0
continue
}
if { [ info exists ::THROTTLE_PIPELINES_AT_${Api}_API ] } {
set throttle($api) \
[ set ::THROTTLE_PIPELINES_AT_${Api}_API ]
if { ! [ regexp {^\d+$} $throttle($api) ] || \
$throttle($api) >= $::NUMBER_OF_ASSISTANT_MANAGERS } {
set throttle($api) $::NUMBER_OF_ASSISTANT_MANAGERS
incr PASS
}
} else {
set throttle($api) $::NUMBER_OF_ASSISTANT_MANAGERS
incr PASS
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
return [ list $PASS [ array get throttle ] ]
}
proc mgr::promoteJobs { pipelinethrottle } {
if { [ catch {
if { $pipelinethrottle } {
if { [ llength $::assistant_queue ] > 1 } {
set promote [ lindex $::assistant_queue 1 ]
set ::assistant_queue \
[ lreplace $::assistant_queue 1 1 ]
set ::assistant_queue \
[ linsert $::assistant_queue 0 $promote ]
}
}
} err ] } {
addLogEntry $err red
}
}
proc mgr::memUsage { args } {
}
Parses the output from:Comments:
dmesg |grep -i nfs - on Solaris grep -i nfs /var/log/messages - on Linux To detect NFS errors which may be adversely affecting the LDAS system.
proc mgr::scanNFSErrors { args } {
if { [ catch {
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::logReply { args } {
if { [ catch {
set var [ lindex $args 0 ]
regexp {[^:_]+} $var api
regexp {sock\d+} $var cid
catch { ::close $cid }
set retval [ set $var ]
catch { ::unset $var }
if { [ string length $retval ] } {
addLogEntry "${api}API: '$retval'" blue
}
} err ] } {
catch { ::close $cid }
catch { ::unset $var }
addLogEntry $err email
}
}
arguments are the normal userinfo items:Comments:
-name bob_dobbs -password $combined_hash ...
proc mgr::validateUserPassword { args } {
if { [ llength $args ] == 1 } {
set args [ lindex $args 0 ]
}
if { [ catch {
set bool 0
;## this will set salt and digest via uplevel
regsub {md5Protocol} $args md5protocol args
;## this returns -password **** -name **** -md5salt ****
;## only name is used here
set cmd [ cmd::md5Unpack $args ]
array set input $cmd
foreach user $::QUEUE(USERS) {
if { ! [ llength $user ] } { continue }
if { [ llength $user ] != 13 } {
set msg "QUEUE(USERS) has been corrupted!"
addLogEntry $msg 2
return -code error "[ myName ]: $msg"
}
array set q [ lrange $user 0 end-1 ]
if { [ string equal $q(-name) $input(-name) ] } {
set test [ key::md5 $q(-password)$salt ]
if { [ string equal $digest $test ] } {
set bool 1
}
if { ! [ regexp {^\d+$} $q(-expires) ] } {
set q(expires) [ clock scan $q(-expires) ]
}
if { [ clock seconds ] > $q(-expires) } {
set edate [ clock format $q(-expires) -format %D ]
set msg "MOU for user $q(-name) expired on ${edate}. "
append msg "Please contact LDAS admin to apply for "
append msg "an extension."
return -code error "[ myName ]: $msg"
}
}
} ;## end of foreach
} err ] } {
return -code error "[ myName ]: $err"
}
return $bool
}
proc mgr::getDiskUsageInfo { args } {
if { [ catch {
set data [ exec df -m [ pwd ] ]
set data [ lrange [ join [ split $data ] ] end-5 end ]
foreach [ list dev size used avail percent mount ] $data { break }
regexp {\d+} $percent percent
set data [ list $dev $size $used $avail $percent $mount ]
} err ] } {
return -code error "[ myName ]: $err"
}
return $data
}
proc mgr::throttleOnDiskUsage { args } {
if { [ catch {
set report 0
set stopjobs 0
set api $::API
if { ! [ info exists ::REPORT_DISK_GETTING_FULL_USAGE_PERCENT ] } {
set ::REPORT_DISK_GETTING_FULL_USAGE_PERCENT 90
}
if { ! [ info exists ::STOP_ACCEPTING_JOBS_DISK_USAGE_PERCENT ] } {
set ::STOP_ACCEPTING_JOBS_DISK_USAGE_PERCENT 97
}
set reportpercent $::REPORT_DISK_GETTING_FULL_USAGE_PERCENT
set nojobspercent $::STOP_ACCEPTING_JOBS_DISK_USAGE_PERCENT
set now [ gpsTime ]
set ::diskusage($now) [ mgr::getDiskUsageInfo ]
set times [ lsort -decreasing -integer [ array names ::diskusage ] ]
foreach time [ lrange $times 72 end ] {
unset ::diskusage($time)
}
set times [ lrange $times 0 71 ]
set fid [ open ${::PUBDIR}/disk.usage w 0644 ]
foreach time $times {
puts $fid "$time $::diskusage($time)"
}
::close $fid
set points [ llength $times ]
foreach [ list dev size used avail percent mount ] \
$::diskusage($now) { break }
if { $percent >= $nojobspercent } {
set report 1
set stopjobs 1
} elseif { $percent >= $reportpercent } {
set report 1
} elseif { $percent < $reportpercent } {
;## if we were previously in an out-of-space condition
if { [ info exists ::STOP_PROCESSING_JOBS_NO_DISK_SPACE ] && \
$::STOP_PROCESSING_JOBS_NO_DISK_SPACE == 1 } {
set report 1
set points -1
set stopjobs 0
}
}
if { $report } {
set subject "$::LDAS_SYSTEM disk usage at ${percent}%!"
if { $points > 1 } {
set j [ expr { $points - 1 } ]
set old [ set ::diskusage([ lindex $times $j ]) ]
set old [ lindex $old 2 ]
set dMb [ expr { ($used - $old) / double($j) } ]
set dMb [ format %.1f $dMb ]
if { $dMb > 0 } {
set dead [ expr { int($avail / $dMb) } ]
set mod [ expr { $dead % 24 } ]
set dead [ expr { $dead / 24 } ]
set report "Disk usage by LDAS has reached ${percent}%.\n"
append report "Disk usage has been increasing at an\n"
append report "average rate of $dMb Mb per hour for "
append report "$points hours.\nAt this rate $dev will be"
append report " full in\n$dead days $mod hours!\n"
append report "When disk usage reaches "
append report "${nojobspercent}%, the manager\n"
append report "will stop accepting jobs until disk space"
append report "\nis cleared!"
} elseif { $dMb == 0 } {
set report "Disk usage by LDAS has reached ${percent}%.\n"
append report "When disk usage "
append report "reaches ${nojobspercent}%, the manager\n"
append report "will stop accepting jobs until disk space"
append report "\nis cleared!"
} elseif { $dMb < 0 } {
set report "Disk usage by LDAS has reached ${percent}%.\n"
append report "Disk space is being freed at an "
append report "average\nrate of $dMb Mb per hour for "
append report "$points hours."
}
} elseif { $points == 0 } {
# just started, no history data available!!
set report "you should shutdown your LDAS system and clear\n"
append report "some disk space before trying to run LDAS!\n"
append report "perhaps you should archive some of the job\n"
append report "directories under ${::PUBDIR}. the manager\n"
append report "will stop accepting jobs when disk usage "
append report "reaches ${nojobspercent}%, and will not\n"
append report "resume processing until disk space is freed."
} else {
set report "Disk usage has fallen to ${percent}%.\n"
append report "Job processing will resume NOW."
}
if { $stopjobs } {
set report "manager API cannot process any more jobs\n"
append report "until some disk space is freed on ${dev}\n"
append report "AND *either* the disk usage falls below\n"
append report "the value defined by the flag\n"
append report "::REPORT_DISK_GETTING_FULL_USAGE_PERCENT\n"
append report "(currently set to ${reportpercent}%) *or*\n"
append report "the status flag "
append report "::STOP_PROCESSING_JOBS_NO_DISK_SPACE\n"
append report "is set to '0' in the manager API by an\n"
append report "admin user via the control and monitor API."
set ::STOP_PROCESSING_JOBS_NO_DISK_SPACE 1
after 60000 mgr::activelyMonitorDiskUsage
} elseif { [ string equal manager $api ] && ! $stopjobs } {
if { [ info exists ::STOP_PROCESSING_JOBS_NO_DISK_SPACE ] } {
set ::STOP_PROCESSING_JOBS_NO_DISK_SPACE 0
}
}
addLogEntry "Subject: ${subject}; Body: $report" email
}
} err ] } {
addLogEntry $err red
}
}
proc mgr::activelyMonitorDiskUsage { args } {
if { [ catch {
if { $::STOP_PROCESSING_JOBS_NO_DISK_SPACE == 1 } {
foreach [ list dev size used avail percent mount ] \
[ mgr::getDiskUsageInfo ] { break }
if { $percent <= $::REPORT_DISK_GETTING_FULL_USAGE_PERCENT } {
if { [ array exists ::diskusage ] } {
::unset ::diskusage
}
set ::STOP_PROCESSING_JOBS_NO_DISK_SPACE 0
} else {
after 15000 mgr::activelyMonitorDiskUsage
}
}
} err ] } {
return -code error "[ myName ]: $err"
}
}
proc mgr::liveJobListPush { { apis frame } args } {
if { [ info exists ::jobid ] && \
[ string equal SHUTDOWN $::jobid ] } {
return
}
set data [ namespace children :: ::${::RUNCODE}* ]
set data [ lsort -dictionary $data ]
if { [ info exists ::LIVE_JOB_PUSH_API_LIST ] } {
set apis $::LIVE_JOB_PUSH_API_LIST
}
foreach api $apis {
if { ! [ regexp $api $::API_LIST ] || ! [ info exist ::[ string toupper $api]_API_HOST ] } {
continue
}
if { [ catch {
set sid [ sock::open $api emergency ]
fconfigure $sid -blocking off
puts $sid \
"$::MGRKEY set ::SORTED_LIVE_JOB_LIST_REPORT [ list $data ]"
::close $sid
} err ] } {
catch { ::close $sid }
addLogEntry "api: $api error: $err" red
}
}
return $data
}
proc mgr::reportWrongUser { args } {
if { [ info exists ::AUTHORIZED_LDAS_USER ] && \
[ string length $::AUTHORIZED_LDAS_USER ] } {
set authorized $::AUTHORIZED_LDAS_USER
set user $::env(USER)
if { ! [ string equal $authorized $user ] } {
set subject "Attempt to start $::LDAS_SYSTEM by "
append subject "unauthorised user!"
set report "User '$user' is attempting to start LDAS,\n"
append report "but the LDASmanager.rsc file declares\n"
append report "the variable ::AUTHORIZED_LDAS_USER to be\n"
append report "'${::AUTHORIZED_LDAS_USER}.' Please log in\n"
append report "as the correct user or change the value\n"
append report "of ::AUTHORIZED_LDAS_USER to be the name\n"
append report "of the user that is permitted to run LDAS."
catch {
addLogEntry "Subject: ${subject}; Body: $report" email
}
puts stderr $report
exit
}
}
}
proc mgr::reportWrongHost { args } {
set ip $::MY_IP
set user $::env(USER)
set resource $::MANAGER_API_HOST
set running $resource
set interfaces [ list 127.0.0.1 $running ]
catch { set interfaces [ ifConfig ] }
;## try to figure out where we are really running
set mars_rx {^(10\.|192\.168\.|127\.0\.0\.)}
foreach interface $interfaces {
foreach [ list if ip running ] $interface { break }
if { [ regexp -nocase -- $resource $running ] } {
break
}
}
puts stderr "manager host per ::MANAGER_API_HOST: '$resource'"
puts stderr "manager host per system info: '$running'"
if { ! [ regexp -nocase -- $resource $running ] } {
set subject "$::LDAS_SYSTEM $::API API running on wrong host!"
set report "The Manager API on $::LDAS_SYSTEM seems to\n"
append report "be running on '$interfaces', but the\n"
append report "LDASapi.rsc file says it should be running\n"
append report "on '$resource'. If these are not actually\n"
append report "the same host machine, there may be a\n"
append report "problem! This manager API was started by\n"
append report "user: '$user'\n"
if { [ info exists ::ABORT_MANAGER_IF_WRONG_HOST ] && \
[ string equal 1 $::ABORT_MANAGER_IF_WRONG_HOST ] } {
append report "\nYou can prevent the manager from aborting\n"
append report "when this condition is detected by setting\n"
append report "::ABORT_MANAGER_IF_WRONG_HOST in the\n"
append report "LDASmanager.rsc file to '0'."
}
catch {
addLogEntry "Subject: ${subject}; Body: $report" email
}
puts stderr $report
if { [ info exists ::ABORT_MANAGER_IF_WRONG_HOST ] && \
[ string equal 1 $::ABORT_MANAGER_IF_WRONG_HOST ] } {
::exit
}
}
}
proc mgr::chopQueueTo { size } {
if { ! [ info exists ::assistant_queue ] } { return }
if { [ llength $::assistant_queue ] <= $size } { return }
if { [ catch {
set caller [ uplevel myName ]
set chopped [ lindex $::assistant_queue end ]
set ::assistant_queue [ lrange $::assistant_queue 0 end-1 ]
foreach [ list name cmdname cmd ] $chopped { break }
set jobid $name
if { [ info exist ::${jobid}::userSubject ] } {
set userSubject [ set ::${jobid}::userSubject ]
} else {
set userSubject ""
}
set subject "$::LDAS_SYSTEM error! Job $name $userSubject aborted"
set body "$::LDAS_SYSTEM job $name aborted "
append body "by LDAS adiministrator\nas part of "
append body "an unplanned system maintenance emergency.\n\n"
append body "You will need to resubmit this job:\n\n"
append body "$cmd"
if { [ catch {
set user [ ::mgr::job2name $jobid ]
set email [ ::mgr::job2email $jobid ]
mailTo $email $subject $body
set drop [ jobDirectory ]/email.$name
set fid [ open $drop w ]
puts $fid "<html>\n<pre>\n$user ${email}\n${subject}\n$body"
::close $fid
} err ] } {
catch { ::close $fid }
addLogEntry $err red
}
mgr::deleteJobidFromUsersQueue $name
addLogEntry "job removed from queue by LDAS administrator" red
} err ] } {
return -code error "[ myName ]: $err"
}
if { [ string equal mgr::sHuTdOwN $caller ] } {
mgr::chopQueueTo $size
} else {
after 250 [ list mgr::chopQueueTo $size ]
}
}
proc mgr::rmJobFiles { thisjob jobids { extensions all } } {
if { [ catch {
set deleted [ list ]
set notowner [ list ]
set user [ set ::${thisjob}(-userid) ]
;## rationalise job i.d.'s
set preprepro $jobids
regsub -all $::RUNCODE $jobids {} jobids
if { [ regexp -nocase {[a-z]} $jobids ] } {
error "invalid runcode string in jobid list: '$preprepro'"
}
;## sanity check. make sure user intends to hose system
;## if he's going to be allowed to do it!!
;## forces user to break long jobid range into mutliple
;## requests of 1000 each, i.e. 1000-2000 2001-3000...
if { [ regexp {^(\d+)-(\d+)$} $jobids -> begin end ] } {
if { $end - $begin > 1000 } {
set msg "Request exceeds limit of 1000 jobid's:"
append msg " '$preprepro'"
return -code error $msg
}
}
set retval "$::PUBDIR "
set joblist [ numRange $jobids ]
foreach jobid $joblist {
regexp {\d+} $jobid job
set jobid $::RUNCODE$job
set myriad [ expr { $job / 10000 } ]
set myrdir ${::RUNCODE}_$myriad
set jobdir ${::RUNCODE}$job
set subdir [ file join $myrdir $jobdir ]
set jobdir [ file join $::PUBDIR $subdir ]
if { [ string equal -nocase all $extensions ] } {
set extensions [ list gwf ilwd xml ]
}
if { ! [ file isdirectory $jobdir ] } {
set err "invalid job directory: '$jobdir'"
lappend notowner $err
continue
}
set usermsg $jobdir/email.$jobid
if { ! [ file readable $usermsg ] } {
set err "could not read file: '$usermsg'"
lappend notowner $err
continue
}
set usermsg [ split [ dumpFile $usermsg ] "\n" ]
set USER [ lindex [ lindex $usermsg 2 ] 0 ]
if { ! [ string equal $user $USER ] } {
set err "jobid $jobid does not belong to user: '$user'"
append err " (belongs to user '$USER')"
lappend notowner $err
addLogEntry $err red
continue
}
foreach ext $extensions {
;## strip leading dots
regsub {^\.} $ext {} ext
if { ! [ regexp {^(gwf|ilwd|xml)$} $ext ] } {
set msg "Removal of .$ext files is not supported."
addLogEntry $msg red
continue
}
foreach file [ glob -nocomplain $jobdir/*.$ext ] {
file delete $file
lappend deleted [ file tail $file ]
}
}
if { [ string length $retval ] < $::MAX_REPLY_BYTES } {
if { [ llength $deleted ] } {
append retval "$subdir files removed: [ join $deleted ]\n"
set deleted [ list ]
} else {
append retval "$subdir: no files deleted!\n"
}
}
} ;## end of foreach on jobid's
if { [ llength $notowner ] } {
append retval "\n$notowner"
}
} err ] } {
return -code error "[ myName ]: $err"
}
;## suppress very long responses
set len [ string length $retval ]
if { $len > $::MAX_REPLY_BYTES } {
set retval "rmJobFiles completed for $joblist '$retval ...', ($len bytes) exceeded $::MAX_REPLY_BYTES limit."
}
addLogEntry "response has $len bytes" purple
return [ string trim $retval \n ]
}
proc mgr::blockUser { user { option block } } {
if { [ string equal sTaRtUp $user ] } {
if { [ file exists blocked.users ] } {
set ::mgr::blocked_users [ dumpFile blocked.users ]
} else {
set ::mgr::blocked_users [ list ]
}
return {}
}
if { [ catch {
set update 0
if { [ info exists ::mgr::blocked_users ] } {
} elseif { [ file exists blocked.users ] } {
set ::mgr::blocked_users [ dumpFile blocked.users ]
}
set i [ lsearch $::mgr::blocked_users $user ]
switch -exact $option {
block {
if { $i == -1 } {
set update 1
lappend ::mgr::blocked_users $user
set msg "User: '$user' blocked temporarily."
} else {
set msg "User: '$user' already blocked."
}
}
unblock {
if { $i == -1 } {
set msg "User: '$user' was not blocked."
} else {
set update 1
set ::mgr::blocked_users [ lreplace $::mgr::blocked_users $i $i ]
set msg "User: '$user' existing block removed."
}
}
default {
set err "bad option: '$option'"
return -code error $err
}
}
;## only write file if it will be updated
if { $update == 1 } {
set fid [ open blocked.users w 0644 ]
puts $fid $::mgr::blocked_users
::close $fid
}
} err ] } {
catch { ::close $fid }
return -code error "[ myName ]: $err"
}
return $msg
}
proc mgr::blockDso { dso { option block } } {
if { [ string equal sTaRtUp $dso ] } {