Skip to content

Commit 71af524

Browse files
committed
fixed further tests sporadically failing due to not enough host slots with 9.1.0
1 parent 29e6a49 commit 71af524

File tree

5 files changed

+75
-47
lines changed

5 files changed

+75
-47
lines changed

src/checktree/bugs/issuezilla/2378/check.exp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,17 @@
2727
#
2828
# All Rights Reserved.
2929
#
30-
# Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH
30+
# Portions of this software are Copyright (c) 2023-2025 HPC-Gridware GmbH
3131
#
3232
##########################################################################
3333
#___INFO__MARK_END__
3434

3535
# define global variable in this namespace
36-
global check_name
37-
global check_category
38-
global check_description
36+
global check_name
37+
global check_category
38+
global check_description
3939
global check_needs
40-
global check_functions
40+
global check_functions
4141
global check_root_access_needs
4242

4343
# define test's name and run level descriptions
@@ -46,7 +46,7 @@ set check_category "ISSUE PARALLEL SCHEDULER VERIFIED"
4646
set check_description(0) "check issue 2378 is fixed"
4747

4848
# define test's dependencies
49-
set check_needs "init_core_system"
49+
set check_needs "init_core_system"
5050

5151
# setup and cleanup functions
5252
set check_setup_function issue_2378_setup
@@ -62,13 +62,14 @@ proc issue_2378_setup {} {
6262
global ts_config
6363
global global_host_backup
6464

65-
ts_log_fine "issue_2378_setup"
65+
ts_log_fine "issue_2378_setup"
6666

6767
if {[llength $ts_config(execd_nodes)] < 2} {
6868
ts_log_config "Need at least 2 hosts for this test. We only have $ts_config(execd_hosts)"
6969
return 99
7070
}
7171

72+
setup_host_slots_for_binding
7273

7374
set pe(slots) "2"
7475
set pe(allocation_rule) "\$round_robin"
@@ -128,6 +129,8 @@ proc issue_2378_cleanup { } {
128129

129130
del_pe "round_robin"
130131
del_pe "fill_up"
132+
133+
cleanup_host_slots_for_binding
131134
}
132135

133136

src/checktree/functional/ssos/ssos_general/check.exp

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,11 @@
3535
#___INFO__MARK_END__
3636

3737
# define global variable in this namespace
38-
global check_name
39-
global check_category
40-
global check_description
38+
global check_name
39+
global check_category
40+
global check_description
4141
global check_needs
42-
global check_functions
42+
global check_functions
4343
global check_root_access_needs
4444
global check_need_running_system
4545

@@ -53,16 +53,26 @@ set check_description(0) "General Slotwise Suspend on Subordinate test"
5353
set check_description(100) "General Slotwise Suspend on Subordinate test on all architectures"
5454

5555
# define test's dependencies
56-
set check_needs "init_core_system"
56+
set check_needs "init_core_system"
5757

5858
# setup and cleanup functions
59+
set check_setup_function "general_ssos_setup"
60+
set check_cleanup_function "general_ssos_cleanup"
5961
set check_setup_level_function "general_ssos_level_setup"
6062
set check_cleanup_level_function "general_ssos_level_cleanup"
6163

6264
# define test's procedure order
6365
set check_functions {}
6466
lappend check_functions "general_ssos_test"
6567

68+
proc general_ssos_setup {} {
69+
setup_host_slots_for_binding
70+
}
71+
72+
proc general_ssos_cleanup {} {
73+
cleanup_host_slots_for_binding
74+
}
75+
6676
proc general_ssos_level_setup {} {
6777
global CHECK_USER CHECK_HAVE_TDOM
6878
global CHECK_ACT_LEVEL
@@ -88,7 +98,7 @@ proc general_ssos_level_setup {} {
8898
# G.q H.q
8999
# / \
90100
# I.q J.q
91-
#
101+
#
92102
# J.q must be created first, because it is referenced in H.q and must already
93103
# exist when H.q is created, and so on
94104
set ssos_config(queue_list) {}
@@ -156,7 +166,7 @@ proc general_ssos_level_setup {} {
156166

157167
proc general_ssos_level_cleanup {} {
158168
get_current_cluster_config_array ts_config
159-
global ssos_config
169+
global ssos_config
160170

161171
delete_all_jobs
162172
wait_for_end_of_all_jobs
@@ -166,12 +176,12 @@ proc general_ssos_level_cleanup {} {
166176
# and can therefore be deleted immediately. Then B.q and C.q can be deleted,
167177
# and so on.
168178
set ssos_config(queue_list) [lsort -increasing $ssos_config(queue_list)]
169-
179+
170180
foreach queue $ssos_config(queue_list) {
171-
del_queue $queue "" 1 1
181+
del_queue $queue "" 1 1
172182
}
173183

174-
unset ssos_config
184+
unset ssos_config
175185
}
176186

177187

@@ -236,7 +246,7 @@ proc ssos_test_suspension_in_deepest_queue {host djob_list} {
236246
# submit job to "G.q"
237247
set arguments "$my_fix_settings -q G.q@$host $ts_config(product_root)/examples/jobs/sleeper.sh 1000"
238248
set g_job [submit_job $arguments]
239-
# wait for job start on D.q, G.q and 2 x J.q
249+
# wait for job start on D.q, G.q and 2 x J.q
240250
lappend job_list $d_job
241251
lappend job_list $g_job
242252
lappend job_list $first_j_job
@@ -301,7 +311,7 @@ proc ssos_test_unsuspend_on_manual_suspend {host djob_list} {
301311
global my_fix_settings
302312
set error_text ""
303313
#
304-
# Test #2
314+
# Test #2
305315
# Check if the task in J.q that is suspended by slotwise subordination gets
306316
# unsuspended when the job in queue D.q gets manually suspended, because the
307317
# number of running tasks in the subordination tree drops to 4 or lower.
@@ -326,7 +336,7 @@ proc ssos_test_unsuspend_on_manual_suspend {host djob_list} {
326336
if {$queueState != "s"} {
327337
append error_text "queue \"$queue\" reports state \"$queueState\" which should be suspended \"s\"!\n"
328338
}
329-
339+
330340
# job on D.q should be in state "s"
331341
get_job_state $d_job 0 task_info
332342
if {$task_info(0,state) != "S"} {
@@ -403,7 +413,7 @@ proc ssos_test_sequence_number_obeyance {host} {
403413
append error_text "queue \"$queue\" reports state \"$queueState\" which is not expected!\n"
404414
}
405415
}
406-
416+
407417
# Now submit 2 jobs to C.q
408418
ts_log_fine "Submitting two jobs to C.q"
409419
set arguments "$my_fix_settings -q C.q@$host $ts_config(product_root)/examples/jobs/sleeper.sh 1000"
@@ -425,7 +435,7 @@ proc ssos_test_sequence_number_obeyance {host} {
425435
set arguments "$my_fix_settings -q A.q@$host $ts_config(product_root)/examples/jobs/sleeper.sh 1000"
426436
set a_job [submit_job $arguments]
427437
wait_for_jobstart $a_job "test" 999 1 1
428-
438+
429439
# now check that c_job_1 is in state "S", all others must be in state "r"
430440
ts_log_fine "Expecting the younger job in C.q to be suspended, even if it's older than the"
431441
ts_log_fine "jobs in B.q, because C.q has the lower sequence number."
@@ -469,7 +479,7 @@ proc ssos_test_sequence_number_obeyance {host} {
469479
ts_log_fine "Scenario 3 successfully completed"
470480
}
471481
set error_text ""
472-
482+
473483
delete_all_jobs
474484
wait_for_end_of_all_jobs
475485
}
@@ -483,13 +493,13 @@ proc issue_GE_3233 { host } {
483493
ts_log_fine ""
484494
ts_log_fine "Testing for Bug 3233 (leaving one job suspended)"
485495

486-
# submit 4 180-second jobs in C.q
496+
# submit 4 180-second jobs in C.q
487497
set arguments "$my_fix_settings -q C.q@$host $ts_config(product_root)/examples/jobs/sleeper.sh 180"
488498
set c_queue_job_1 [submit_job $arguments]
489499
set c_queue_job_2 [submit_job $arguments]
490500
set c_queue_job_3 [submit_job $arguments]
491501
set c_queue_job_4 [submit_job $arguments]
492-
502+
493503
lappend job_list1 $c_queue_job_1
494504
lappend job_list1 $c_queue_job_2
495505
lappend job_list1 $c_queue_job_3
@@ -498,27 +508,27 @@ proc issue_GE_3233 { host } {
498508
foreach job $job_list1 {
499509
wait_for_jobstart $job "test" 60 1 1
500510
}
501-
502-
# submit 6 10-second jobs in A.q
511+
512+
# submit 6 10-second jobs in A.q
503513
set arguments "$my_fix_settings -q A.q@$host $ts_config(product_root)/examples/jobs/sleeper.sh 10"
504514
set c_queue_job_1 [submit_job $arguments]
505515
set c_queue_job_2 [submit_job $arguments]
506516
set c_queue_job_3 [submit_job $arguments]
507517
set c_queue_job_4 [submit_job $arguments]
508518
set c_queue_job_5 [submit_job $arguments]
509519
set c_queue_job_6 [submit_job $arguments]
510-
520+
511521
lappend job_list2 $c_queue_job_1
512522
lappend job_list2 $c_queue_job_2
513523
lappend job_list2 $c_queue_job_3
514524
lappend job_list2 $c_queue_job_4
515525
lappend job_list2 $c_queue_job_5
516526
lappend job_list2 $c_queue_job_6
517-
527+
518528
foreach job $job_list2 {
519529
wait_for_jobstart $job "test" 60 0 1
520530
}
521-
531+
522532
# A.q jobs must suspend all jobs in C.q
523533

524534
# after all A.q jobs are finished all C.q jobs must be in running state
@@ -527,9 +537,9 @@ proc issue_GE_3233 { host } {
527537
}
528538

529539
# wait for system reacting
530-
sleep 2
540+
sleep 2
531541

532-
# bug is present when one job remains in S state
542+
# bug is present when one job remains in S state
533543
foreach job $job_list1 {
534544
get_job_state $job 0 task_info
535545
if {$task_info(0,state) != "r"} {
@@ -544,7 +554,7 @@ proc issue_GE_3233 { host } {
544554
ts_log_fine "Bug GE-3233 not found!"
545555
}
546556
set error_text ""
547-
557+
548558
delete_all_jobs
549559
wait_for_end_of_all_jobs
550560
}

src/checktree/system_tests/config/submit_hosts/check.exp

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,17 @@
2727
#
2828
# All Rights Reserved.
2929
#
30-
# Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH
30+
# Portions of this software are Copyright (c) 2023-2025 HPC-Gridware GmbH
3131
#
3232
##########################################################################
3333
#___INFO__MARK_END__
3434

3535
# define global variable in this namespace
36-
global check_name
36+
global check_name
3737
global check_category
38-
global check_description
38+
global check_description
3939
global check_needs
40-
global check_functions
40+
global check_functions
4141
global check_root_access_needs
4242

4343
# define test's name and run level descriptions
@@ -47,12 +47,14 @@ set check_description(0) "checking submit host functionality without local conf
4747
set check_description(1) "checking submit host functionality with local qsh_command"
4848

4949
# define test's dependencies
50-
set check_needs "init_core_system display_test"
50+
set check_needs "init_core_system display_test"
5151

5252

5353
# setup and cleanup functions
54-
set check_setup_level_function "submit_hosts_setup"
55-
set check_cleanup_level_function "submit_hosts_cleanup"
54+
set check_setup_function "submit_hosts_setup"
55+
set check_cleanup_function "submit_hosts_cleanup"
56+
set check_setup_level_function "submit_hosts_setup_level"
57+
set check_cleanup_level_function "submit_hosts_cleanup_level"
5658

5759
# define test's procedure order
5860
set check_functions ""
@@ -62,6 +64,15 @@ global submit_hosts_use_local_conf
6264
# -------- local test procedures -----------------------------------------------
6365

6466
proc submit_hosts_setup {} {
67+
setup_host_slots_for_binding
68+
}
69+
70+
proc submit_hosts_cleanup {} {
71+
cleanup_host_slots_for_binding
72+
}
73+
74+
75+
proc submit_hosts_setup_level {} {
6576
global ts_config CHECK_ACT_LEVEL
6677
global CHECK_JOB_OUTPUT_DIR
6778
global submit_hosts_use_local_conf
@@ -109,7 +120,7 @@ proc submit_hosts_setup {} {
109120
}
110121
}
111122

112-
proc submit_hosts_cleanup {} {
123+
proc submit_hosts_cleanup_level {} {
113124
global ts_config
114125
global CHECK_JOB_OUTPUT_DIR
115126
global submit_hosts_use_local_conf
@@ -170,7 +181,7 @@ proc submit_hosts_test {} {
170181
global ts_config
171182
global CHECK_USER
172183
global submit_hosts_use_local_conf
173-
184+
174185

175186
if {[llength $ts_config(submit_only_hosts)] <= 0 || $ts_config(submit_only_hosts) == "none" } {
176187
ts_log_config "No submit hosts specified in test configuration file - run setup option"
@@ -185,7 +196,7 @@ proc submit_hosts_test {} {
185196
}
186197

187198
# we do all following tests for each configured submit host
188-
foreach submit_host $ts_config(submit_only_hosts) {
199+
foreach submit_host $ts_config(submit_only_hosts) {
189200
ts_log_fine "testing submit host $submit_host"
190201

191202
ts_log_fine "*** step 1 on $submit_host: qsub ***"
@@ -216,7 +227,7 @@ proc submit_hosts_test {} {
216227
wait_for_jobend $job_id "leeper" 30 0
217228

218229
# give all our submit only hosts submit privileges
219-
foreach submit_host $ts_config(submit_only_hosts) {
230+
foreach submit_host $ts_config(submit_only_hosts) {
220231
set output [start_sge_bin "qconf" "-as $submit_host"]
221232
if {$prg_exit_state != 0} {
222233
ts_log_severe "qconf -as $submit_host failed:\n$output"
@@ -227,7 +238,7 @@ proc submit_hosts_test {} {
227238

228239
# now test again
229240
set job_ids {}
230-
foreach submit_host $ts_config(submit_only_hosts) {
241+
foreach submit_host $ts_config(submit_only_hosts) {
231242
ts_log_fine "testing submit host host $submit_host"
232243

233244
# submit a job from each submit host
@@ -248,7 +259,7 @@ proc submit_hosts_test {} {
248259
}
249260

250261
# test qstat, qrsh and qconf
251-
foreach submit_host $ts_config(submit_only_hosts) {
262+
foreach submit_host $ts_config(submit_only_hosts) {
252263
ts_log_fine "testing host $submit_host"
253264

254265
set output [start_sge_bin "qstat" "-f" $submit_host $CHECK_USER]

src/checktree/system_tests/qmaster/failover/check.exp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ proc qmaster_failover_cleanup {} {
9898
# should not enter state DISABLED after failure of qmaster if previously
9999
# enabled.
100100
#
101+
# This test sometimes fails due to CS-1597 (only static load values shall be written to spooling):
102+
# wait_for_load_from_all_queues waits for load values to be there, but if they are spooled,
103+
# then they are available immediately after qmaster restart.
104+
# The queue, on the other hand, will only leave the "u" state once the execds have reconnected.
101105
proc check_queue_state {} {
102106
global test_hostname
103107
global ts_config

src/tcl_files/sge_procedures.tcl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6534,7 +6534,7 @@ proc wait_for_jobstart {jobid jobname seconds {do_errorcheck 1} {do_tsm 0}} {
65346534
if {$do_errorcheck == 1} {
65356535
set qstat_output [start_sge_bin "qstat" "-f -g t -u '*'"]
65366536
set qstat_wp_output [start_sge_bin "qalter" "-w p $jobid"]
6537-
ts_log_severe "timeout waiting for job $jobid \"$jobname\"\n$qstat_output\n$qstat_wp_output"
6537+
ts_log_severe "timeout waiting for job $jobid \"$jobname\" to start\n$qstat_output\n$qstat_wp_output"
65386538
}
65396539
return -1
65406540
}

0 commit comments

Comments
 (0)