Skip to content

Commit b817b65

Browse files
committed
BF: CS-619 queue limits which are overwritten on the submission command line only take effect if they are overwritten in the global scope
1 parent 0a337c1 commit b817b65

File tree

3 files changed

+219
-123
lines changed

3 files changed

+219
-123
lines changed

source/daemons/execd/dispatcher.cc

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,32 @@
11
/*___INFO__MARK_BEGIN__*/
22
/*************************************************************************
3-
*
3+
*
44
* The Contents of this file are made available subject to the terms of
55
* the Sun Industry Standards Source License Version 1.2
6-
*
6+
*
77
* Sun Microsystems Inc., March, 2001
8-
*
9-
*
8+
*
9+
*
1010
* Sun Industry Standards Source License Version 1.2
1111
* =================================================
1212
* The contents of this file are subject to the Sun Industry Standards
1313
* Source License Version 1.2 (the "License"); You may not use this file
1414
* except in compliance with the License. You may obtain a copy of the
1515
* License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
16-
*
16+
*
1717
* Software provided under this License is provided on an "AS IS" basis,
1818
* WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
1919
* WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
2020
* MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
2121
* See the License for the specific provisions governing your rights and
2222
* obligations concerning the Software.
23-
*
23+
*
2424
* The Initial Developer of the Original Code is: Sun Microsystems, Inc.
25-
*
25+
*
2626
* Copyright: 2001 by Sun Microsystems, Inc.
27-
*
27+
*
2828
* All Rights Reserved.
29-
*
29+
*
3030
* Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH
3131
*
3232
************************************************************************/
@@ -88,7 +88,7 @@ int sge_execd_process_messages() {
8888
last_alive_check = sge_get_gmt64();
8989
last_heard = last_alive_check;
9090

91-
/* calculate alive check interval based on load report time POS 1/2
91+
/* calculate alive check interval based on load report time POS 1/2
9292
* If modified, please also change POS 2/2
9393
*/
9494
load_report_time = sge_gmt32_to_gmt64(mconf_get_load_report_time());
@@ -141,13 +141,17 @@ int sge_execd_process_messages() {
141141

142142
switch (msg.tag) {
143143
case ocs::gdi::ClientServerBase::TAG_JOB_EXECUTION:
144+
// Here we just store the job in the master job list or add a pe task to the job->ja_task->task_list.
145+
// It will be executed later in do_ck_to_do() -> sge_start_jobs().
144146
if (init_packbuffer(&apb, 1024) == PACK_SUCCESS) {
145147
do_job_exec(&msg, &apb, from_qmaster);
146148
is_apb_used = true;
147149
atag = msg.tag;
148150
}
149151
break;
150152
case ocs::gdi::ClientServerBase::TAG_SLAVE_ALLOW:
153+
// Here we store the job in the master job list.
154+
// This allows us to start tasks of tightly integrated jobs later (tasks submitted via qrsh -inherit).
151155
do_job_slave(&msg);
152156
break;
153157
case ocs::gdi::ClientServerBase::TAG_CHANGE_TICKET:
@@ -216,7 +220,7 @@ int sge_execd_process_messages() {
216220
default:
217221
do_reconnect = true;
218222
break;
219-
}
223+
}
220224
cl_commlib_trigger(cl_com_get_handle(component_get_component_name(), 0), 1);
221225
}
222226

@@ -226,7 +230,7 @@ int sge_execd_process_messages() {
226230

227231
if (do_reconnect) {
228232
/*
229-
* we are not connected, reconnect and register at qmaster ...
233+
* we are not connected, reconnect and register at qmaster ...
230234
*/
231235
if (cl_com_get_handle(prognames[EXECD], 1) == nullptr) {
232236
terminate = true; /* if we don't have a handle, we must leave
@@ -237,12 +241,12 @@ int sge_execd_process_messages() {
237241
ret = CL_RETVAL_HANDLE_NOT_FOUND;
238242
}
239243

240-
/*
241-
* trigger re-read of act_qmaster_file
244+
/*
245+
* trigger re-read of act_qmaster_file
242246
*/
243247
if (!terminate) {
244248
static u_long64 last_qmaster_file_read = 0;
245-
249+
246250
/* handle the case where the system clock was moved back */
247251
if (last_qmaster_file_read > now) {
248252
last_qmaster_file_read = 0;
@@ -295,7 +299,7 @@ int sge_execd_process_messages() {
295299
/* handle the case where the system clock was moved back, and do the test in any case */
296300
if (last_alive_check > now) {
297301
last_alive_check = 0;
298-
}
302+
}
299303
if (last_heard > now) {
300304
last_heard = 0;
301305
}
@@ -321,7 +325,7 @@ int sge_execd_process_messages() {
321325
DPRINTF("now - last_heard = " sge_u64 "\n", now - last_heard);
322326
DPRINTF("alive_check_interval= " sge_u64 "\n", alive_check_interval);
323327
#endif
324-
328+
325329
/*
326330
* last message was sent alive_check_interval seconds ago
327331
*/
@@ -331,7 +335,7 @@ int sge_execd_process_messages() {
331335
cl_com_handle_t* handle = cl_com_get_handle(prognames[EXECD],1);
332336
cl_com_SIRM_t* ep_status = nullptr;
333337

334-
/*
338+
/*
335339
* qmaster file has not changed, check the endpoint status
336340
*/
337341
ret_val = cl_commlib_get_endpoint_status(handle,

source/daemons/execd/execd_job_exec.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,9 @@ static int handle_job(lListElem *jelem, lListElem *jatep, int slave) {
240240
while (jep != nullptr) {
241241
if (job_search_task(jep, nullptr, jataskid) != nullptr) {
242242
DPRINTF("Job " sge_u32 "." sge_u32 " is already running - skip the new one\n", jobid, jataskid);
243+
// This also happens when a tightly integrated job is delivered to the master host.
244+
// We get the SLAVE container first, which is stored in the job list,
245+
// and then the job start order, in which case we get here.
243246
goto Ignore; /* don't set queue in error state */
244247
}
245248

0 commit comments

Comments
 (0)