1515 * All rights reserved.
1616 * Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved
1717 * Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
18- * Copyright (c) 2014 Research Organization for Information Science
18+ * Copyright (c) 2014-2017 Research Organization for Information Science
1919 * and Technology (RIST). All rights reserved.
2020 * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved.
2121 * Copyright (c) 2017 IBM Corporation. All rights reserved.
@@ -614,6 +614,9 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
614614 goto REPORT_ERROR ;
615615 }
616616
617+ /* spin up the spawn threads */
618+ orte_odls_base_start_threads (jdata );
619+
617620 /* to save memory, purge the job map of all procs other than
618621 * our own - for daemons, this will completely release the
619622 * proc structures. For the HNP, the proc structs will
@@ -727,9 +730,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
727730 int rc , i ;
728731 bool found ;
729732 orte_proc_state_t state ;
730- char * * argvptr ;
731- char * pathenv = NULL , * mpiexec_pathenv = NULL ;
732- char * full_search ;
733733
734734 ORTE_ACQUIRE_OBJECT (cd );
735735
@@ -772,44 +772,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
772772 goto errorout ;
773773 }
774774
775- /* Search for the OMPI_exec_path and PATH settings in the environment. */
776- for (argvptr = app -> env ; * argvptr != NULL ; argvptr ++ ) {
777- if (0 == strncmp ("OMPI_exec_path=" , * argvptr , 15 )) {
778- mpiexec_pathenv = * argvptr + 15 ;
779- }
780- if (0 == strncmp ("PATH=" , * argvptr , 5 )) {
781- pathenv = * argvptr + 5 ;
782- }
783- }
784-
785- /* If OMPI_exec_path is set (meaning --path was used), then create a
786- temporary environment to be used in the search for the executable.
787- The PATH setting in this temporary environment is a combination of
788- the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
789- then just use existing environment with PATH in it. */
790- if (NULL != mpiexec_pathenv ) {
791- argvptr = NULL ;
792- if (pathenv != NULL ) {
793- asprintf (& full_search , "%s:%s" , mpiexec_pathenv , pathenv );
794- } else {
795- asprintf (& full_search , "%s" , mpiexec_pathenv );
796- }
797- opal_setenv ("PATH" , full_search , true, & argvptr );
798- free (full_search );
799- } else {
800- argvptr = app -> env ;
801- }
802-
803- rc = orte_util_check_context_app (app , argvptr );
804- /* do not ERROR_LOG - it will be reported elsewhere */
805- if (NULL != mpiexec_pathenv ) {
806- opal_argv_free (argvptr );
807- }
808- if (ORTE_SUCCESS != rc ) {
809- state = ORTE_PROC_STATE_FAILED_TO_LAUNCH ;
810- goto errorout ;
811- }
812-
813775 /* did the user request we display output in xterms? */
814776 if (NULL != orte_xterm && !ORTE_FLAG_TEST (jobdat , ORTE_JOB_FLAG_DEBUGGER_DAEMON )) {
815777 opal_list_item_t * nmitem ;
@@ -878,15 +840,14 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
878840 cd -> argv [0 ] = param ;
879841 }
880842
881- if ( 5 < opal_output_get_verbosity ( orte_odls_base_framework .framework_output )) {
882- opal_output ( orte_odls_base_framework . framework_output , "%s odls:launch spawning child %s" ,
883- ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
884- ORTE_NAME_PRINT (& child -> name ));
843+ opal_output_verbose ( 5 , orte_odls_base_framework .framework_output ,
844+ "%s odls:launch spawning child %s" ,
845+ ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
846+ ORTE_NAME_PRINT (& child -> name ));
885847
848+ if (15 < opal_output_get_verbosity (orte_odls_base_framework .framework_output )) {
886849 /* dump what is going to be exec'd */
887- if (7 < opal_output_get_verbosity (orte_odls_base_framework .framework_output )) {
888- opal_dss .dump (orte_odls_base_framework .framework_output , app , ORTE_APP_CONTEXT );
889- }
850+ opal_dss .dump (orte_odls_base_framework .framework_output , app , ORTE_APP_CONTEXT );
890851 }
891852
892853 if (ORTE_SUCCESS != (rc = cd -> fork_local (cd ))) {
@@ -923,6 +884,9 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
923884 orte_odls_spawn_caddy_t * cd ;
924885 opal_event_base_t * evb ;
925886 char * effective_dir = NULL ;
887+ char * * argvptr ;
888+ char * pathenv = NULL , * mpiexec_pathenv = NULL ;
889+ char * full_search ;
926890
927891 ORTE_ACQUIRE_OBJECT (caddy );
928892
@@ -1105,6 +1069,44 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
11051069 goto GETOUT ;
11061070 }
11071071
1072+ /* Search for the OMPI_exec_path and PATH settings in the environment. */
1073+ for (argvptr = app -> env ; * argvptr != NULL ; argvptr ++ ) {
1074+ if (0 == strncmp ("OMPI_exec_path=" , * argvptr , 15 )) {
1075+ mpiexec_pathenv = * argvptr + 15 ;
1076+ }
1077+ if (0 == strncmp ("PATH=" , * argvptr , 5 )) {
1078+ pathenv = * argvptr + 5 ;
1079+ }
1080+ }
1081+
1082+ /* If OMPI_exec_path is set (meaning --path was used), then create a
1083+ temporary environment to be used in the search for the executable.
1084+ The PATH setting in this temporary environment is a combination of
1085+ the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
1086+ then just use existing environment with PATH in it. */
1087+ if (NULL != mpiexec_pathenv ) {
1088+ argvptr = NULL ;
1089+ if (pathenv != NULL ) {
1090+ asprintf (& full_search , "%s:%s" , mpiexec_pathenv , pathenv );
1091+ } else {
1092+ asprintf (& full_search , "%s" , mpiexec_pathenv );
1093+ }
1094+ opal_setenv ("PATH" , full_search , true, & argvptr );
1095+ free (full_search );
1096+ } else {
1097+ argvptr = app -> env ;
1098+ }
1099+
1100+ rc = orte_util_check_context_app (app , argvptr );
1101+ /* do not ERROR_LOG - it will be reported elsewhere */
1102+ if (NULL != mpiexec_pathenv ) {
1103+ opal_argv_free (argvptr );
1104+ }
1105+ if (ORTE_SUCCESS != rc ) {
1106+ goto GETOUT ;
1107+ }
1108+
1109+
11081110 /* tell all children that they are being launched via ORTE */
11091111 opal_setenv (OPAL_MCA_PREFIX "orte_launch" , "1" , true, & app -> env );
11101112
@@ -1186,10 +1188,17 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
11861188 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
11871189 ORTE_NAME_PRINT (& child -> name )));
11881190
1191+ /* determine the thread that will handle this child */
1192+ ++ orte_odls_globals .next_base ;
1193+ if (orte_odls_globals .num_threads <= orte_odls_globals .next_base ) {
1194+ orte_odls_globals .next_base = 0 ;
1195+ }
1196+ evb = orte_odls_globals .ev_bases [orte_odls_globals .next_base ];
1197+
11891198 /* set the waitpid callback here for thread protection and
11901199 * to ensure we can capture the callback on shortlived apps */
11911200 ORTE_FLAG_SET (child , ORTE_PROC_FLAG_ALIVE );
1192- orte_wait_cb (child , ompi_odls_base_default_wait_local_proc , NULL );
1201+ orte_wait_cb (child , orte_odls_base_default_wait_local_proc , evb , NULL );
11931202
11941203 /* dispatch this child to the next available launch thread */
11951204 cd = OBJ_NEW (orte_odls_spawn_caddy_t );
@@ -1228,16 +1237,11 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
12281237 goto GETOUT ;
12291238 }
12301239 }
1231- ++ orte_odls_globals .next_base ;
1232- if (orte_odls_globals .num_threads <= orte_odls_globals .next_base ) {
1233- orte_odls_globals .next_base = 0 ;
1234- }
12351240 opal_output_verbose (1 , orte_odls_base_framework .framework_output ,
12361241 "%s odls:dispatch %s to thread %d" ,
12371242 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
12381243 ORTE_NAME_PRINT (& child -> name ),
12391244 orte_odls_globals .next_base );
1240- evb = orte_odls_globals .ev_bases [orte_odls_globals .next_base ];
12411245 opal_event_set (evb , & cd -> ev , -1 ,
12421246 OPAL_EV_WRITE , orte_odls_base_spawn_proc , cd );
12431247 opal_event_set_priority (& cd -> ev , ORTE_MSG_PRI );
@@ -1255,11 +1259,6 @@ void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
12551259 free (effective_dir );
12561260 effective_dir = NULL ;
12571261 }
1258- /* tell the state machine that all local procs for this job
1259- * were launched so that it can do whatever it needs to do,
1260- * like send a state update message for all procs to the HNP
1261- */
1262- ORTE_ACTIVATE_JOB_STATE (jobdat , ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE );
12631262
12641263 ERROR_OUT :
12651264 /* ensure we reset our working directory back to our default location */
@@ -1323,8 +1322,10 @@ int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, i
13231322 * Wait for a callback indicating the child has completed.
13241323 */
13251324
1326- void ompi_odls_base_default_wait_local_proc ( orte_proc_t * proc , void * cbdata )
1325+ void orte_odls_base_default_wait_local_proc ( int fd , short sd , void * cbdata )
13271326{
1327+ orte_wait_tracker_t * t2 = (orte_wait_tracker_t * )cbdata ;
1328+ orte_proc_t * proc = t2 -> child ;
13281329 int i ;
13291330 orte_job_t * jobdat ;
13301331 orte_proc_state_t state = ORTE_PROC_STATE_WAITPID_FIRED ;
@@ -1528,6 +1529,8 @@ void ompi_odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
15281529 /* cancel the wait as this proc has already terminated */
15291530 orte_wait_cb_cancel (proc );
15301531 ORTE_ACTIVATE_PROC_STATE (& proc -> name , state );
1532+ /* cleanup the tracker */
1533+ OBJ_RELEASE (t2 );
15311534}
15321535
15331536typedef struct {
@@ -1903,17 +1906,17 @@ int orte_odls_base_default_restart_proc(orte_proc_t *child,
19031906 goto CLEANUP ;
19041907 }
19051908 }
1906- orte_wait_cb (child , ompi_odls_base_default_wait_local_proc , NULL );
1907-
19081909 ++ orte_odls_globals .next_base ;
19091910 if (orte_odls_globals .num_threads <= orte_odls_globals .next_base ) {
19101911 orte_odls_globals .next_base = 0 ;
19111912 }
1913+ evb = orte_odls_globals .ev_bases [orte_odls_globals .next_base ];
1914+ orte_wait_cb (child , orte_odls_base_default_wait_local_proc , evb , NULL );
1915+
19121916 OPAL_OUTPUT_VERBOSE ((5 , orte_odls_base_framework .framework_output ,
19131917 "%s restarting app %s" ,
19141918 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ), app -> app ));
19151919
1916- evb = orte_odls_globals .ev_bases [orte_odls_globals .next_base ];
19171920 opal_event_set (evb , & cd -> ev , -1 ,
19181921 OPAL_EV_WRITE , orte_odls_base_spawn_proc , cd );
19191922 opal_event_set_priority (& cd -> ev , ORTE_MSG_PRI );
0 commit comments