From a6ea906a698d4a9cec2883d26fd3d165b36d4b34 Mon Sep 17 00:00:00 2001 From: Yulu Jia Date: Wed, 11 Dec 2019 15:23:34 -0800 Subject: [PATCH] enable psm2 nameserver let psm2 use ip:port addr format and use nameserver to resolve it to the psm2 native addr. This patch is based off the cart commit used by daos master as of 2019-12-11. note: 1) orterun needs to use this flag: --mca mtl ^psm2,ofi 2) server side needs to pass this shell variable under orterun: -x FI_PSM2_NAME_SERVER=1 this variable tells psm2 to start the name server 3) optionally, an OFI_PORT variable can be specified on either the server side or client side. If OFI_PORT is not specified, cart will pick an unused port on its own. 4) to make a server use the same address across restarts, supply this variable when launching the server: -x OFI_PORT=xxx Signed-off-by: Yulu Jia --- SConstruct | 2 +- src/cart/crt_group.c | 3 +- src/cart/crt_hg.c | 10 ++++-- src/cart/crt_init.c | 63 ++++++++++++++++++++++++++++++--- src/crt_launch/crt_launch.c | 3 ++ src/test/SConscript | 8 ++--- src/test/test_ep_cred_client.c | 4 ++- src/test/test_group_np_cli.c | 2 +- src/test/test_hlc_net.c | 2 +- src/test/tests_common.h | 6 ++++ test/rpc/cart_rpc_two_node.yaml | 11 +++--- test/util/cart_utils.py | 20 ++++++++++- 12 files changed, 114 insertions(+), 20 deletions(-) diff --git a/SConstruct b/SConstruct index 41fbf9d75..187eeee3f 100644 --- a/SConstruct +++ b/SConstruct @@ -107,7 +107,7 @@ def scons(): # Compiler options env.Append(CCFLAGS=['-g3', '-Wshadow', '-Wall', '-Werror', '-fpic', '-D_GNU_SOURCE']) - env.Append(CCFLAGS=['-O2', '-pthread']) + env.Append(CCFLAGS=['-Og', '-pthread']) env.Append(CFLAGS=['-std=gnu99']) if not GetOption('clean'): env.AppendIfSupported(CCFLAGS=DESIRED_FLAGS) diff --git a/src/cart/crt_group.c b/src/cart/crt_group.c index 08d8c21c5..da4fb4234 100644 --- a/src/cart/crt_group.c +++ b/src/cart/crt_group.c @@ -4128,7 +4128,8 @@ crt_rank_uri_get(crt_group_t *group, d_rank_t rank, int tag, char 
**uri_str) D_GOTO(out, rc = -DER_INVAL); } - if (rank == grp_priv->gp_self) + if (rank == grp_priv->gp_self && crt_is_service() + && grp_priv == crt_gdata.cg_grp->gg_srv_pri_grp) return crt_self_uri_get(tag, uri_str); rc = crt_grp_lc_lookup(grp_priv, 0, rank, tag, &uri, &hg_addr); diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 42e502acf..8a5f3fb34 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -453,11 +453,13 @@ crt_get_info_string(char **string) } else { /* OFI_PORT is only for context 0 to use */ port = crt_na_ofi_conf.noc_port; - crt_na_ofi_conf.noc_port = -1; + crt_na_ofi_conf.noc_port++; D_ASPRINTF(*string, "%s://%s/%s:%d", plugin_str, crt_na_ofi_conf.noc_domain, crt_na_ofi_conf.noc_ip_str, port); +// D_ASPRINTF(*string, "%s://%s:%d", plugin_str, +// crt_na_ofi_conf.noc_ip_str, port); } if (*string == NULL) @@ -575,7 +577,11 @@ crt_hg_init(crt_phy_addr_t *addr, bool server) } } - D_DEBUG(DB_NET, "in crt_hg_init, listen address: %s.\n", *addr); + if (server) + D_DEBUG(DB_NET, "listening address: %s.\n", *addr); + else + D_DEBUG(DB_NET, "passive address: %s.\n", *addr); + crt_gdata.cg_hg = hg_gdata; out: diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index 610c7e026..5b0ed39f8 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -545,8 +545,8 @@ crt_finalize(void) crt_gdata.cg_inited = 0; gdata_init_flag = 0; - if (crt_gdata.cg_na_plugin == CRT_NA_OFI_SOCKETS) - crt_na_ofi_config_fini(); +// if (crt_gdata.cg_na_plugin == CRT_NA_OFI_SOCKETS) + crt_na_ofi_config_fini(); } else { D_RWLOCK_UNLOCK(&crt_gdata.cg_rwlock); } @@ -589,6 +589,55 @@ static inline na_bool_t is_integer_str(char *str) return NA_TRUE; } +static inline int +crt_get_port(int *port) +{ + int socketfd; + struct sockaddr_in tmp_socket; + socklen_t slen = sizeof(struct sockaddr); + int rc; + + socketfd = socket(AF_INET, SOCK_STREAM, 0); + if (socketfd == -1) { + D_ERROR("cannot create socket, errno: %d(%s).\n", + errno, strerror(errno)); + D_GOTO(out, rc = 
-DER_ADDRSTR_GEN); + } + tmp_socket.sin_family = AF_INET; + tmp_socket.sin_addr.s_addr = INADDR_ANY; + tmp_socket.sin_port = 0; + + rc = bind(socketfd, (const struct sockaddr *)&tmp_socket, + sizeof(tmp_socket)); + if (rc != 0) { + D_ERROR("cannot bind socket, errno: %d(%s).\n", + errno, strerror(errno)); + close(socketfd); + D_GOTO(out, rc = -DER_ADDRSTR_GEN); + } + + rc = getsockname(socketfd, (struct sockaddr *)&tmp_socket, &slen); + if (rc != 0) { + D_ERROR("cannot create getsockname, errno: %d(%s).\n", + errno, strerror(errno)); + close(socketfd); + D_GOTO(out, rc = -DER_ADDRSTR_GEN); + } + rc = close(socketfd); + if (rc != 0) { + D_ERROR("cannot close socket, errno: %d(%s).\n", + errno, strerror(errno)); + D_GOTO(out, rc = -DER_ADDRSTR_GEN); + } + + D_ASSERT(port != NULL); + *port = ntohs(tmp_socket.sin_port); + D_DEBUG(DB_ALL, "get a port: %d.\n", *port); + +out: + return rc; +} + int crt_na_ofi_config_init(void) { char *port_str; @@ -678,13 +727,19 @@ int crt_na_ofi_config_init(void) port_str = getenv("OFI_PORT"); if (crt_is_service() && port_str != NULL && strlen(port_str) > 0) { if (!is_integer_str(port_str)) { - D_DEBUG(DB_ALL, "ignore invalid OFI_PORT %s.", + D_DEBUG(DB_ALL, "ignoring invalid OFI_PORT %s.", port_str); } else { port = atoi(port_str); - D_DEBUG(DB_ALL, "OFI_PORT %d, use it as service " + D_DEBUG(DB_ALL, "OFI_PORT %d, using it as service " "port.\n", port); } + } else { + rc = crt_get_port(&port); + if (rc != 0) { + D_ERROR("crt_get_port failed, rc: %d.\n", rc); + D_GOTO(out, rc); + } } crt_na_ofi_conf.noc_port = port; diff --git a/src/crt_launch/crt_launch.c b/src/crt_launch/crt_launch.c index 0f961a3bc..22fa97eba 100644 --- a/src/crt_launch/crt_launch.c +++ b/src/crt_launch/crt_launch.c @@ -85,6 +85,7 @@ struct host { }; static int my_rank; +volatile static int myflag = 0; struct options_t { int is_client; @@ -153,6 +154,8 @@ get_self_uri(struct host *h) char *p; int len; int rc; + while (myflag) + sched_yield(); rc = crt_init(0, 
CRT_FLAG_BIT_SERVER | CRT_FLAG_BIT_PMIX_DISABLE | CRT_FLAG_BIT_LM_DISABLE); diff --git a/src/test/SConscript b/src/test/SConscript index 1d8813a31..0c55c7f65 100644 --- a/src/test/SConscript +++ b/src/test/SConscript @@ -78,10 +78,10 @@ def scons(): tenv.Requires(target, [cart_lib, gurt_lib]) tenv.Install(os.path.join("$PREFIX", 'TESTING', 'tests'), target) - for test in ECHO_TEST_SRC: - target = tenv.Program(test) - tenv.Requires(target, [cart_lib, gurt_lib]) - tenv.Install(os.path.join("$PREFIX", 'TESTING', 'tests'), target) +# for test in ECHO_TEST_SRC: +# target = tenv.Program(test) +# tenv.Requires(target, [cart_lib, gurt_lib]) +# tenv.Install(os.path.join("$PREFIX", 'TESTING', 'tests'), target) for test in IV_TESTS: target = tenv.Program(test) diff --git a/src/test/test_ep_cred_client.c b/src/test/test_ep_cred_client.c index 45baaad8c..88216973f 100644 --- a/src/test/test_ep_cred_client.c +++ b/src/test/test_ep_cred_client.c @@ -69,6 +69,7 @@ rpc_handle_ping_front_q(const struct crt_cb_info *info) static void test_run() { + myflag = 0; crt_group_t *grp = NULL; d_rank_list_t *rank_list = NULL; crt_rpc_t *rpc = NULL; @@ -92,10 +93,11 @@ test_run() DBG_PRINT("Number of credits: %d Number of burst: %d\n", test.tg_credits, test.tg_burst_count); + sleep(2); tc_cli_start_basic(test.tg_local_group_name, test.tg_remote_group_name, &grp, &rank_list, &test.tg_crt_ctx, - &test.tg_tid, true, test.tg_save_cfg, &opt); + &test.tg_tid, 1, test.tg_save_cfg, &opt); rc = sem_init(&test.tg_token_to_proceed, 0, 0); D_ASSERTF(rc == 0, "sem_init() failed.\n"); diff --git a/src/test/test_group_np_cli.c b/src/test/test_group_np_cli.c index 93eefbba1..40489f719 100644 --- a/src/test/test_group_np_cli.c +++ b/src/test/test_group_np_cli.c @@ -167,7 +167,7 @@ int main(int argc, char **argv) } /* rank, num_attach_retries, is_server, assert_on_error */ - tc_test_init(0, 20, false, true); + tc_test_init(0, 100, false, true); test_run(); diff --git a/src/test/test_hlc_net.c 
b/src/test/test_hlc_net.c index 43551c36e..e664b84e0 100644 --- a/src/test/test_hlc_net.c +++ b/src/test/test_hlc_net.c @@ -280,7 +280,7 @@ static int srv_init(void) int main(int argc, char *argv[]) { - int i, rc; + int i, rc = 0; dbg("---%s--->", __func__); diff --git a/src/test/tests_common.h b/src/test/tests_common.h index 6466dca61..2938ec0ea 100644 --- a/src/test/tests_common.h +++ b/src/test/tests_common.h @@ -44,6 +44,7 @@ #include #include "crt_internal.h" +volatile int myflag = 0; #define DBG_PRINT(x...) \ do { \ @@ -372,6 +373,9 @@ tc_cli_start_basic(char *local_group_name, char *srv_group_name, int attach_retries = opts.num_attach_retries; int rc = 0; + while (myflag) + sched_yield(); + D_ASSERTF(opts.is_initialized == true, "tc_test_init not called.\n"); rc = d_log_init(); @@ -461,6 +465,8 @@ tc_srv_start_basic(char *srv_group_name, crt_context_t *crt_ctx, rc = d_log_init(); D_ASSERT(rc == 0); + while (myflag) + sched_yield(); if (init_opt) { rc = crt_init_opt(srv_group_name, CRT_FLAG_BIT_SERVER | CRT_FLAG_BIT_PMIX_DISABLE | diff --git a/test/rpc/cart_rpc_two_node.yaml b/test/rpc/cart_rpc_two_node.yaml index a9d56c5ea..8dc5a0f16 100755 --- a/test/rpc/cart_rpc_two_node.yaml +++ b/test/rpc/cart_rpc_two_node.yaml @@ -5,8 +5,11 @@ defaultENV: #!filter-only : /run/env_CRT_CTX_SHARE_ADDR/sep #!filter-only : /run/tests/rpc_error D_LOG_MASK: "DEBUG,MEM=ERR" - CRT_PHY_ADDR_STR: "ofi+sockets" - OFI_INTERFACE: "eth0" + D_LOG_FILE_APPEND_PID: "1" + CRT_PHY_ADDR_STR: "ofi+psm2" + OFI_INTERFACE: "ib0" + OFI_PORT: "22222" + FI_PSM2_NAME_SERVER: "1" srv_CRT_CTX_NUM: "16" cli_CRT_CTX_NUM: "16" env_CRT_CTX_SHARE_ADDR: !mux @@ -49,11 +52,11 @@ tests: !mux name: test_group_basic srv_bin: ../bin/crt_launch srv_arg: "-e tests/test_group_np_srv --name tg_srv_grp --cfg_path=." 
- srv_env: "-x D_FI_CONFIG=../etc/fault-inject-cart.yaml" + srv_env: "-x FI_PSM2_NAME_SERVER=1 -x D_LOG_FILE_APPEND_PID=1 -x OFI_PORT=44444 -x D_FI_CONFIG=../etc/fault-inject-cart.yaml" srv_ppn: "1" cli_bin: tests/test_group_np_cli cli_arg: "--name client_group --attach_to tg_srv_grp --cfg_path=." - cli_env: "-x D_FI_CONFIG=../etc/fault-inject-cart.yaml" + cli_env: "-x D_LOG_FILE_APPEND_PID=1 -x OFI_PORT=33333 -x D_FI_CONFIG=../etc/fault-inject-cart.yaml" cli_ppn: "1" ep_credits_1: name: ep_credits_1 diff --git a/test/util/cart_utils.py b/test/util/cart_utils.py index 5768bb681..69a8d3e03 100755 --- a/test/util/cart_utils.py +++ b/test/util/cart_utils.py @@ -139,9 +139,15 @@ def get_env(self, cartobj): log_file = os.path.join(log_path, "output.log") log_mask = cartobj.params.get("D_LOG_MASK", "/run/defaultENV/") + log_append_pid = cartobj.params.get("D_LOG_FILE_APPEND_PID", + "/run/defaultENV/") crt_phy_addr = cartobj.params.get("CRT_PHY_ADDR_STR", "/run/defaultENV/") ofi_interface = cartobj.params.get("OFI_INTERFACE", "/run/defaultENV/") + ofi_port = cartobj.params.get("OFI_PORT", "/run/defaultENV/") + fi_psm2_name_server = cartobj.params.get("FI_PSM2_NAME_SERVER", + "/run/defaultENV/") + ofi_share_addr = cartobj.params.get("CRT_CTX_SHARE_ADDR", "/run/env_CRT_CTX_SHARE_ADDR/*/") @@ -152,12 +158,21 @@ def get_env(self, cartobj): if log_mask is not None: env += " -x D_LOG_MASK={!s}".format(log_mask) + if log_append_pid: + env += " -x D_LOG_FILE_APPEND_PID={!s}".format(log_append_pid) + if crt_phy_addr is not None: env += " -x CRT_PHY_ADDR_STR={!s}".format(crt_phy_addr) if ofi_interface is not None: env += " -x OFI_INTERFACE={!s}".format(ofi_interface) + if ofi_port: + env += " -x OFI_PORT={!s}".format(ofi_port) + + if fi_psm2_name_server: + env += " -x FI_PSM2_NAME_SERVER={!s}".format(fi_psm2_name_server) + if ofi_share_addr is not None: env += " -x CRT_CTX_SHARE_ADDR={!s}".format(ofi_share_addr) @@ -225,7 +240,7 @@ def build_cmd(self, cartobj, env, host, 
report_uri=True, urifile=None): else: hostfile = self.write_host_file(tst_host,tst_ppn) - tst_cmd = "{} --mca btl self,tcp -N {} --hostfile {} "\ + tst_cmd = "{} --mca mtl ^psm2,ofi -N {} --hostfile {} "\ .format(orterun_bin, tst_ppn, hostfile) if urifile is not None: @@ -234,6 +249,9 @@ def build_cmd(self, cartobj, env, host, report_uri=True, urifile=None): else: tst_cmd += "--ompi-server file:{} ".format(urifile) + if host in ["srv", "srv2"]: +# env += " -x OFI_PORT=44444 -x FI_PSM2_NAME_SERVER=1 " + env += " -x FI_PSM2_NAME_SERVER=1 " tst_cmd += env if tst_ctx is not None: