Skip to content

Commit

Permalink
b/390177544 cart: Support dynamic mercury/libfabric loglevels (#15738)
Browse files Browse the repository at this point in the history
Mercury and libfabric are much too spammy to enable at lower
than warning level by default. Mercury does support adjustment
of its log level and logged subsystems at runtime, so this
patch provides access to that functionality from the
existing `dmg server set-logmasks` tool.

This version of the patch adds support for writing mercury/libfabric
debug logs to a separate file, in order to avoid undue burden on
the cloud logging pipeline. To use it, set HG_DBG_LOG_FILE in the
engine config to a logfile path. NB: This is a "raw" log file, i.e.
no rotation is done and it's flushed once per second.

To enable debug logging for just the mercury/libfabric libraries:
  dmg server set-logmasks -m EXTERNAL=DEBUG

To disable debug logging again:
  dmg server set-logmasks -m EXTERNAL=ERR

Note that the CaRT/Mercury wrapper (crt_hg.c) still uses the normal
dlog logger with the "hg" facility. If debug logging in this file is
desired, then the mask HG=DEBUG may be used for set-logmasks.
Be aware that the debug logging produced by this file is rather
copious and may cause problems for cloud logging, however!

Change-Id: Ib6fb74c42507dbfba2c5984a92433ad43b6e2df3
Signed-off-by: Michael MacDonald <[email protected]>
  • Loading branch information
mjmac committed Mar 9, 2025
1 parent c7508ad commit 2fc61f7
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 3 deletions.
73 changes: 71 additions & 2 deletions src/cart/crt_hg.c
Original file line number Diff line number Diff line change
Expand Up @@ -737,16 +737,37 @@ crt_get_info_string(bool primary, crt_provider_t provider, int iface_idx,
return rc;
}

static FILE *hg_dbg_log = NULL;
static uint64_t last_hg_dbg_flush = 0;

static int
crt_hg_log(FILE *stream, const char *fmt, ...)
{
va_list ap;
struct timeval tv;
int flags;

flags = d_log_check((intptr_t)stream);
if (flags == 0)
return 0;

/*
* b/401315986: If a separate logfile has been defined for
* Mercury debug logging, send DEBUG-level messages to it.
*/
if (hg_dbg_log != NULL && (flags & DLOG_PRIMASK) == DLOG_DBG) {
va_start(ap, fmt);
vfprintf(hg_dbg_log, fmt, ap);
va_end(ap);

(void)gettimeofday(&tv, 0);
if (tv.tv_sec - last_hg_dbg_flush > 1) {
fflush(hg_dbg_log);
last_hg_dbg_flush = tv.tv_sec;
}
return 0;
}

va_start(ap, fmt);
d_vlog(flags, fmt, ap);
va_end(ap);
Expand All @@ -768,10 +789,49 @@ crt_hg_free_protocol_info(struct na_protocol_info *na_protocol_info)
HG_Free_na_protocol_info(na_protocol_info);
}

void
crt_hg_reset_log_level()
{
char *e_loglev = NULL;

if (d_agetenv_str(&e_loglev, "HG_LOG_LEVEL") == 0) {
HG_Set_log_subsys(e_loglev);
d_freeenv_str(&e_loglev);
return;
}
HG_Set_log_level("warning");
}

void
crt_hg_set_log_level(const char *level)
{
HG_Set_log_level(level);
}

void
crt_hg_reset_log_subsys()
{
char *e_subsys = NULL;

if (d_agetenv_str(&e_subsys, "HG_LOG_SUBSYS") == 0) {
HG_Set_log_subsys(e_subsys);
d_freeenv_str(&e_subsys);
return;
}
HG_Set_log_subsys("hg,na");
}

void
crt_hg_set_log_subsys(const char *subsys)
{
HG_Set_log_subsys(subsys);
}

/* to be called only in crt_init */
int
crt_hg_init(void)
{
char *hg_dbg_log_file = NULL;
int rc = 0;

if (crt_initialized()) {
Expand All @@ -783,8 +843,17 @@ crt_hg_init(void)

if (!d_isenv_def("HG_LOG_SUBSYS")) {
if (!d_isenv_def("HG_LOG_LEVEL"))
HG_Set_log_level("warning");
HG_Set_log_subsys("hg,na");
crt_hg_reset_log_level();
crt_hg_reset_log_subsys();
}

d_agetenv_str(&hg_dbg_log_file, "HG_DBG_LOG_FILE");
if (hg_dbg_log_file) {
hg_dbg_log = fopen(hg_dbg_log_file, "w");
if (hg_dbg_log == NULL)
D_ERROR("failed to open mercury debug log %s: %s\n", hg_dbg_log_file,
strerror(errno));
d_freeenv_str(&hg_dbg_log_file);
}

/* import HG log */
Expand Down
8 changes: 8 additions & 0 deletions src/gurt/dlog.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1340,6 +1341,13 @@ int d_log_setmasks(const char *mstr, int mlen0)
else {
/* apply to all facilities */
for (facno = 0; facno < d_log_xst.fac_cnt; facno++) {
/* don't include external when setting blanket DEBUG -- must be
* enabled explicitly */
if (prino == DLOG_DBG &&
strncasecmp(d_log_xst.dlog_facs[facno].fac_aname, "external",
8) == 0)
continue;

tmp = d_log_setlogmask(facno, prino);
if (rv != -1)
rv = tmp;
Expand Down
12 changes: 12 additions & 0 deletions src/include/cart/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -2385,6 +2385,18 @@ crt_req_get_proto_ver(crt_rpc_t *req);
char *
crt_req_origin_addr_get(crt_rpc_t *rpc);

void
crt_hg_reset_log_level();

void
crt_hg_set_log_level(const char *level);

void
crt_hg_reset_log_subsys();

void
crt_hg_set_log_subsys(const char *subsys);

/** @}
*/

Expand Down
15 changes: 14 additions & 1 deletion src/mgmt/srv_drpc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -104,7 +105,7 @@ ds_mgmt_drpc_set_log_masks(Drpc__Call *drpc_req, Drpc__Response *drpc_resp)
Ctl__SetLogMasksResp resp;
uint8_t *body;
size_t len;
char retbuf[1024];
char retbuf[1024] = {'\0'};

/* Unpack the inner request from the drpc call body */
req = ctl__set_log_masks_req__unpack(&alloc.alloc,
Expand All @@ -130,6 +131,18 @@ ds_mgmt_drpc_set_log_masks(Drpc__Call *drpc_req, Drpc__Response *drpc_resp)
D_INFO("Received request to set log masks '%s' masks are now %s, debug streams (DD_MASK) "
"set to '%s'\n", req->masks, retbuf, req->streams);

/* b/390177544: Adjust mercury/libfabric logging at runtime */
if (strstr(retbuf, "external=DBUG") != NULL) {
crt_hg_set_log_level("debug");
crt_hg_set_log_subsys("hg,na,libfabric");
D_INFO("enabled DEBUG logging for external facility\n");
} else {
/* if it's not DEBUG, reset it to defaults */
crt_hg_reset_log_level();
crt_hg_reset_log_subsys();
D_INFO("reset logging to defaults for external facility\n");
}

len = ctl__set_log_masks_resp__get_packed_size(&resp);
D_ALLOC(body, len);
if (body == NULL) {
Expand Down
21 changes: 21 additions & 0 deletions src/mgmt/tests/mocks.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -497,6 +498,26 @@ crt_rank_self_set(d_rank_t rank, uint32_t group_version_min)
return 0;
}

void
crt_hg_reset_log_level()
{
}

void
crt_hg_set_log_level(const char *level)
{
}

void
crt_hg_reset_log_subsys()
{
}

void
crt_hg_set_log_subsys(const char *subsys)
{
}

void
dss_init_state_set(enum dss_init_state state)
{
Expand Down

0 comments on commit 2fc61f7

Please sign in to comment.