From c60d8a13c573e226b54addcb9385d4149f2230fa Mon Sep 17 00:00:00 2001
From: KaiGai Kohei
Date: Tue, 2 Apr 2024 15:15:16 +0900
Subject: [PATCH] gpucache: add config option validator - to avoid too large GPU memory consumption

---
 src/gpu_cache.c | 244 +++++++++++++++++++++++++++++++-----------------
 src/pg_strom.h  |   1 +
 src/pg_utils.h  |  73 +++++++++++++++
 src/relscan.c   |   2 +-
 4 files changed, 231 insertions(+), 89 deletions(-)

diff --git a/src/gpu_cache.c b/src/gpu_cache.c
index ce982e6b6..c4f5291d1 100644
--- a/src/gpu_cache.c
+++ b/src/gpu_cache.c
@@ -78,6 +78,33 @@ GpuCacheOptionsEqual(const GpuCacheOptions *a, const GpuCacheOptions *b)
             a->redo_buffer_size == b->redo_buffer_size);
 }
 
+/*
+ * GpuCacheTableSignatureBuffer
+ */
+typedef struct
+{
+    Oid         reltablespace;
+    Oid         relfilenode;    /* if 0, cannot have gpucache */
+    int16       relnatts;
+    GpuCacheOptions gc_options;
+    struct {
+        Oid     atttypid;
+        int32   atttypmod;
+        int16   attlen;
+        bool    attbyval;
+        char    attalign;
+        bool    attnotnull;
+        bool    attisdropped;
+    } attrs[FLEXIBLE_ARRAY_MEMBER];
+} GpuCacheTableSignatureBuffer;
+
+typedef struct
+{
+    Oid         table_oid;
+    uint64_t    signature;
+    GpuCacheOptions gc_options;
+} GpuCacheTableSignatureCache;
+
 /*
  * GpuCacheSharedState (shared structure; dynamic memory mapped)
  */
@@ -290,19 +317,17 @@ __parseSyncTriggerOptions(const char *__config, GpuCacheOptions *gc_options)
 
         if (strcmp(key, "gpu_device_id") == 0)
         {
-            int     i, gpu_device_id;
-            char   *end;
+            int     gpu_device_id;
 
-            gpu_device_id = strtol(value, &end, 10);
-            if (*end != '\0')
+            gpu_device_id = __strtol(value);
+            if (errno != 0)
             {
-                elog(WARNING, "gpucache: invalid option [%s]=[%s]",
-                     key, value);
+                elog(WARNING, "gpucache: invalid option [%s]=[%s] : %m", key, value);
                 return false;
             }
             cuda_dindex = -1;
-            for (i=0; i < numGpuDevAttrs; i++)
+            for (int i=0; i < numGpuDevAttrs; i++)
             {
                 if (gpuDevAttrs[i].DEV_ID == gpu_device_id)
                 {
@@ -313,81 +338,44 @@ __parseSyncTriggerOptions(const char *__config, GpuCacheOptions *gc_options)
 
             if (cuda_dindex < 0)
             {
-                elog(WARNING, "gpucache: gpu_device_id (%d) not found",
-                     gpu_device_id);
+                elog(WARNING, "gpucache: gpu_device_id (%d) not found", gpu_device_id);
                 return false;
             }
         }
         else if (strcmp(key, "max_num_rows") == 0)
         {
-            char   *end;
-
-            max_num_rows = strtol(value, &end, 10);
-            if (*end != '\0')
+            max_num_rows = __strtol(value);
+            if (errno != 0)
             {
-                elog(WARNING, "gpucache: invalid option [%s]=[%s]",
-                     key, value);
-                return false;
-            }
-            if (max_num_rows >= UINT_MAX)
-            {
-                elog(WARNING, "gpucache: max_num_rows too large (%lu)",
-                     max_num_rows);
+                elog(WARNING, "gpucache: invalid option [%s]=[%s] : %m", key, value);
                 return false;
             }
         }
         else if (strcmp(key, "gpu_sync_interval") == 0)
         {
-            char   *end;
-
-            gpu_sync_interval = strtol(value, &end, 10);
-            if (*end != '\0')
+            gpu_sync_interval = __strtol(value);
+            if (errno != 0)
             {
-                elog(WARNING, "gpucache: invalid option [%s]=[%s]",
-                     key, value);
+                elog(WARNING, "gpucache: invalid option [%s]=[%s] : %m", key, value);
                 return false;
             }
             gpu_sync_interval *= 1000000L;    /* [sec -> us] */
         }
         else if (strcmp(key, "gpu_sync_threshold") == 0)
         {
-            char   *end;
-
-            gpu_sync_threshold = strtol(value, &end, 10);
-            if (strcasecmp(end, "g") == 0 || strcasecmp(end, "gb") == 0)
-                gpu_sync_threshold = (gpu_sync_threshold << 30);
-            else if (strcasecmp(end, "m") == 0 || strcasecmp(end, "mb") == 0)
-                gpu_sync_threshold = (gpu_sync_threshold << 20);
-            else if (strcasecmp(end, "k") == 0 || strcasecmp(end, "kb") == 0)
-                gpu_sync_threshold = (gpu_sync_threshold << 10);
-            else if (*end != '\0')
+            gpu_sync_threshold = __strtosz(value);
+            if (errno != 0)
             {
-                elog(WARNING, "gpucache: invalid option [%s]=[%s]",
-                     key, value);
+                elog(WARNING, "gpucache: invalid option [%s]=[%s]", key, value);
                 return false;
             }
         }
         else if (strcmp(key, "redo_buffer_size") == 0)
         {
-            char   *end;
-
-            redo_buffer_size = strtol(value, &end, 10);
-            if (strcasecmp(end, "g") == 0 || strcasecmp(end, "gb") == 0)
-                redo_buffer_size = (redo_buffer_size << 30);
-            else if (strcasecmp(end, "m") == 0 || strcasecmp(end, "mb") == 0)
-                redo_buffer_size = (redo_buffer_size << 20);
-            else if (strcasecmp(end, "k") == 0 || strcasecmp(end, "kb") == 0)
-                redo_buffer_size = (redo_buffer_size << 10);
-            else if (*end != '\0')
-            {
-                elog(WARNING, "gpucache: invalid option [%s]=[%s]",
-                     key, value);
-                return false;
-            }
-            if (redo_buffer_size < (16UL << 20))
+            redo_buffer_size = __strtosz(value);
+            if (errno != 0)
             {
-                elog(WARNING, "gpucache: 'redo_buffer_size' too small (%zu)",
-                     redo_buffer_size);
+                elog(WARNING, "gpucache: invalid option [%s]=[%s]", key, value);
                 return false;
             }
         }
@@ -401,11 +389,6 @@ __parseSyncTriggerOptions(const char *__config, GpuCacheOptions *gc_options)
     {
         if (gpu_sync_threshold < 0)
             gpu_sync_threshold = redo_buffer_size / 4;
-        if (gpu_sync_threshold > redo_buffer_size / 2)
-        {
-            elog(WARNING, "gpucache: gpu_sync_threshold is too small");
-            return false;
-        }
         if (rowid_hash_nslots < 0)
             rowid_hash_nslots = (max_num_rows + max_num_rows / 5);
         gc_options->cuda_dindex = cuda_dindex;
@@ -418,6 +401,85 @@ __parseSyncTriggerOptions(const char *__config, GpuCacheOptions *gc_options)
     return true;
 }
 
+/*
+ * __validateSyncTriggerOptions
+ */
+static bool
+__validateSyncTriggerOptions(const GpuCacheTableSignatureBuffer *sig)
+{
+    const GpuCacheOptions *gc_options = &sig->gc_options;
+    int         nr_colmeta = sig->relnatts + 1;
+    int64_t     max_num_rows = gc_options->max_num_rows;
+    size_t      main_sz = 0;
+    size_t      extra_sz = 0;
+    int         unitsz;
+
+    if (gc_options->redo_buffer_size < (16UL << 20))
+    {
+        elog(WARNING, "gpucache: 'redo_buffer_size' is too small (%zu)",
+             gc_options->redo_buffer_size);
+        return false;
+    }
+    if (gc_options->gpu_sync_threshold > gc_options->redo_buffer_size / 2)
+    {
+        elog(WARNING, "gpucache: gpu_sync_threshold is too large (%.2f%% of 'redo_buffer_size')",
+             ((double)gc_options->gpu_sync_threshold /
+              (double)gc_options->redo_buffer_size) * 100.0);
+        return false;
+    }
+    if (gc_options->max_num_rows >= UINT_MAX)
+    {
+        elog(WARNING, "gpucache: max_num_rows too large (%lu)",
+             gc_options->max_num_rows);
+        return false;
+    }
+
+    /* check initial kds_column/kds_extra size */
+    for (int j=0; j < sig->relnatts; j++)
+    {
+        if (!sig->attrs[j].attnotnull)
+            main_sz += MAXALIGN(BITMAPLEN(max_num_rows));
+        if (sig->attrs[j].attlen > 0)
+        {
+            unitsz = att_align_nominal(sig->attrs[j].attlen,
+                                       sig->attrs[j].attalign);
+            main_sz += MAXALIGN(unitsz * max_num_rows);
+        }
+        else if (sig->attrs[j].attlen == -1)
+        {
+            main_sz += MAXALIGN(sizeof(uint32_t) * max_num_rows);
+            unitsz = get_typavgwidth(sig->attrs[j].atttypid,
+                                     sig->attrs[j].atttypmod);
+            extra_sz += MAXALIGN(unitsz) * max_num_rows;
+        }
+        else
+        {
+            elog(ERROR, "unexpected type length (%d) for type %s",
+                 sig->attrs[j].attlen,
+                 format_type_be(sig->attrs[j].atttypid));
+        }
+        nr_colmeta += count_num_of_subfields(sig->attrs[j].atttypid);
+    }
+    main_sz += (MAXALIGN(offsetof(kern_data_store, colmeta[nr_colmeta])) +  /* KDS Header */
+                MAXALIGN(sizeof(GpuCacheSysattr) * max_num_rows));          /* System Column */
+
+    if (extra_sz > 0)
+    {
+        /* 25% margin + header */
+        extra_sz += extra_sz / 4;
+        extra_sz += offsetof(kern_data_extra, data);
+    }
+    if (main_sz >= __KDS_LENGTH_LIMIT || extra_sz >= __KDS_LENGTH_LIMIT)
+    {
+        elog(WARNING, "gpucache: max_num_rows = %ld consumes too much GPU device memory (main: %s, extra: %s); consider reducing 'max_num_rows'",
+             max_num_rows,
+             format_bytesz(main_sz),
+             format_bytesz(extra_sz));
+        return false;
+    }
+    return true;
+}
+
 /* ------------------------------------------------------------
  *
  * Routines to manage the table signature
@@ -428,27 +490,6 @@ __parseSyncTriggerOptions(const char *__config, GpuCacheOptions *gc_options)
  * The table signature is a simple and lightweight way to detect these cases.
  * ------------------------------------------------------------ */
 
-typedef struct
-{
-    Oid         reltablespace;
-    Oid         relfilenode;    /* if 0, cannot have gpucache */
-    int16       relnatts;
-    GpuCacheOptions gc_options;
-    struct {
-        Oid     atttypid;
-        int32   atttypmod;
-        bool    attnotnull;
-        bool    attisdropped;
-    } attrs[FLEXIBLE_ARRAY_MEMBER];
-} GpuCacheTableSignatureBuffer;
-
-typedef struct
-{
-    Oid         table_oid;
-    uint64_t    signature;
-    GpuCacheOptions gc_options;
-} GpuCacheTableSignatureCache;
-
 static void
 __gpuCacheTableSignature(Relation rel, GpuCacheTableSignatureCache *entry)
 {
@@ -502,6 +543,7 @@ __gpuCacheTableSignature(Relation rel, GpuCacheTableSignatureCache *entry)
                                            &sig->gc_options)))
             {
                 sig->gc_options.tg_sync_row = trig->tgoid;
+                break;
             }
             else
             {
@@ -517,11 +559,17 @@ __gpuCacheTableSignature(Relation rel, GpuCacheTableSignatureCache *entry)
     {
         Form_pg_attribute attr = TupleDescAttr(tupdesc, j);
 
-        sig->attrs[j].atttypid = attr->atttypid;
-        sig->attrs[j].atttypmod = attr->atttypmod;
+        sig->attrs[j].atttypid     = attr->atttypid;
+        sig->attrs[j].atttypmod    = attr->atttypmod;
+        sig->attrs[j].attlen       = attr->attlen;
+        sig->attrs[j].attbyval     = attr->attbyval;
+        sig->attrs[j].attalign     = attr->attalign;
         sig->attrs[j].attnotnull   = attr->attnotnull;
         sig->attrs[j].attisdropped = attr->attisdropped;
     }
+    /* validate option */
+    if (!__validateSyncTriggerOptions(sig))
+        goto no_gpu_cache;
     memcpy(&entry->gc_options, &sig->gc_options,
            sizeof(GpuCacheOptions));
     entry->signature = hash_any((unsigned char *)sig, len) | 0x100000000UL;
@@ -667,11 +715,18 @@ __gpuCacheTableSignatureSnapshot(HeapTuple pg_class_tuple,
 
         Assert(attr->attnum > 0 && attr->attnum <= sig->relnatts);
         j = attr->attnum - 1;
-        sig->attrs[j].atttypid = attr->atttypid;
-        sig->attrs[j].atttypmod = attr->atttypmod;
+        sig->attrs[j].atttypid     = attr->atttypid;
+        sig->attrs[j].atttypmod    = attr->atttypmod;
+        sig->attrs[j].attlen       = attr->attlen;
+        sig->attrs[j].attbyval     = attr->attbyval;
+        sig->attrs[j].attalign     = attr->attalign;
         sig->attrs[j].attnotnull   = attr->attnotnull;
         sig->attrs[j].attisdropped = attr->attisdropped;
     }
+    /* validate options */
+    if (!__validateSyncTriggerOptions(sig))
+        goto no_gpu_cache;
+
     systable_endscan(sscan);
     table_close(srel, AccessShareLock);
 
@@ -1016,8 +1071,11 @@ __resetGpuCacheSharedState(GpuCacheSharedState *gc_sstate)
     gc_sstate->redo_sync_pos = 0;
     pthreadMutexUnlock(&gc_sstate->redo_mutex);
 
+    /* initial buffer size should be legal */
+    Assert(gc_sstate->kds_head.length <= __KDS_LENGTH_LIMIT &&
+           gc_sstate->kds_extra_sz <= __KDS_LENGTH_LIMIT);
     /* make this GpuCache available again */
-    pg_atomic_init_u32(&gc_sstate->phase, GCACHE_PHASE__IS_EMPTY);
+    pg_atomic_write_u32(&gc_sstate->phase, GCACHE_PHASE__IS_EMPTY);
 }
 
 /*
@@ -3314,6 +3372,16 @@ __gpucacheExecCompactionKernel(GpuCacheControlCommand *cmd,
     if (kds_extra->usage > kds_extra->length)
     {
         gcache_extra_size = PAGE_ALIGN(kds_extra->usage * 5 / 4);    /* 25% margin */
+        if (gcache_extra_size >= __KDS_LENGTH_LIMIT)
+        {
+#if 1
+            fprintf(stderr,
+                    "GpuCache (%s) extra buffer (%ldMB) exceeds the hard limit\n",
+                    gc_sstate->table_name,
+                    gcache_extra_size >> 20);
+#endif
+            goto bailout;
+        }
         cuMemFree(m_kds_extra);
         m_kds_extra = 0UL;
         goto retry;
diff --git a/src/pg_strom.h b/src/pg_strom.h
index 5a71841fe..f69fdf89f 100644
--- a/src/pg_strom.h
+++ b/src/pg_strom.h
@@ -693,6 +693,7 @@ extern Path *pgstromTryFindGistIndex(PlannerInfo *root,
 extern Bitmapset *pickup_outer_referenced(PlannerInfo *root,
                                           RelOptInfo *base_rel,
                                           Bitmapset *referenced);
+extern int  count_num_of_subfields(Oid type_oid);
 extern size_t estimate_kern_data_store(TupleDesc tupdesc);
 extern size_t setup_kern_data_store(kern_data_store *kds,
                                     TupleDesc tupdesc,
diff --git a/src/pg_utils.h b/src/pg_utils.h
index a295e94cc..608e03ad0 100644
--- a/src/pg_utils.h
+++ b/src/pg_utils.h
@@ -112,6 +112,79 @@ __trim(char *token)
     return token;
 }
 
+/*
+ * __strtol / __strtoul / __strtosz - set errno if token is not pure digits
+ */
+static inline long int
+__strtol(const char *token)
+{
+    long int    ival;
+    char       *end;
+
+    errno = 0;    /* clear */
+    ival = strtol(token, &end, 10);
+    if (*end != '\0')
+        errno = EINVAL;
+    return ival;
+}
+
+static inline unsigned long int
+__strtoul(const char *token)
+{
+    unsigned long int ival;
+    char       *end;
+
+    errno = 0;    /* clear */
+    ival = strtoul(token, &end, 10);
+    if (*end != '\0')
+        errno = EINVAL;
+    return ival;
+}
+
+static inline size_t
+__strtosz(const char *token)
+{
+    size_t      sz;
+    char       *end;
+
+    errno = 0;    /* clear */
+    sz = strtoul(token, &end, 10);
+    if (errno == 0)
+    {
+        if (strcasecmp(end, "t") == 0 || strcasecmp(end, "tb") == 0)
+        {
+            if (sz > 0x0000000000ffffffUL)
+                errno = ERANGE;
+            else
+                sz <<= 40;
+        }
+        else if (strcasecmp(end, "g") == 0 || strcasecmp(end, "gb") == 0)
+        {
+            if (sz > 0x00000003ffffffffUL)
+                errno = ERANGE;
+            else
+                sz <<= 30;
+        }
+        else if (strcasecmp(end, "m") == 0 || strcasecmp(end, "mb") == 0)
+        {
+            if (sz > 0x00000fffffffffffUL)
+                errno = ERANGE;
+            else
+                sz <<= 20;
+        }
+        else if (strcasecmp(end, "k") == 0 || strcasecmp(end, "kb") == 0)
+        {
+            if (sz > 0x003fffffffffffffUL)
+                errno = ERANGE;
+            else
+                sz <<= 10;
+        }
+        else if (*end != '\0')
+            errno = EINVAL;
+    }
+    return sz;
+}
+
 /* lappend on the specified memory-context */
 static inline List *
 lappend_cxt(MemoryContext memcxt, List *list, void *datum)
diff --git a/src/relscan.c b/src/relscan.c
index d9cdae2f9..86ce27ea1 100644
--- a/src/relscan.c
+++ b/src/relscan.c
@@ -86,7 +86,7 @@ pickup_outer_referenced(PlannerInfo *root,
  *
  * ----------------------------------------------------------------
  */
-static int
+int
 count_num_of_subfields(Oid type_oid)
 {
     TypeCacheEntry *tcache;
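
Reviewer note (not part of the patch): the snippet below is a minimal, standalone sketch of the parsing convention that the new __strtosz() helper in src/pg_utils.h follows -- plain digits with an optional k/kb/m/mb/g/gb (or t/tb) suffix, reporting malformed input via errno = EINVAL and overflow via errno = ERANGE. The demo_strtosz() copy is trimmed for brevity (the t/tb branch is omitted) and the sample tokens are invented for illustration; none of this code lives in the PG-Strom tree.

/* demo_strtosz.c -- illustration only; mirrors the errno convention of __strtosz() */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

static size_t
demo_strtosz(const char *token)
{
    size_t  sz;
    char   *end;

    errno = 0;                  /* clear, as __strtosz() does */
    sz = strtoul(token, &end, 10);
    if (errno == 0)
    {
        if (strcasecmp(end, "g") == 0 || strcasecmp(end, "gb") == 0)
        {
            if (sz > 0x00000003ffffffffUL)
                errno = ERANGE; /* value would overflow once shifted */
            else
                sz <<= 30;
        }
        else if (strcasecmp(end, "m") == 0 || strcasecmp(end, "mb") == 0)
        {
            if (sz > 0x00000fffffffffffUL)
                errno = ERANGE;
            else
                sz <<= 20;
        }
        else if (strcasecmp(end, "k") == 0 || strcasecmp(end, "kb") == 0)
        {
            if (sz > 0x003fffffffffffffUL)
                errno = ERANGE;
            else
                sz <<= 10;
        }
        else if (*end != '\0')
            errno = EINVAL;     /* trailing garbage, e.g. "123x" */
    }
    return sz;
}

int main(void)
{
    const char *samples[] = { "160m", "1gb", "64", "12345x", "99999999999gb" };

    for (int i = 0; i < (int)(sizeof(samples) / sizeof(samples[0])); i++)
    {
        size_t  sz = demo_strtosz(samples[i]);

        if (errno != 0)
            printf("%-15s -> error: %s\n", samples[i], strerror(errno));
        else
            printf("%-15s -> %zu bytes\n", samples[i], sz);
    }
    return 0;
}

Running it prints, for example, "160m -> 167772160 bytes" and "12345x -> error: Invalid argument"; this check-errno-after-call contract is exactly what the reworked __parseSyncTriggerOptions() relies on after each __strtol() / __strtosz() call.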