From 690632d0ee6fdb58e517c59926755f1d44104fab Mon Sep 17 00:00:00 2001
From: KaiGai Kohei
Date: Thu, 4 May 2023 11:34:41 +0900
Subject: [PATCH] v5.0 alpha-1

---
 PG_VERSIONS                              |    1 -
 {utils => deadcode}/pystrom/pystrom.c    |    0
 {utils => deadcode}/pystrom/setup.py     |    0
 {utils => deadcode}/pystrom/test.py      |    0
 next/Makefile                            |  132 -
 next/aggfuncs.c                          | 1225 ----
 next/arrow_fdw.c                         | 4657 -------
 next/codegen.c                           | 2881 --------
 next/cuda_common.h                       |  274 -
 next/cuda_gpujoin.cu                     |  602 --
 next/cuda_gpupreagg.cu                   | 1750 -----
 next/cuda_gpuscan.cu                     |  494 --
 next/extra.c                             |  424 --
 next/gpu_device.c                        |  698 --
 next/main.c                              |  533 --
 next/pg_strom.h                          |  941 ---
 next/relscan.c                           |  918 ---
 Makefile => old/Makefile                 |   21 +-
 Makefile.cuda => old/Makefile.cuda       |    0
 old/aggfuncs.c                           | 1407 ++++
 {next => old}/arrow_defs.h               |   78 +-
 old/arrow_fdw.c                          | 6239 +++++++++++++++++
 {next => old}/arrow_ipc.h                |    4 +-
 {next => old}/arrow_nodes.c              |  156 +-
 {src => old}/arrow_pgsql.c               |    0
 {src => old}/arrow_write.c               |    0
 old/codegen.c                            | 4929 ++++++++++++++
 {src => old}/cuda_basetype.h             |    0
 {src => old}/cuda_codegen.h              |    0
 {src => old}/cuda_common.cu              |    0
 old/cuda_common.h                        | 1837 +++++
 {src => old}/cuda_gcache.cu              |    0
 {src => old}/cuda_gcache.h               |    0
 old/cuda_gpujoin.cu                      | 1927 ++++++
 {src => old}/cuda_gpujoin.h              |    0
 old/cuda_gpupreagg.cu                    | 1773 +++++
 {src => old}/cuda_gpupreagg.h            |    0
 old/cuda_gpuscan.cu                      |  750 +++
 {src => old}/cuda_gpuscan.h              |    0
 {src => old}/cuda_gpusort.cu             |    0
 {src => old}/cuda_gpusort.h              |    0
 {src => old}/cuda_jsonlib.cu             |    0
 {src => old}/cuda_jsonlib.h              |    0
 {src => old}/cuda_misclib.cu             |    0
 {src => old}/cuda_misclib.h              |    0
 {src => old}/cuda_numeric.cu             |    0
 {src => old}/cuda_numeric.h              |    0
 {src => old}/cuda_postgis.cu             |    0
 {src => old}/cuda_postgis.h              |    0
 {src => old}/cuda_primitive.cu           |    0
 {src => old}/cuda_primitive.h            |    0
 {src => old}/cuda_program.c              |    0
 {src => old}/cuda_rangetype.cu           |    0
 {src => old}/cuda_rangetype.h            |    0
 {src => old}/cuda_textlib.cu             |    0
 {src => old}/cuda_textlib.h              |    0
 {src => old}/cuda_timelib.cu             |    0
 {src => old}/cuda_timelib.h              |    0
 {src => old}/cuda_utils.h                |    0
 {src => old}/datastore.c                 |    0
 {src => old}/device_attrs.h              |    0
 old/extra.c                              |  611 ++
 {next => old}/float2.c                   |  687 +-
 {next => old}/float2.h                   |  132 +-
 {src => old}/gpu_cache.c                 |    0
 {src => old}/gpu_context.c               |    0
 old/gpu_device.c                         |  685 ++
 {src => old}/gpu_mmgr.c                  |    0
 {src => old}/gpu_tasks.c                 |    0
 {utils => old}/gpuinfo.c                 |    2 +-
 {src => old}/gpujoin.c                   |    0
 {src => old}/gpupreagg.c                 |    0
 {src => old}/gpuscan.c                   |    0
 {next => old}/heterodb_extra.h           |   26 +-
 old/main.c                               |  628 ++
 {next => old}/misc.c                     |  928 +--
 {src => old}/nvrtc.c                     |    0
 {src => old}/pg_compat.h                 |    0
 pg_strom.control => old/pg_strom.control |    0
 old/pg_strom.h                           | 1938 ++++++
 old/relscan.c                            | 2222 ++++++
 {src => old}/shmbuf.c                    |    0
 {sql => old/sql}/pg_strom--2.2--2.3.sql  |    0
 {sql => old/sql}/pg_strom--2.2.sql       |    0
 {sql => old/sql}/pg_strom--2.3--3.0.sql  |    0
 {sql => old/sql}/pg_strom--3.0--4.0.sql  |    0
 {sql => old/sql}/pg_strom--3.0.sql       |    0
 {next => old}/tinyint.c                  |    4 +-
 src/Makefile                             |  132 +-
 {next => src}/Makefile.cuda              |    0
 src/aggfuncs.c                           | 1256 ++--
 src/arrow_defs.h                         |   78 +-
 src/arrow_fdw.c                          | 7868 +++++++++------------
 src/arrow_ipc.h                          |    4 +-
 src/arrow_nodes.c                        |  156 +-
 {next => src}/brin.c                     |    0
 src/codegen.c                            | 6766 +++++++------------
 src/cuda_common.h                        | 1977 +-----
 src/cuda_gpujoin.cu                      | 2213 ++----
 src/cuda_gpupreagg.cu                    | 3051 +++++----
 src/cuda_gpuscan.cu                      | 1084 ++-
 {next => src}/dpu/Makefile               |    0
 {next => src}/dpu/arrow_defs.h           |    0
 {next => src}/dpu/dpuserv.c              |    0
 {next => src}/dpu/dpuserv.h              |    0
 {next => src}/dpu/float2.h               |    0
 {next => src}/dpu/heterodb_extra.h       |    0
 {next => src}/dpu/xpu_basetype.cc        |    0
 {next => src}/dpu/xpu_basetype.h         |    0
 {next => src}/dpu/xpu_common.cc          |    0
 {next => src}/dpu/xpu_common.h           |    0
 {next => src}/dpu/xpu_misclib.cc         |    0
 {next => src}/dpu/xpu_misclib.h          |    0
 {next => src}/dpu/xpu_numeric.cc         |    0
 {next => src}/dpu/xpu_numeric.h          |    0
 {next => src}/dpu/xpu_opcodes.h          |    0
 {next => src}/dpu/xpu_textlib.cc         |    0
 {next => src}/dpu/xpu_textlib.h          |    0
 {next => src}/dpu/xpu_timelib.cc         |    0
 {next => src}/dpu/xpu_timelib.h          |    0
 {next => src}/dpu_device.c               |    0
 {next => src}/dpu_join.c                 |    0
 {next => src}/dpu_preagg.c               |    0
 {next => src}/dpu_scan.c                 |    0
 {next => src}/executor.c                 |    0
 src/extra.c                              |  453 +-
 src/float2.c                             |  687 +-
 src/float2.h                             |  132 +-
 src/gpu_device.c                         |  999 +--
 {next => src}/gpu_join.c                 |    0
 {next => src}/gpu_preagg.c               |    0
 {next => src}/gpu_scan.c                 |    0
 {next => src}/gpu_service.c              |    0
 src/heterodb_extra.h                     |   26 +-
 src/main.c                               |  467 +-
 src/misc.c                               |  928 ++-
 {next => src}/multirels.c                |    0
 {next => src}/pcie.c                     |    0
 {next => src}/pg_strom.control           |    0
 src/pg_strom.h                           | 2483 ++----
 {next => src}/pg_utils.h                 |    0
 src/relscan.c                            | 2706 ++------
 {next => src/sql}/pg_strom--5.0.sql      |    0
 src/tinyint.c                            |    4 +-
 {next => src}/xpu_basetype.cu            |    0
 {next => src}/xpu_basetype.h             |    0
 {next => src}/xpu_common.cu              |    0
 {next => src}/xpu_common.h               |    0
 {next => src}/xpu_misclib.cu             |    0
 {next => src}/xpu_misclib.h              |    0
 {next => src}/xpu_numeric.cu             |    0
 {next => src}/xpu_numeric.h              |    0
 {next => src}/xpu_opcodes.h              |    0
 {next => src}/xpu_textlib.cu             |    0
 {next => src}/xpu_textlib.h              |    0
 {next => src}/xpu_timelib.cu             |    0
 {next => src}/xpu_timelib.h              |    0
 utils/Makefile                           |    4 -
 utils/ssbm/bcd2.c                        |  237 -
 utils/ssbm/bcd2.h                        |   11 -
 utils/ssbm/bm_utils.c                    |  638 --
 utils/ssbm/build.c                       |  802 ---
 utils/ssbm/config.h                      |  179 -
 utils/ssbm/dbgen                         |  Bin 188424 -> 0 bytes
 utils/ssbm/dists.dss                     |  817 ---
 utils/ssbm/driver.c                      | 1156 ----
 utils/ssbm/dss.h                         |  596 --
 utils/ssbm/dsstypes.h                    |  312 -
 utils/ssbm/load_stub.c                   |  282 -
 utils/ssbm/permute.c                     |  175 -
 utils/ssbm/print.c                       | 1013 ---
 utils/ssbm/rnd.c                         |  262 -
 utils/ssbm/rnd.h                         |   80 -
 utils/ssbm/shared.h                      |  140 -
 utils/ssbm/speed_seed.c                  |  325 -
 utils/ssbm/ssbm-ddl.sql                  |  175 -
 utils/ssbm/text.c                        |  313 -
 177 files changed, 37989 insertions(+), 45512 deletions(-)
 delete mode 100644 PG_VERSIONS
 rename {utils => deadcode}/pystrom/pystrom.c (100%)
 rename {utils => deadcode}/pystrom/setup.py (100%)
 rename {utils => deadcode}/pystrom/test.py (100%)
 delete mode 100644 next/Makefile
 delete mode 100644 next/aggfuncs.c
 delete mode 100644 next/arrow_fdw.c
 delete mode 100644 next/codegen.c
 delete mode 100644 next/cuda_common.h
 delete mode 100644 next/cuda_gpujoin.cu
 delete mode 100644 next/cuda_gpupreagg.cu
 delete mode 100644 next/cuda_gpuscan.cu
 delete mode 100644 next/extra.c
 delete mode 100644 next/gpu_device.c
 delete mode 100644 next/main.c
 delete mode 100644 next/pg_strom.h
 delete mode 100644 next/relscan.c
 rename Makefile => old/Makefile (95%)
 rename Makefile.cuda => old/Makefile.cuda (100%)
 create mode 100644 old/aggfuncs.c
 rename {next => old}/arrow_defs.h (91%)
 create mode 100644 old/arrow_fdw.c
 rename {next => old}/arrow_ipc.h (99%)
 rename {next => old}/arrow_nodes.c (95%)
 rename {src => old}/arrow_pgsql.c (100%)
 rename {src => old}/arrow_write.c (100%)
 create mode 100644 old/codegen.c
 rename {src => old}/cuda_basetype.h (100%)
 rename {src => old}/cuda_codegen.h (100%)
 rename {src => old}/cuda_common.cu (100%)
 create mode 100644 old/cuda_common.h
 rename {src => old}/cuda_gcache.cu (100%)
 rename {src => old}/cuda_gcache.h (100%)
 create mode 100644 old/cuda_gpujoin.cu
 rename {src => old}/cuda_gpujoin.h (100%)
 create mode 100644 old/cuda_gpupreagg.cu
 rename {src => old}/cuda_gpupreagg.h (100%)
 create mode 100644 old/cuda_gpuscan.cu
 rename {src => old}/cuda_gpuscan.h (100%)
 rename {src => old}/cuda_gpusort.cu (100%)
 rename {src => old}/cuda_gpusort.h (100%)
 rename {src => old}/cuda_jsonlib.cu (100%)
 rename {src => old}/cuda_jsonlib.h (100%)
 rename {src => old}/cuda_misclib.cu (100%)
 rename {src => old}/cuda_misclib.h (100%)
 rename {src => old}/cuda_numeric.cu (100%)
 rename {src => old}/cuda_numeric.h (100%)
 rename {src => old}/cuda_postgis.cu (100%)
 rename {src => old}/cuda_postgis.h (100%)
 rename {src => old}/cuda_primitive.cu (100%)
 rename {src => old}/cuda_primitive.h (100%)
 rename {src => old}/cuda_program.c (100%)
 rename {src => old}/cuda_rangetype.cu (100%)
 rename {src => old}/cuda_rangetype.h (100%)
 rename {src => old}/cuda_textlib.cu (100%)
 rename {src => old}/cuda_textlib.h (100%)
 rename {src => old}/cuda_timelib.cu (100%)
 rename {src => old}/cuda_timelib.h (100%)
 rename {src => old}/cuda_utils.h (100%)
 rename {src => old}/datastore.c (100%)
 rename {src => old}/device_attrs.h (100%)
 create mode 100644 old/extra.c
 rename {next => old}/float2.c (50%)
 rename {next => old}/float2.h (56%)
 rename {src => old}/gpu_cache.c (100%)
 rename {src => old}/gpu_context.c (100%)
 create mode 100644 old/gpu_device.c
 rename {src => old}/gpu_mmgr.c (100%)
 rename {src => old}/gpu_tasks.c (100%)
 rename {utils => old}/gpuinfo.c (99%)
 rename {src => old}/gpujoin.c (100%)
 rename {src => old}/gpupreagg.c (100%)
 rename {src => old}/gpuscan.c (100%)
 rename {next => old}/heterodb_extra.h (59%)
 create mode 100644 old/main.c
 rename {next => old}/misc.c (68%)
 rename {src => old}/nvrtc.c (100%)
 rename {src => old}/pg_compat.h (100%)
 rename pg_strom.control => old/pg_strom.control (100%)
 create mode 100644 old/pg_strom.h
 create mode 100644 old/relscan.c
 rename {src => old}/shmbuf.c (100%)
 rename {sql => old/sql}/pg_strom--2.2--2.3.sql (100%)
 rename {sql => old/sql}/pg_strom--2.2.sql (100%)
 rename {sql => old/sql}/pg_strom--2.3--3.0.sql (100%)
 rename {sql => old/sql}/pg_strom--3.0--4.0.sql (100%)
 rename {sql => old/sql}/pg_strom--3.0.sql (100%)
 rename {next => old}/tinyint.c (99%)
 rename {next => src}/Makefile.cuda (100%)
 rename {next => src}/brin.c (100%)
 rename {next => src}/dpu/Makefile (100%)
 rename {next => src}/dpu/arrow_defs.h (100%)
 rename {next => src}/dpu/dpuserv.c (100%)
 rename {next => src}/dpu/dpuserv.h (100%)
 rename {next => src}/dpu/float2.h (100%)
 rename {next => src}/dpu/heterodb_extra.h (100%)
 rename {next => src}/dpu/xpu_basetype.cc (100%)
 rename {next => src}/dpu/xpu_basetype.h (100%)
 rename {next => src}/dpu/xpu_common.cc (100%)
 rename {next => src}/dpu/xpu_common.h (100%)
 rename {next => src}/dpu/xpu_misclib.cc (100%)
 rename {next => src}/dpu/xpu_misclib.h (100%)
 rename {next => src}/dpu/xpu_numeric.cc (100%)
 rename {next => src}/dpu/xpu_numeric.h (100%)
 rename {next => src}/dpu/xpu_opcodes.h (100%)
 rename {next => src}/dpu/xpu_textlib.cc (100%)
 rename {next => src}/dpu/xpu_textlib.h (100%)
 rename {next => src}/dpu/xpu_timelib.cc (100%)
 rename {next => src}/dpu/xpu_timelib.h (100%)
 rename {next => src}/dpu_device.c (100%)
 rename {next => src}/dpu_join.c (100%)
 rename {next => src}/dpu_preagg.c (100%)
 rename {next => src}/dpu_scan.c (100%)
 rename {next => src}/executor.c (100%)
 rename {next => src}/gpu_join.c (100%)
 rename {next => src}/gpu_preagg.c (100%)
 rename {next => src}/gpu_scan.c (100%)
 rename {next => src}/gpu_service.c (100%)
 rename {next => src}/multirels.c (100%)
 rename {next => src}/pcie.c (100%)
 rename {next => src}/pg_strom.control (100%)
 rename {next => src}/pg_utils.h (100%)
 rename {next => src/sql}/pg_strom--5.0.sql (100%)
 rename {next => src}/xpu_basetype.cu (100%)
 rename {next => src}/xpu_basetype.h (100%)
 rename {next => src}/xpu_common.cu (100%)
 rename {next => src}/xpu_common.h (100%)
 rename {next => src}/xpu_misclib.cu (100%)
 rename {next => src}/xpu_misclib.h (100%)
 rename {next => src}/xpu_numeric.cu (100%)
 rename {next => src}/xpu_numeric.h (100%)
 rename {next => src}/xpu_opcodes.h (100%)
 rename {next => src}/xpu_textlib.cu (100%)
 rename {next => src}/xpu_textlib.h (100%)
 rename {next => src}/xpu_timelib.cu (100%)
 rename {next => src}/xpu_timelib.h (100%)
 delete mode 100644 utils/Makefile
 delete mode 100644 utils/ssbm/bcd2.c
 delete mode 100644 utils/ssbm/bcd2.h
 delete mode 100644 utils/ssbm/bm_utils.c
 delete mode 100644 utils/ssbm/build.c
 delete mode 100644 utils/ssbm/config.h
 delete mode 100755 utils/ssbm/dbgen
 delete mode 100644 utils/ssbm/dists.dss
 delete mode 100644 utils/ssbm/driver.c
 delete mode 100644 utils/ssbm/dss.h
 delete mode 100644 utils/ssbm/dsstypes.h
 delete mode 100644 utils/ssbm/load_stub.c
 delete mode 100644 utils/ssbm/permute.c
 delete mode 100644 utils/ssbm/print.c
 delete mode 100644 utils/ssbm/rnd.c
 delete mode 100644 utils/ssbm/rnd.h
 delete mode 100644 utils/ssbm/shared.h
 delete mode 100644 utils/ssbm/speed_seed.c
 delete mode 100644 utils/ssbm/ssbm-ddl.sql
 delete mode 100644 utils/ssbm/text.c

diff --git a/PG_VERSIONS b/PG_VERSIONS
deleted file mode 100644
index 8053d2cc1..000000000
--- a/PG_VERSIONS
+++ /dev/null
@@ -1 +0,0 @@
-12 13 14 15
diff --git a/utils/pystrom/pystrom.c b/deadcode/pystrom/pystrom.c
similarity index 100%
rename from utils/pystrom/pystrom.c
rename to deadcode/pystrom/pystrom.c
diff --git a/utils/pystrom/setup.py b/deadcode/pystrom/setup.py
similarity index 100%
rename from utils/pystrom/setup.py
rename to deadcode/pystrom/setup.py
diff --git a/utils/pystrom/test.py b/deadcode/pystrom/test.py
similarity index 100%
rename from utils/pystrom/test.py
rename to deadcode/pystrom/test.py
diff --git a/next/Makefile b/next/Makefile
deleted file mode 100644
index f30e226c8..000000000
--- a/next/Makefile
+++ /dev/null
@@ -1,132 +0,0 @@
-#
-# PG-Strom Makefile
-#
-PG_CONFIG ?= pg_config
-
-ifndef STROM_BUILD_ROOT
-STROM_BUILD_ROOT=..
-endif
-
-#
-# PG-Strom version
-#
-PGSTROM_VERSION := 5.0
-PGSTROM_RELEASE := devel
-
-#
-# Source of PG-Strom host code
-#
-__STROM_OBJS = main.o extra.o codegen.o misc.o executor.o \
-	gpu_device.o gpu_scan.o gpu_join.o gpu_preagg.o \
-	dpu_device.o dpu_scan.o dpu_join.o dpu_preagg.o \
-	relscan.o brin.o gpu_service.o \
-	arrow_fdw.o arrow_nodes.o \
-	pcie.o float2.o tinyint.o aggfuncs.o
-STROM_OBJS = $(addprefix $(STROM_BUILD_ROOT)/next/,$(__STROM_OBJS))
-
-GPU_DEVATTRS_H = $(STROM_BUILD_ROOT)/next/gpu_devattrs.h
-GENERATED-HEADERS = $(GPU_DEVATTRS_H)
-
-#
-# Source of NVIDIA GPU device code
-#
-include $(STROM_BUILD_ROOT)/next/Makefile.cuda
-__CUDA_OBJS = xpu_common cuda_gpuscan cuda_gpujoin cuda_gpupreagg \
-	xpu_basetype xpu_numeric xpu_timelib xpu_textlib xpu_misclib
-__CUDA_HEADERS = cuda_common.h xpu_common.h xpu_opcodes.h xpu_basetype.h \
-	xpu_numeric.h xpu_textlib.h xpu_timelib.h xpu_misclib.h
-__CUDA_OPT_OBJS = $(addsuffix .fatbin,$(__CUDA_OBJS))
-__CUDA_DBG_OBJS = $(addsuffix .debug.fatbin,$(__CUDA_OBJS))
-CUDA_HEADERS = $(addprefix $(STROM_BUILD_ROOT)/next/,$(__CUDA_HEADERS))
-CUDA_OPT_OBJS = $(addprefix $(STROM_BUILD_ROOT)/next/,$(__CUDA_OPT_OBJS))
-CUDA_DBG_OBJS = $(addprefix $(STROM_BUILD_ROOT)/next/,$(__CUDA_DBG_OBJS))
-CUDA_OPT_MODULE = $(STROM_BUILD_ROOT)/next/pgstrom-core.fatbin
-CUDA_DBG_MODULE = $(STROM_BUILD_ROOT)/next/pgstrom-core.debug.fatbin
-
-#
-# Installation Scripts
-#
-__STROM_SQL = pg_strom--5.0.sql
-STROM_SQL = $(addprefix $(STROM_BUILD_ROOT)/next/,$(__STROM_SQL))
-
-#
-# GitHash to build
-#
-ifdef PGSTROM_GITHASH
-ifeq ($(PGSTROM_GITHASH),HEAD)
-PGSTROM_GITHASH = $(shell git rev-parse HEAD)
-endif
-else
-ifeq ($(shell test -e $(STROM_BUILD_ROOT)/.git/config && echo -n 1),1)
-PGSTROM_GITHASH = $(shell git rev-parse HEAD)
-ifneq ($(shell git diff | wc -l),0)
-PGSTROM_GITHASH_SUFFIX = ::local_changes
-endif
-else
-ifeq ($(shell test -e $(STROM_BUILD_ROOT)/GITHASH && echo -n 1),1)
-PGSTROM_GITHASH = $(shell cat $(STROM_BUILD_ROOT)/GITHASH)
-else
-PGSTROM_GITHASH = HEAD
-endif
-endif
-endif
-
-#
-# Flags to build
-#
-PGSTROM_FLAGS += $(PGSTROM_FLAGS_CUSTOM)
-PGSTROM_FLAGS += -D__PGSTROM_MODULE__=1
-PGSTROM_FLAGS += "-DPGSTROM_VERSION=\"$(PGSTROM_VERSION)\""
-
-PGSTROM_DEBUG = 1
-ifeq ($(PGSTROM_DEBUG),1)
-PGSTROM_FLAGS += -g -O0 -DPGSTROM_DEBUG_BUILD=1
-endif
-PGSTROM_FLAGS += -D__STROM_HOST__=1
-ifeq ($(shell uname -m),aarch64)
-PGSTROM_FLAGS += -DHAVE_FLOAT2 -mfp16-format=ieee
-endif
-PGSTROM_FLAGS += -DPGSTROM_GITHASH=\"$(PGSTROM_GITHASH)$(PGSTROM_GITHASH_SUFFIX)\"
-PGSTROM_FLAGS += -DPGSHAREDIR=\"$(shell $(PG_CONFIG) --sharedir)\"
-PGSTROM_FLAGS += -DCUDA_MAXREGCOUNT=$(MAXREGCOUNT)
-PGSTROM_FLAGS += -DCMD_GPUINFO_PATH=\"$(shell $(PG_CONFIG) --bindir)/gpuinfo\"
-PGSTROM_FLAGS += -DCUDA_BUILTIN_OBJS="\"$(__CUDA_OBJS)\""
-PG_CPPFLAGS := $(PGSTROM_FLAGS) -I $(CUDA_IPATH)
-SHLIB_LINK := -L $(CUDA_LPATH) -lcuda
-
-#
-# Definition of PG-Strom Extension
-#
-MODULE_big = pg_strom
-MODULEDIR = pg_strom
-DATA = $(STROM_SQL)
-OBJS = $(STROM_OBJS)
-DATA_built = $(CUDA_OPT_OBJS) $(CUDA_DBG_OBJS)
-EXTRA_CLEAN = $(DATA_built) $(GENERATED-HEADERS)
-EXTENSION = pg_strom
-
-PGXS := $(shell $(PG_CONFIG) --pgxs)
-include $(PGXS)
-
-#
-# Device Attributes
-#
-submake-generated-headers: $(GENERATED-HEADERS)
-
-$(GPU_DEVATTRS_H): $(CUDA_IPATH)/cuda.h
-	cat $(CUDA_IPATH)/cuda.h | \
-	grep -E '^[ ]+CU_DEVICE_ATTRIBUTE_' | \
-	grep -v -E 'CU_DEVICE_ATTRIBUTE_MAX$$' | \
-	grep -v 'Deprecated[,\.]' | \
-	sed -e 's|[ ]*CU_DEVICE_ATTRIBUTE_|DEV_ATTR(|g' \
-	    -e 's| =.*/\*\*<[ ]*|, "|g' \
-	    -e 's|[ ]*\*/|")|g' > $@
-
-#
-# GPU Device Code
-#
-%.fatbin: %.cu $(CUDA_HEADERS)
-	$(NVCC) $(NVCC_FLAGS) -o $@ $<
-%.debug.fatbin: %.cu $(CUDA_HEADERS)
-	$(NVCC) $(NVCC_DEBUG_FLAGS) -o $@ $<
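The $(GPU_DEVATTRS_H) rule above rewrites each CU_DEVICE_ATTRIBUTE_* enumerator in cuda.h into a DEV_ATTR(NAME, "description") line. A minimal sketch of how such a generated header is typically consumed on the C side via the X-macro pattern; the struct and array names here are illustrative, not taken from this patch:

    /* Hypothetical consumer of the generated gpu_devattrs.h (X-macro pattern) */
    #include <cuda.h>

    typedef struct
    {
        CUdevice_attribute attr_id;
        const char *attr_desc;
    } GpuDevAttrDesc;

    static const GpuDevAttrDesc gpu_devattr_desc_array[] = {
    #define DEV_ATTR(NAME,DESC)  { CU_DEVICE_ATTRIBUTE_##NAME, (DESC) },
    #include "gpu_devattrs.h"
    #undef DEV_ATTR
    };

Because the attribute list is regenerated from the installed cuda.h at build time, the table stays in sync with whatever CUDA toolkit the module is compiled against.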
diff --git a/next/aggfuncs.c b/next/aggfuncs.c
deleted file mode 100644
index a09def20f..000000000
--- a/next/aggfuncs.c
+++ /dev/null
@@ -1,1225 +0,0 @@
-/*
- * aggfuncs.c
- *
- * Definition of self-defined aggregate functions, used by GpuPreAgg
- * ----
- * Copyright 2011-2023 (C) KaiGai Kohei
- * Copyright 2014-2023 (C) PG-Strom Developers Team
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the PostgreSQL License.
- */
-#include "pg_strom.h"
-#include "float2.h"
-
-/*
- * Functions Declaration
- */
-PG_FUNCTION_INFO_V1(pgstrom_partial_nrows);
-
-PG_FUNCTION_INFO_V1(pgstrom_partial_minmax_int32);
-PG_FUNCTION_INFO_V1(pgstrom_partial_minmax_int64);
-PG_FUNCTION_INFO_V1(pgstrom_partial_minmax_fp32);
-PG_FUNCTION_INFO_V1(pgstrom_partial_minmax_fp64);
-PG_FUNCTION_INFO_V1(pgstrom_fmin_trans_int64);
-PG_FUNCTION_INFO_V1(pgstrom_fmin_trans_fp64);
-PG_FUNCTION_INFO_V1(pgstrom_fmax_trans_int64);
-PG_FUNCTION_INFO_V1(pgstrom_fmax_trans_fp64);
-PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_int8);
-PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_int16);
-PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_int32);
-PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_int64);
-PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_fp16);
-PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_fp32);
-PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_fp64);
-PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_numeric);
-
-PG_FUNCTION_INFO_V1(pgstrom_partial_sum_asis);
-
-PG_FUNCTION_INFO_V1(pgstrom_partial_avg_int);
-PG_FUNCTION_INFO_V1(pgstrom_partial_avg_fp);
-PG_FUNCTION_INFO_V1(pgstrom_favg_trans_int);
-PG_FUNCTION_INFO_V1(pgstrom_favg_trans_fp);
-PG_FUNCTION_INFO_V1(pgstrom_favg_final_int);
-PG_FUNCTION_INFO_V1(pgstrom_favg_final_fp);
-PG_FUNCTION_INFO_V1(pgstrom_favg_final_num);
-
-PG_FUNCTION_INFO_V1(pgstrom_partial_variance);
-PG_FUNCTION_INFO_V1(pgstrom_stddev_trans);
-PG_FUNCTION_INFO_V1(pgstrom_stddev_samp_final);
-PG_FUNCTION_INFO_V1(pgstrom_stddev_sampf_final);
-PG_FUNCTION_INFO_V1(pgstrom_stddev_pop_final);
-PG_FUNCTION_INFO_V1(pgstrom_stddev_popf_final);
-PG_FUNCTION_INFO_V1(pgstrom_var_samp_final);
-PG_FUNCTION_INFO_V1(pgstrom_var_sampf_final);
-PG_FUNCTION_INFO_V1(pgstrom_var_pop_final);
-PG_FUNCTION_INFO_V1(pgstrom_var_popf_final);
-
-PG_FUNCTION_INFO_V1(pgstrom_partial_covar);
-PG_FUNCTION_INFO_V1(pgstrom_covar_accum);
-PG_FUNCTION_INFO_V1(pgstrom_covar_samp_final);
-PG_FUNCTION_INFO_V1(pgstrom_covar_pop_final);
-
-PG_FUNCTION_INFO_V1(pgstrom_regr_avgx_final);
-PG_FUNCTION_INFO_V1(pgstrom_regr_avgy_final);
-PG_FUNCTION_INFO_V1(pgstrom_regr_count_final);
-PG_FUNCTION_INFO_V1(pgstrom_regr_intercept_final);
-PG_FUNCTION_INFO_V1(pgstrom_regr_r2_final);
-PG_FUNCTION_INFO_V1(pgstrom_regr_slope_final);
-PG_FUNCTION_INFO_V1(pgstrom_regr_sxx_final);
-PG_FUNCTION_INFO_V1(pgstrom_regr_sxy_final);
-PG_FUNCTION_INFO_V1(pgstrom_regr_syy_final);
-
-/*
- * float8 validator
- */
-static inline void
-check_float8_value(float8 value, bool inf_is_valid, bool zero_is_valid)
-{
-    if (isinf(value) && !inf_is_valid)
-        ereport(ERROR,
-                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-                 errmsg("value out of range: overflow")));
-    if (value == 0.0 && !zero_is_valid)
-        ereport(ERROR,
-                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-                 errmsg("value out of range: underflow")));
-}
-
-/*
- * NROWS
- */
-Datum
-pgstrom_partial_nrows(PG_FUNCTION_ARGS)
-{
-    int     i;
-
-    for (i=0; i < PG_NARGS(); i++)
-    {
-        if (PG_ARGISNULL(i) || !PG_GETARG_BOOL(i))
-            PG_RETURN_INT64(0);
-    }
-    PG_RETURN_INT64(1);
-}
-
-/*
- * MIN(X) and MAX(X) functions
- */
-Datum
-pgstrom_partial_minmax_int64(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_int64_packed *r;
-
-    r = palloc(sizeof(kagg_state__pminmax_int64_packed));
-    r->nitems = 1;
-    r->value = PG_GETARG_INT64(0);
-    SET_VARSIZE(r, sizeof(kagg_state__pminmax_int64_packed));
-
-    PG_RETURN_POINTER(r);
-}
-
-Datum
-pgstrom_partial_minmax_fp64(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_fp64_packed *r;
-
-    r = palloc(sizeof(kagg_state__pminmax_fp64_packed));
-    r->nitems = 1;
-    r->value = PG_GETARG_FLOAT8(0);
-    SET_VARSIZE(r, sizeof(kagg_state__pminmax_fp64_packed));
-
-    PG_RETURN_POINTER(r);
-}
-
-#define __MINMAX_TRANS_TEMPLATE(TYPE,OPER)                          \
-    kagg_state__pminmax_##TYPE##_packed *state;                     \
-    kagg_state__pminmax_##TYPE##_packed *arg;                       \
-    MemoryContext aggcxt;                                           \
-                                                                    \
-    if (!AggCheckCallContext(fcinfo, &aggcxt))                      \
-        elog(ERROR, "aggregate function called in non-aggregate context"); \
-    if (PG_ARGISNULL(0))                                            \
-    {                                                               \
-        if (PG_ARGISNULL(1))                                        \
-            PG_RETURN_NULL();                                       \
-        arg = (kagg_state__pminmax_##TYPE##_packed *)               \
-            PG_GETARG_BYTEA_P(1);                                   \
-        state = MemoryContextAlloc(aggcxt, sizeof(*state));         \
-        memcpy(state, arg, sizeof(*state));                         \
-    }                                                               \
-    else                                                            \
-    {                                                               \
-        state = (kagg_state__pminmax_##TYPE##_packed *)             \
-            PG_GETARG_BYTEA_P(0);                                   \
-        if (!PG_ARGISNULL(1))                                       \
-        {                                                           \
-            arg = (kagg_state__pminmax_##TYPE##_packed *)           \
-                PG_GETARG_BYTEA_P(1);                               \
-            if (arg->nitems > 0)                                    \
-            {                                                       \
-                if (state->nitems == 0)                             \
-                    memcpy(state, arg, sizeof(*state));             \
-                else                                                \
-                    state->value = OPER(state->value, arg->value);  \
-            }                                                       \
-        }                                                           \
-    }                                                               \
-    PG_RETURN_POINTER(state);
-
-Datum
-pgstrom_fmin_trans_int64(PG_FUNCTION_ARGS)
-{
-    __MINMAX_TRANS_TEMPLATE(int64,Min);
-}
-
-Datum
-pgstrom_fmin_trans_fp64(PG_FUNCTION_ARGS)
-{
-    __MINMAX_TRANS_TEMPLATE(fp64,Min);
-}
-
-Datum
-pgstrom_fmax_trans_int64(PG_FUNCTION_ARGS)
-{
-    __MINMAX_TRANS_TEMPLATE(int64,Max);
-}
-
-Datum
-pgstrom_fmax_trans_fp64(PG_FUNCTION_ARGS)
-{
-    __MINMAX_TRANS_TEMPLATE(fp64,Max);
-}
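For reference, the core merge path that __MINMAX_TRANS_TEMPLATE(int64,Min) expands to inside pgstrom_fmin_trans_int64(); the NULL/aggregate-context handling lines are identical to the macro body above, and Min() is PostgreSQL's Min(x,y) macro from c.h:

    /* Hand expansion of __MINMAX_TRANS_TEMPLATE(int64,Min) -- illustration only */
    state = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0);
    arg   = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(1);
    if (arg->nitems > 0)
    {
        if (state->nitems == 0)
            memcpy(state, arg, sizeof(*state));     /* first non-empty chunk */
        else
            state->value = Min(state->value, arg->value);
    }
    PG_RETURN_POINTER(state);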
-
-Datum
-pgstrom_fminmax_final_int8(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_int64_packed *state
-        = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    if (state->value < SCHAR_MIN || state->value > SCHAR_MAX)
-        elog(ERROR, "min(int8) out of range");
-    PG_RETURN_INT32(state->value);
-}
-
-Datum
-pgstrom_fminmax_final_int16(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_int64_packed *state
-        = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    if (state->value < SHRT_MIN || state->value > SHRT_MAX)
-        elog(ERROR, "min(int16) out of range");
-    PG_RETURN_INT32(state->value);
-}
-
-Datum
-pgstrom_fminmax_final_int32(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_int64_packed *state
-        = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    if (state->value < INT_MIN || state->value > INT_MAX)
-        elog(ERROR, "min(int32) out of range");
-    PG_RETURN_INT32(state->value);
-}
-
-Datum
-pgstrom_fminmax_final_int64(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_int64_packed *state
-        = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    PG_RETURN_INT64(state->value);
-}
-
-Datum
-pgstrom_fminmax_final_fp16(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_fp64_packed *state
-        = (kagg_state__pminmax_fp64_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    PG_RETURN_UINT16(__half_as_short__(fp64_to_fp16(state->value)));
-}
-
-Datum
-pgstrom_fminmax_final_fp32(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_fp64_packed *state
-        = (kagg_state__pminmax_fp64_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    PG_RETURN_FLOAT4(state->value);
-}
-
-Datum
-pgstrom_fminmax_final_fp64(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_fp64_packed *state
-        = (kagg_state__pminmax_fp64_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    PG_RETURN_FLOAT8(state->value);
-}
-
-Datum
-pgstrom_fminmax_final_numeric(PG_FUNCTION_ARGS)
-{
-    kagg_state__pminmax_fp64_packed *state
-        = (kagg_state__pminmax_fp64_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    return DirectFunctionCall1(float8_numeric,
-                               Float8GetDatum(state->value));
-}
-
-/*
- * SUM(X) functions
- */
-Datum
-pgstrom_partial_sum_asis(PG_FUNCTION_ARGS)
-{
-    PG_RETURN_DATUM(PG_GETARG_DATUM(0));
-}
-
-/*
- * AVG(X) functions
- */
-Datum
-pgstrom_partial_avg_int(PG_FUNCTION_ARGS)
-{
-    kagg_state__pavg_int_packed *r = palloc(sizeof(kagg_state__pavg_int_packed));
-
-    r->nitems = 1;
-    r->sum = PG_GETARG_INT64(0);
-    SET_VARSIZE(r, sizeof(kagg_state__pavg_int_packed));
-
-    PG_RETURN_POINTER(r);
-}
-
-Datum
-pgstrom_partial_avg_fp(PG_FUNCTION_ARGS)
-{
-    kagg_state__pavg_fp_packed *r = palloc(sizeof(kagg_state__pavg_fp_packed));
-
-    r->nitems = 1;
-    r->sum = PG_GETARG_FLOAT8(0);
-    SET_VARSIZE(r, sizeof(kagg_state__pavg_fp_packed));
-
-    PG_RETURN_POINTER(r);
-}
-
-Datum
-pgstrom_favg_trans_int(PG_FUNCTION_ARGS)
-{
-    kagg_state__pavg_int_packed *state;
-    kagg_state__pavg_int_packed *arg;
-    MemoryContext aggcxt;
-
-    if (!AggCheckCallContext(fcinfo, &aggcxt))
-        elog(ERROR, "aggregate function called in non-aggregate context");
-    if (PG_ARGISNULL(0))
-    {
-        if (PG_ARGISNULL(1))
-            PG_RETURN_NULL();
-        arg = (kagg_state__pavg_int_packed *)PG_GETARG_BYTEA_P(1);
-        state = MemoryContextAlloc(aggcxt, sizeof(*state));
-        memcpy(state, arg, sizeof(*state));
-    }
-    else
-    {
-        state = (kagg_state__pavg_int_packed *)PG_GETARG_BYTEA_P(0);
-        if (!PG_ARGISNULL(1))
-        {
-            arg = (kagg_state__pavg_int_packed *)PG_GETARG_BYTEA_P(1);
-
-            state->nitems += arg->nitems;
-            state->sum += arg->sum;
-        }
-    }
-    PG_RETURN_POINTER(state);
-}
-
-Datum
-pgstrom_favg_trans_fp(PG_FUNCTION_ARGS)
-{
-    kagg_state__pavg_fp_packed *state;
-    kagg_state__pavg_fp_packed *arg;
-    MemoryContext aggcxt;
-
-    if (!AggCheckCallContext(fcinfo, &aggcxt))
-        elog(ERROR, "aggregate function called in non-aggregate context");
-    if (PG_ARGISNULL(0))
-    {
-        if (PG_ARGISNULL(1))
-            PG_RETURN_NULL();
-        arg = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(1);
-        state = MemoryContextAlloc(aggcxt, sizeof(*state));
-        memcpy(state, arg, sizeof(*state));
-    }
-    else
-    {
-        state = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(0);
-        if (!PG_ARGISNULL(1))
-        {
-            arg = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(1);
-
-            state->nitems += arg->nitems;
-            state->sum += arg->sum;
-        }
-    }
-    PG_RETURN_POINTER(state);
-}
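The packed transition states are plain varlena blobs shared between host and device code. Their definitions are not part of this hunk; the sketch below is inferred purely from how pgstrom_partial_avg_int() and pgstrom_favg_trans_int() touch the fields, so treat it as an assumption rather than the authoritative header:

    /* Assumed layout, inferred from usage above -- not the real definition */
    typedef struct
    {
        int32   vl_len_;    /* varlena header, filled by SET_VARSIZE() */
        int32   nitems;     /* number of accumulated rows */
        int64   sum;        /* partial sum (int64 for the _int variant) */
    } kagg_state__pavg_int_packed;

Keeping (nitems, sum) rather than a running average is what makes the partial states mergeable: the transition function can simply add the two pairs, and only the final function performs the division.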
-
-Datum
-pgstrom_favg_final_int(PG_FUNCTION_ARGS)
-{
-    kagg_state__pavg_int_packed *state;
-    Datum       n, sum;
-
-    state = (kagg_state__pavg_int_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    n = DirectFunctionCall1(int4_numeric, Int32GetDatum(state->nitems));
-    sum = DirectFunctionCall1(int8_numeric, Int64GetDatum(state->sum));
-
-    PG_RETURN_DATUM(DirectFunctionCall2(numeric_div, sum, n));
-}
-
-Datum
-pgstrom_favg_final_fp(PG_FUNCTION_ARGS)
-{
-    kagg_state__pavg_fp_packed *state
-        = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    PG_RETURN_FLOAT8((double)state->sum / (double)state->nitems);
-}
-
-Datum
-pgstrom_favg_final_num(PG_FUNCTION_ARGS)
-{
-    kagg_state__pavg_fp_packed *state;
-    Datum       n, sum;
-
-    state = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems == 0)
-        PG_RETURN_NULL();
-    n = DirectFunctionCall1(int4_numeric, Int32GetDatum(state->nitems));
-    sum = DirectFunctionCall1(float8_numeric, Float8GetDatum(state->sum));
-
-    PG_RETURN_DATUM(DirectFunctionCall2(numeric_div, sum, n));
-}
-
-/*
- * STDDEV/VARIANCE
- */
-Datum
-pgstrom_partial_variance(PG_FUNCTION_ARGS)
-{
-    kagg_state__stddev_packed *r = palloc(sizeof(kagg_state__stddev_packed));
-    float8_t    fval = PG_GETARG_FLOAT8(0);
-
-    r->nitems = 1;
-    r->sum_x2 = fval * fval;
-    SET_VARSIZE(r, sizeof(kagg_state__stddev_packed));
-
-    PG_RETURN_POINTER(r);
-}
-
-Datum
-pgstrom_stddev_trans(PG_FUNCTION_ARGS)
-{
-    kagg_state__stddev_packed *state;
-    kagg_state__stddev_packed *arg;
-    MemoryContext aggcxt;
-
-    if (!AggCheckCallContext(fcinfo, &aggcxt))
-        elog(ERROR, "aggregate function called in non-aggregate context");
-    if (PG_ARGISNULL(0))
-    {
-        if (PG_ARGISNULL(1))
-            PG_RETURN_NULL();
-        arg = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(1);
-        state = MemoryContextAlloc(aggcxt, sizeof(*state));
-        memcpy(state, arg, sizeof(*state));
-    }
-    else
-    {
-        state = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(0);
-        if (!PG_ARGISNULL(1))
-        {
-            arg = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(1);
-
-            state->nitems += arg->nitems;
-            state->sum_x += arg->sum_x;
-            state->sum_x2 += arg->sum_x2;
-        }
-    }
-    PG_RETURN_POINTER(state);
-}
-
-Datum
-pgstrom_var_sampf_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__stddev_packed *state
-        = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 1)
-    {
-        float8_t    N = (double)state->nitems;
-        float8_t    fval = N * state->sum_x2 - state->sum_x * state->sum_x;
-
-        PG_RETURN_FLOAT8(fval / (N * (N - 1.0)));
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_var_samp_final(PG_FUNCTION_ARGS)
-{
-    Datum   datum = pgstrom_var_sampf_final(fcinfo);
-
-    if (fcinfo->isnull)
-        PG_RETURN_NULL();
-    PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum));
-}
-
-Datum
-pgstrom_var_popf_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__stddev_packed *state
-        = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0)
-    {
-        float8_t    N = (double)state->nitems;
-        float8_t    fval = N * state->sum_x2 - state->sum_x * state->sum_x;
-
-        PG_RETURN_FLOAT8(fval / (N * N));
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_var_pop_final(PG_FUNCTION_ARGS)
-{
-    Datum   datum = pgstrom_var_popf_final(fcinfo);
-
-    if (fcinfo->isnull)
-        PG_RETURN_NULL();
-    PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum));
-}
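pgstrom_var_sampf_final() relies on the one-pass identity Var_samp = (N*sum(x^2) - (sum(x))^2) / (N*(N-1)), which lets each partial state carry only (nitems, sum_x, sum_x2). A small self-contained check of that algebra, independent of the PostgreSQL entry points:

    /* One-pass sample variance, matching pgstrom_var_sampf_final() above */
    static double
    var_samp_one_pass(const double *x, int n)
    {
        double  sum_x = 0.0, sum_x2 = 0.0;

        for (int i = 0; i < n; i++)
        {
            sum_x  += x[i];
            sum_x2 += x[i] * x[i];
        }
        /* equals sum((x - mean)^2) / (n - 1), up to floating-point rounding */
        return (n * sum_x2 - sum_x * sum_x) / ((double)n * (n - 1));
    }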
-
-Datum
-pgstrom_stddev_sampf_final(PG_FUNCTION_ARGS)
-{
-    Datum   datum = pgstrom_var_sampf_final(fcinfo);
-
-    if (fcinfo->isnull)
-        PG_RETURN_NULL();
-    PG_RETURN_FLOAT8(sqrt(DatumGetFloat8(datum)));
-}
-
-Datum
-pgstrom_stddev_samp_final(PG_FUNCTION_ARGS)
-{
-    Datum   datum = pgstrom_stddev_sampf_final(fcinfo);
-
-    if (fcinfo->isnull)
-        PG_RETURN_NULL();
-    PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum));
-}
-
-Datum
-pgstrom_stddev_popf_final(PG_FUNCTION_ARGS)
-{
-    Datum   datum = pgstrom_var_popf_final(fcinfo);
-
-    if (fcinfo->isnull)
-        PG_RETURN_NULL();
-    PG_RETURN_FLOAT8(sqrt(DatumGetFloat8(datum)));
-}
-
-Datum
-pgstrom_stddev_pop_final(PG_FUNCTION_ARGS)
-{
-    Datum   datum = pgstrom_stddev_popf_final(fcinfo);
-
-    if (fcinfo->isnull)
-        PG_RETURN_NULL();
-    PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum));
-}
-
-/*
- * COVAR/REGR_*
- */
-Datum
-pgstrom_partial_covar(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *r = palloc(sizeof(kagg_state__covar_packed));
-    float8_t    x = PG_GETARG_FLOAT8(0);
-    float8_t    y = PG_GETARG_FLOAT8(1);
-
-    r->nitems = 1;
-    r->sum_x = x;
-    r->sum_xx = x * x;
-    r->sum_y = y;
-    r->sum_yy = y * y;
-    r->sum_xy = x * y;
-    SET_VARSIZE(r, sizeof(kagg_state__covar_packed));
-
-    PG_RETURN_POINTER(r);
-}
-
-Datum
-pgstrom_covar_accum(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state;
-    kagg_state__covar_packed *arg;
-    MemoryContext aggcxt;
-
-    if (!AggCheckCallContext(fcinfo, &aggcxt))
-        elog(ERROR, "aggregate function called in non-aggregate context");
-    if (PG_ARGISNULL(0))
-    {
-        if (PG_ARGISNULL(1))
-            PG_RETURN_NULL();
-        arg = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(1);
-        state = MemoryContextAlloc(aggcxt, sizeof(*state));
-        memcpy(state, arg, sizeof(*state));
-    }
-    else
-    {
-        state = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-        if (!PG_ARGISNULL(1))
-        {
-            arg = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(1);
-
-            state->nitems += arg->nitems;
-            state->sum_x += arg->sum_x;
-            state->sum_xx += arg->sum_xx;
-            state->sum_y += arg->sum_y;
-            state->sum_yy += arg->sum_yy;
-            state->sum_xy += arg->sum_xy;
-        }
-    }
-    PG_RETURN_POINTER(state);
-}
-
-Datum
-pgstrom_covar_samp_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-
-    if (state->nitems > 1)
-    {
-        float8_t    N = (float8_t)state->nitems;
-        float8_t    fval = N * state->sum_xy - state->sum_x * state->sum_y;
-
-        PG_RETURN_FLOAT8(fval / (N * (N - 1.0)));
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_covar_pop_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-
-    if (state->nitems > 0)
-    {
-        float8_t    N = (float8_t)state->nitems;
-        float8_t    fval = N * state->sum_xy - state->sum_x * state->sum_y;
-
-        PG_RETURN_FLOAT8(fval / (N * N));
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_regr_avgx_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0)
-    {
-        float8_t    N = (float8_t)state->nitems;
-
-        PG_RETURN_FLOAT8(state->sum_x / N);
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_regr_avgy_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0)
-    {
-        float8_t    N = (float8_t)state->nitems;
-
-        PG_RETURN_FLOAT8(state->sum_y / N);
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_regr_count_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-
-    PG_RETURN_FLOAT8((float8_t)state->nitems);
-}
-
-Datum
-pgstrom_regr_intercept_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0 && state->sum_xx != 0.0)
-    {
-        float8_t    N = (float8_t)state->nitems;
-
-        PG_RETURN_FLOAT8((state->sum_y -
-                          state->sum_x * state->sum_xy / state->sum_xx) / N);
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_regr_r2_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0 &&
-        state->sum_xx != 0.0 &&
-        state->sum_yy != 0.0)
-    {
-        PG_RETURN_FLOAT8((state->sum_xy * state->sum_xy) /
-                         (state->sum_xx * state->sum_yy));
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_regr_slope_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0 && state->sum_xx != 0.0)
-    {
-        PG_RETURN_FLOAT8(state->sum_xy / state->sum_xx);
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_regr_sxx_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0)
-    {
-        PG_RETURN_FLOAT8(state->sum_xx);
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_regr_sxy_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0)
-    {
-        PG_RETURN_FLOAT8(state->sum_xy);
-    }
-    PG_RETURN_NULL();
-}
-
-Datum
-pgstrom_regr_syy_final(PG_FUNCTION_ARGS)
-{
-    kagg_state__covar_packed *state
-        = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0);
-    if (state->nitems > 0)
-    {
-        PG_RETURN_FLOAT8(state->sum_yy);
-    }
-    PG_RETURN_NULL();
-}
-
-#if 0
-/*
- * ----------------------------------------------------------------
- *
- * Hyper-Log-Log support functions
- *
- * ----------------------------------------------------------------
- */
-
-/*
- * Hash-function based on Sip-Hash
- *
- * See https://en.wikipedia.org/wiki/SipHash
- * and https://github.com/veorq/SipHash
- */
-/* default: SipHash-2-4 */
-#define cROUNDS 2
-#define dROUNDS 4
-#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
-
-#define U8TO64_LE(p)                                                \
-    (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) |             \
-     ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) |      \
-     ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) |      \
-     ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))
-
-#define SIPROUND                    \
-    do {                            \
-        v0 += v1;                   \
-        v1 = ROTL(v1, 13);          \
-        v1 ^= v0;                   \
-        v0 = ROTL(v0, 32);          \
-        v2 += v3;                   \
-        v3 = ROTL(v3, 16);          \
-        v3 ^= v2;                   \
-        v0 += v3;                   \
-        v3 = ROTL(v3, 21);          \
-        v3 ^= v0;                   \
-        v2 += v1;                   \
-        v1 = ROTL(v1, 17);          \
-        v1 ^= v2;                   \
-        v2 = ROTL(v2, 32);          \
-    } while (0)
-
-static uint64_t
-__pgstrom_hll_siphash_value(const void *ptr, const size_t len)
-{
-    const unsigned char *ni = (const unsigned char *)ptr;
-    uint64_t    v0 = 0x736f6d6570736575UL;
-    uint64_t    v1 = 0x646f72616e646f6dUL;
-    uint64_t    v2 = 0x6c7967656e657261UL;
-    uint64_t    v3 = 0x7465646279746573UL;
-    uint64_t    k0 = 0x9c38151cda15a76bUL;  /* random key-0 */
-    uint64_t    k1 = 0xfb4ff68fbd3e6658UL;  /* random key-1 */
-    uint64_t    m;
-    int         i;
-    const unsigned char *end = ni + len - (len % sizeof(uint64_t));
-    const int   left = len & 7;
-    uint64_t    b = ((uint64_t)len) << 56;
-
-    v3 ^= k1;
-    v2 ^= k0;
-    v1 ^= k1;
-    v0 ^= k0;
-
-    for (; ni != end; ni += 8)
-    {
-        m = U8TO64_LE(ni);
-        v3 ^= m;
-
-        for (i = 0; i < cROUNDS; ++i)
-            SIPROUND;
-
-        v0 ^= m;
-    }
-
-#if 1
-    if (left > 0)
-    {
-        uint64_t    temp = 0;
-
-        memcpy(&temp, ni, left);
-        b |= (temp & ((1UL << (BITS_PER_BYTE * left)) - 1));
-    }
-#else
-    /* original code */
-    switch (left)
-    {
-        case 7:
-            b |= ((uint64_t)ni[6]) << 48;   __attribute__ ((fallthrough));
-        case 6:
-            b |= ((uint64_t)ni[5]) << 40;   __attribute__ ((fallthrough));
-        case 5:
-            b |= ((uint64_t)ni[4]) << 32;   __attribute__ ((fallthrough));
-        case 4:
-            b |= ((uint64_t)ni[3]) << 24;   __attribute__ ((fallthrough));
-        case 3:
-            b |= ((uint64_t)ni[2]) << 16;   __attribute__ ((fallthrough));
-        case 2:
-            b |= ((uint64_t)ni[1]) << 8;    __attribute__ ((fallthrough));
-        case 1:
-            b |= ((uint64_t)ni[0]);
-            break;
-        case 0:
-            break;
-    }
-#endif
-
-    v3 ^= b;
-    for (i = 0; i < cROUNDS; ++i)
-        SIPROUND;
-
-    v0 ^= b;
-
-    v2 ^= 0xff;
-
-    for (i = 0; i < dROUNDS; ++i)
-        SIPROUND;
-
-    b = v0 ^ v1 ^ v2 ^ v3;
-
-    return b;
-}
-
-/*
- * pgstrom_hll_hash_xxxx functions
- */
-static uint64
-__pgstrom_hll_hash_int1(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(&datum, sizeof(int8));
-}
-
-static uint64
-__pgstrom_hll_hash_int2(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(&datum, sizeof(int16));
-}
-
-static uint64
-__pgstrom_hll_hash_int4(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(&datum, sizeof(int32));
-}
-
-static uint64
-__pgstrom_hll_hash_int8(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(&datum, sizeof(int64));
-}
-
-static uint64
-__pgstrom_hll_hash_numeric(Datum datum)
-{
-    xpu_numeric_t   num;
-    const char     *emsg;
-
-    memset(&num, 0, sizeof(num));
-    emsg = __xpu_numeric_from_varlena(&num, (struct varlena *)datum);
-    if (emsg)
-        elog(ERROR, "failed on hash calculation of device numeric: %s", emsg);
-    return __pgstrom_hll_siphash_value(&num.weight,
-                                       offsetof(xpu_numeric_t, value)
-                                       + sizeof(int128_t)
-                                       - offsetof(xpu_numeric_t, weight));
-}
-
-static uint64
-__pgstrom_hll_hash_date(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(&datum, sizeof(DateADT));
-}
-
-static uint64
-__pgstrom_hll_hash_time(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(&datum, sizeof(TimeADT));
-}
-
-static uint64
-__pgstrom_hll_hash_timetz(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(DatumGetPointer(datum), sizeof(TimeTzADT));
-}
-
-static uint64
-__pgstrom_hll_hash_timestamp(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(&datum, sizeof(Timestamp));
-}
-
-static uint64
-__pgstrom_hll_hash_timestamptz(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(&datum, sizeof(TimestampTz));
-}
-
-static uint64
-__pgstrom_hll_hash_bpchar(Datum datum)
-{
-    BpChar     *val = DatumGetBpCharPP(datum);
-    int         len = bpchartruelen(VARDATA_ANY(val),
-                                    VARSIZE_ANY_EXHDR(val));
-    return __pgstrom_hll_siphash_value(VARDATA_ANY(val), len);
-}
-
-static uint64
-__pgstrom_hll_hash_varlena(Datum datum)
-{
-    struct varlena *val = PG_DETOAST_DATUM(datum);
-
-    return __pgstrom_hll_siphash_value(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val));
-}
-
-static uint64
-__pgstrom_hll_hash_uuid(Datum datum)
-{
-    return __pgstrom_hll_siphash_value(DatumGetUUIDP(datum), sizeof(pg_uuid_t));
-}
-
-static bytea *
-__pgstrom_hll_sketch_update_common(PG_FUNCTION_ARGS, uint64 hash)
-{
-    MemoryContext aggcxt;
-    bytea      *hll_state;
-    uint8      *hll_regs;
-    uint64      nrooms;
-    uint32      index;
-    uint32      count;
-
-    if (!AggCheckCallContext(fcinfo, &aggcxt))
-        elog(ERROR, "aggregate function called in non-aggregate context");
-    nrooms = (1UL << pgstrom_hll_register_bits);
-    if (PG_ARGISNULL(0))
-    {
-        size_t  sz = VARHDRSZ + sizeof(uint8) * nrooms;
-        hll_state = MemoryContextAllocZero(aggcxt, sz);
-        SET_VARSIZE(hll_state, sz);
-    }
-    else
-    {
-        hll_state = PG_GETARG_BYTEA_P(0);
-    }
-    Assert(VARSIZE(hll_state) == VARHDRSZ + sizeof(uint8) * nrooms);
-    hll_regs = (uint8 *)VARDATA(hll_state);
-
-    index = hash & (nrooms - 1);
-    count = __builtin_ctzll(hash >> pgstrom_hll_register_bits) + 1;
-    if (hll_regs[index] < count)
-        hll_regs[index] = count;
-    return hll_state;
-}
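Each 64-bit hash is split exactly as in __pgstrom_hll_sketch_update_common() above: the low pgstrom_hll_register_bits bits select a register, and the number of trailing zero bits in the remainder, plus one, is the candidate register value. A worked example, assuming 11 register bits (2048 registers):

    /*
     * Worked example of the register update, assuming
     * pgstrom_hll_register_bits == 11 (so nrooms == 2048).
     */
    #include <stdint.h>

    static void
    hll_update_example(uint8_t hll_regs[2048])
    {
        uint64_t    hash   = 0x9c38151cda15a400ULL; /* arbitrary hash value */
        uint64_t    nrooms = (1UL << 11);
        uint32_t    index  = hash & (nrooms - 1);   /* low 11 bits -> 1024 */
        uint32_t    count  = __builtin_ctzll(hash >> 11) + 1;   /* -> 3 */

        if (hll_regs[index] < count)
            hll_regs[index] = count;    /* each register keeps its maximum */
    }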
-
-#define PGSTROM_HLL_HANDLER_TEMPLATE(NAME)                              \
-    PG_FUNCTION_INFO_V1(pgstrom_hll_hash_##NAME);                       \
-    PG_FUNCTION_INFO_V1(pgstrom_hll_sketch_update_##NAME);              \
-    Datum                                                               \
-    pgstrom_hll_hash_##NAME(PG_FUNCTION_ARGS)                           \
-    {                                                                   \
-        Datum   arg = PG_GETARG_DATUM(0);                               \
-        PG_RETURN_UINT64(__pgstrom_hll_hash_##NAME(arg));               \
-    }                                                                   \
-    Datum                                                               \
-    pgstrom_hll_sketch_update_##NAME(PG_FUNCTION_ARGS)                  \
-    {                                                                   \
-        if (PG_ARGISNULL(1))                                            \
-        {                                                               \
-            if (PG_ARGISNULL(0))                                        \
-                PG_RETURN_NULL();                                       \
-            PG_RETURN_DATUM(PG_GETARG_DATUM(0));                        \
-        }                                                               \
-        else                                                            \
-        {                                                               \
-            Datum   arg = PG_GETARG_DATUM(1);                           \
-            uint64  hash = __pgstrom_hll_hash_##NAME(arg);              \
-            bytea  *state;                                              \
-                                                                        \
-            state = __pgstrom_hll_sketch_update_common(fcinfo, hash);   \
-            PG_RETURN_BYTEA_P(state);                                   \
-        }                                                               \
-    }
-
-PGSTROM_HLL_HANDLER_TEMPLATE(int1)
-PGSTROM_HLL_HANDLER_TEMPLATE(int2)
-PGSTROM_HLL_HANDLER_TEMPLATE(int4)
-PGSTROM_HLL_HANDLER_TEMPLATE(int8)
-PGSTROM_HLL_HANDLER_TEMPLATE(numeric)
-PGSTROM_HLL_HANDLER_TEMPLATE(date)
-PGSTROM_HLL_HANDLER_TEMPLATE(time)
-PGSTROM_HLL_HANDLER_TEMPLATE(timetz)
-PGSTROM_HLL_HANDLER_TEMPLATE(timestamp)
-PGSTROM_HLL_HANDLER_TEMPLATE(timestamptz)
-PGSTROM_HLL_HANDLER_TEMPLATE(bpchar)
-PGSTROM_HLL_HANDLER_TEMPLATE(varlena)
-PGSTROM_HLL_HANDLER_TEMPLATE(uuid)
-
-/*
- * pgstrom_hll_sketch_new
- */
-Datum
-pgstrom_hll_sketch_new(PG_FUNCTION_ARGS)
-{
-    uint64      nrooms = (1UL << pgstrom_hll_register_bits);
-    uint64      hll_hash = DatumGetUInt64(PG_GETARG_DATUM(0));
-    bytea      *hll_state;
-    uint8      *hll_regs;
-    uint32      count;
-    uint32      index;
-
-    hll_state = palloc0(VARHDRSZ + sizeof(uint8) * nrooms);
-    SET_VARSIZE(hll_state, VARHDRSZ + sizeof(uint8) * nrooms);
-    hll_regs = (uint8 *)VARDATA(hll_state);
-
-    index = hll_hash & (nrooms - 1);
-    Assert(index < nrooms);
-    count = __builtin_ctzll(hll_hash >> pgstrom_hll_register_bits) + 1;
-    if (hll_regs[index] < count)
-        hll_regs[index] = count;
-
-    PG_RETURN_BYTEA_P(hll_state);
-}
-
-/*
- * pgstrom_hll_sketch_merge
- */
-Datum
-pgstrom_hll_sketch_merge(PG_FUNCTION_ARGS)
-{
-    MemoryContext aggcxt;
-    bytea      *hll_state = NULL;
-    uint8      *hll_regs;
-    bytea      *new_state;
-    uint8      *new_regs;
-    uint32      nrooms;
-    uint32      index;
-
-    if (!AggCheckCallContext(fcinfo, &aggcxt))
-        elog(ERROR, "aggregate function called in non-aggregate context");
-    if (PG_ARGISNULL(0))
-    {
-        if (PG_ARGISNULL(1))
-            PG_RETURN_NULL();
-        new_state = PG_GETARG_BYTEA_P(1);
-        nrooms = VARSIZE_ANY_EXHDR(new_state);
-        if (nrooms < 1 || (nrooms & (nrooms - 1)) != 0)
-            elog(ERROR, "HLL sketch must have 2^N rooms (%u)", nrooms);
-        hll_state = MemoryContextAllocZero(aggcxt, VARHDRSZ + nrooms);
-        SET_VARSIZE(hll_state, VARHDRSZ + nrooms);
-        memcpy(VARDATA_ANY(hll_state), VARDATA_ANY(new_state), nrooms);
-    }
-    else
-    {
-        hll_state = PG_GETARG_BYTEA_P(0);
-        nrooms = VARSIZE_ANY_EXHDR(hll_state);
-        if (nrooms < 1 || (nrooms & (nrooms - 1)) != 0)
-            elog(ERROR, "HLL sketch must have 2^N rooms (%u)", nrooms);
-        if (!PG_ARGISNULL(1))
-        {
-            new_state = PG_GETARG_BYTEA_P(1);
-            if (VARSIZE_ANY_EXHDR(hll_state) != VARSIZE_ANY_EXHDR(new_state))
-                elog(ERROR, "incompatible HLL sketch");
-            hll_regs = (uint8 *)VARDATA_ANY(hll_state);
-            new_regs = (uint8 *)VARDATA_ANY(new_state);
-            for (index=0; index < nrooms; index++)
-            {
-                if (hll_regs[index] < new_regs[index])
-                    hll_regs[index] = new_regs[index];
-            }
-        }
-    }
-    PG_RETURN_POINTER(hll_state);
-}
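pgstrom_hll_count_final() below implements the standard HyperLogLog estimator E = alpha_m * m^2 / sum_j 2^(-M[j]), with the usual small-m bias constants (0.673, 0.697, 0.709). A compact restatement of that formula, isolated from the PostgreSQL calling convention:

    /* HyperLogLog estimate from m registers M[0..m-1] (standard formula) */
    #include <stdint.h>

    static double
    hll_estimate(const uint8_t *M, uint32_t m)
    {
        double  divider = 0.0;
        double  alpha;

        for (uint32_t j = 0; j < m; j++)
            divider += 1.0 / (double)(1UL << M[j]);
        if (m <= 16)        alpha = 0.673;
        else if (m <= 32)   alpha = 0.697;
        else if (m <= 64)   alpha = 0.709;
        else                alpha = 0.7213 / (1.0 + 1.079 / (double)m);
        return alpha * (double)m * (double)m / divider;
    }

Because registers merge by element-wise maximum (see pgstrom_hll_sketch_merge() above), sketches built on separate workers can be combined losslessly before this final estimate.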
-
-/*
- * pgstrom_hll_count_final
- */
-Datum
-pgstrom_hll_count_final(PG_FUNCTION_ARGS)
-{
-    bytea      *hll_state;
-    uint8      *hll_regs;
-    uint32      nrooms;
-    uint32      index;
-    double      divider = 0.0;
-    double      weight;
-    double      estimate;
-
-#if 0
-    /*
-     * MEMO: there is no reason to prohibit applying pgstrom.hll_count_final()
-     * to a previously computed HLL sketch.
-     */
-    if (!AggCheckCallContext(fcinfo, NULL))
-        elog(ERROR, "aggregate function called in non-aggregate context");
-#endif
-    if (PG_ARGISNULL(0))
-        PG_RETURN_INT64(0);
-    /*
-     * MEMO: Hyper-Log-Log merge algorithm
-     * https://ja.wikiqube.net/wiki/HyperLogLog
-     */
-    hll_state = PG_GETARG_BYTEA_P(0);
-    nrooms = VARSIZE_ANY_EXHDR(hll_state);
-    if (nrooms < 1 || (nrooms & (nrooms - 1)) != 0)
-        elog(ERROR, "HLL sketch must have 2^N rooms (%u)", nrooms);
-    hll_regs = (uint8 *)VARDATA(hll_state);
-
-    for (index = 0; index < nrooms; index++)
-        divider += 1.0 / (double)(1UL << hll_regs[index]);
-    if (nrooms <= 16)
-        weight = 0.673;
-    else if (nrooms <= 32)
-        weight = 0.697;
-    else if (nrooms <= 64)
-        weight = 0.709;
-    else
-        weight = 0.7213 / (1.0 + 1.079 / (double)nrooms);
-
-    estimate = (weight * (double)nrooms * (double)nrooms) / divider;
-    PG_RETURN_INT64((int64)estimate);
-}
-
-/*
- * pgstrom_hll_sketch_histogram
- */
-Datum
-pgstrom_hll_sketch_histogram(PG_FUNCTION_ARGS)
-{
-    bytea      *hll_state = PG_GETARG_BYTEA_P(0);
-    uint8      *hll_regs;
-    uint32      nrooms;
-    uint32      index;
-    Datum       hll_hist[64];
-    int         max_hist = -1;
-    ArrayType  *result;
-
-    nrooms = VARSIZE_ANY_EXHDR(hll_state);
-    if (nrooms < 1 || (nrooms & (nrooms - 1)) != 0)
-        elog(ERROR, "HLL sketch must have 2^N rooms (%u)", nrooms);
-    hll_regs = (uint8 *)VARDATA(hll_state);
-
-    memset(hll_hist, 0, sizeof(hll_hist));
-    for (index=0; index < nrooms; index++)
-    {
-        int     value = (int)hll_regs[index];
-
-        if (value < 0 || value >= 64)
-            elog(ERROR, "HLL sketch looks corrupted");
-        hll_hist[value]++;
-        if (max_hist < value)
-            max_hist = value;
-    }
-
-    if (max_hist < 0)
-        PG_RETURN_NULL();
-
-    result = construct_array(hll_hist,
-                             max_hist + 1,
-                             INT4OID,
-                             sizeof(int32),
-                             true,
-                             'i');
-    PG_RETURN_POINTER(result);
-}
-#endif
diff --git a/next/arrow_fdw.c b/next/arrow_fdw.c
deleted file mode 100644
index 89a787ffb..000000000
--- a/next/arrow_fdw.c
+++ /dev/null
@@ -1,4657 +0,0 @@
-/*
- * arrow_fdw.c
- *
- * Routines to map Apache Arrow files as PG's Foreign-Table.
- * ----
- * Copyright 2011-2023 (C) KaiGai Kohei
- * Copyright 2014-2023 (C) PG-Strom Developers Team
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the PostgreSQL License.
- */
-#include "pg_strom.h"
-#include "arrow_defs.h"
-#include "arrow_ipc.h"
-#include "xpu_numeric.h"
-
-/*
- * min/max statistics datum
- */
-typedef struct
-{
-    bool    isnull;
-    union {
-        Datum       datum;
-        NumericData numeric;    /* if NUMERICOID */
-    } min;
-    union {
-        Datum       datum;
-        NumericData numeric;    /* if NUMERICOID */
-    } max;
-} MinMaxStatDatum;
-
-/*
- * RecordBatchState
- */
-typedef struct RecordBatchFieldState
-{
-    /* common fields with cache */
-    Oid         atttypid;
-    int         atttypmod;
-    ArrowTypeOptions attopts;
-    int64       nitems;             /* usually, same with rb_nitems */
-    int64       null_count;
-    off_t       nullmap_offset;
-    size_t      nullmap_length;
-    off_t       values_offset;
-    size_t      values_length;
-    off_t       extra_offset;
-    size_t      extra_length;
-    MinMaxStatDatum stat_datum;
-    /* sub-fields if any */
-    int         num_children;
-    struct RecordBatchFieldState *children;
-} RecordBatchFieldState;
-
-typedef struct RecordBatchState
-{
-    struct ArrowFileState *af_state;    /* reference to ArrowFileState */
-    int         rb_index;   /* index number in a file */
-    off_t       rb_offset;  /* offset from the head */
-    size_t      rb_length;  /* length of the entire RecordBatch */
-    int64       rb_nitems;  /* number of items */
-    /* per column information */
-    int         nfields;
-    RecordBatchFieldState fields[FLEXIBLE_ARRAY_MEMBER];
-} RecordBatchState;
-
-typedef struct ArrowFileState
-{
-    const char *filename;
-    const char *dpu_path;   /* relative pathname, if DPU */
-    struct stat stat_buf;
-    List       *rb_list;    /* list of RecordBatchState */
-} ArrowFileState;
-
-/*
- * ArrowFdwState - executor state to run apache arrow
- */
-typedef struct
-{
-    Bitmapset  *stat_attrs;
-    Bitmapset  *load_attrs;
-    List       *orig_quals;     /* for EXPLAIN */
-    List       *eval_quals;
-    ExprState  *eval_state;
-    ExprContext *econtext;
-} arrowStatsHint;
-
-struct ArrowFdwState
-{
-    Bitmapset  *referenced;     /* referenced columns */
-    arrowStatsHint *stats_hint; /* min/max statistics, if any */
-    pg_atomic_uint32 *rbatch_index;
-    pg_atomic_uint32 __rbatch_index_local;  /* if single process */
-    pg_atomic_uint32 *rbatch_nload;
-    pg_atomic_uint32 __rbatch_nload_local;  /* if single process */
-    pg_atomic_uint32 *rbatch_nskip;
-    pg_atomic_uint32 __rbatch_nskip_local;  /* if single process */
-    StringInfoData chunk_buffer;    /* buffer to load record-batch */
-    File        curr_filp;      /* current arrow file to read */
-    kern_data_store *curr_kds;  /* current chunk to read */
-    uint32_t    curr_index;     /* current index on the chunk */
-    List       *af_states_list; /* list of ArrowFileState */
-    uint32_t    rb_nitems;      /* number of record-batches */
-    RecordBatchState *rb_states[FLEXIBLE_ARRAY_MEMBER]; /* flatten RecordBatchState */
-};
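The shared counters in ArrowFdwState let parallel workers divide record-batches without further coordination. A sketch of the assumed usage (the real logic also counts loaded/skipped batches via rbatch_nload/rbatch_nskip and consults the stats hint before loading):

    /*
     * Assumed usage of the shared counter above: each worker claims the
     * next record-batch with an atomic fetch-add.
     */
    static RecordBatchState *
    arrow_fdw_next_record_batch(struct ArrowFdwState *arrow_state)
    {
        uint32_t    rb_index;

        rb_index = pg_atomic_fetch_add_u32(arrow_state->rbatch_index, 1);
        if (rb_index >= arrow_state->rb_nitems)
            return NULL;    /* all record-batches already consumed */
        return arrow_state->rb_states[rb_index];
    }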
-
-/*
- * Metadata Cache (on shared memory)
- */
-#define ARROW_METADATA_BLOCKSZ      (128 * 1024)    /* 128kB */
-typedef struct
-{
-    dlist_node  chain;      /* link to free_blocks; NULL if active */
-    int32_t     unitsz;     /* unit size of slab items */
-    int32_t     n_actives;  /* number of active items */
-    char        data[FLEXIBLE_ARRAY_MEMBER];
-} arrowMetadataCacheBlock;
-#define ARROW_METADATA_CACHE_FREE_MAGIC     (0xdeadbeafU)
-#define ARROW_METADATA_CACHE_ACTIVE_MAGIC   (0xcafebabeU)
-
-typedef struct arrowMetadataFieldCache arrowMetadataFieldCache;
-typedef struct arrowMetadataCache arrowMetadataCache;
-
-struct arrowMetadataFieldCache
-{
-    arrowMetadataCacheBlock *owner;
-    dlist_node  chain;      /* link to free/fields[children] list */
-    /* common fields with cache */
-    Oid         atttypid;
-    int         atttypmod;
-    ArrowTypeOptions attopts;
-    int64       nitems;     /* usually, same with rb_nitems */
-    int64       null_count;
-    off_t       nullmap_offset;
-    size_t      nullmap_length;
-    off_t       values_offset;
-    size_t      values_length;
-    off_t       extra_offset;
-    size_t      extra_length;
-    MinMaxStatDatum stat_datum;
-    /* sub-fields if any */
-    int         num_children;
-    dlist_head  children;
-    uint32_t    magic;
-};
-
-struct arrowMetadataCache
-{
-    arrowMetadataCacheBlock *owner;
-    dlist_node  chain;      /* link to free/hash list */
-    dlist_node  lru_chain;  /* link to lru_list */
-    struct timeval lru_tv;  /* last access time */
-    arrowMetadataCache *next;   /* next record-batch if any */
-    struct stat stat_buf;   /* result of stat(2) */
-    int         rb_index;   /* index number in a file */
-    off_t       rb_offset;  /* offset from the head */
-    size_t      rb_length;  /* length of the entire RecordBatch */
-    int64       rb_nitems;  /* number of items */
-    /* per column information */
-    int         nfields;
-    dlist_head  fields;     /* list of arrowMetadataFieldCache */
-    uint32_t    magic;
-};
-
-/*
- * Metadata cache management
- */
-#define ARROW_METADATA_HASH_NSLOTS  2000
-typedef struct
-{
-    LWLock      mutex;
-    slock_t     lru_lock;   /* protect lru related stuff */
-    dlist_head  lru_list;
-    dlist_head  free_blocks;    /* list of arrowMetadataCacheBlock */
-    dlist_head  free_mcaches;   /* list of arrowMetadataCache */
-    dlist_head  free_fcaches;   /* list of arrowMetadataFieldCache */
-    dlist_head  hash_slots[ARROW_METADATA_HASH_NSLOTS];
-} arrowMetadataCacheHead;
-
-/*
- * Static variables
- */
-static FdwRoutine pgstrom_arrow_fdw_routine;
-static shmem_request_hook_type shmem_request_next = NULL;
-static shmem_startup_hook_type shmem_startup_next = NULL;
-static arrowMetadataCacheHead *arrow_metadata_cache = NULL;
-static bool arrow_fdw_enabled;              /* GUC */
-static bool arrow_fdw_stats_hint_enabled;   /* GUC */
-static int  arrow_metadata_cache_size_kb;   /* GUC */
-
-PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_handler);
-PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_validator);
-PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_import_file);
-PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_precheck_schema);
-
-/* ----------------------------------------------------------------
- *
- * Apache Arrow <--> PG Types Mapping Routines
- *
- * ----------------------------------------------------------------
- */
-
-/*
- * arrowFieldGetPGTypeHint
- */
-static Oid
-arrowFieldGetPGTypeHint(const ArrowField *field)
-{
-    for (int i=0; i < field->_num_custom_metadata; i++)
-    {
-        ArrowKeyValue *kv = &field->custom_metadata[i];
-        char       *namebuf, *pos;
-        Oid         namespace_oid = PG_CATALOG_NAMESPACE;
-        HeapTuple   tup;
-
-        if (strcmp(kv->key, "pg_type") != 0)
-            continue;
-        namebuf = alloca(kv->_value_len + 10);
-        strcpy(namebuf, kv->value);
-        pos = strchr(namebuf, '.');
-        if (pos)
-        {
-            *pos++ = '\0';
-            namespace_oid = get_namespace_oid(namebuf, true);
-            if (!OidIsValid(namespace_oid))
-                continue;
-            namebuf = pos;
-        }
-        tup = SearchSysCache2(TYPENAMENSP,
-                              PointerGetDatum(namebuf),
-                              ObjectIdGetDatum(namespace_oid));
-        if (HeapTupleIsValid(tup))
-        {
-            Oid     hint = ((Form_pg_type) GETSTRUCT(tup))->oid;
-
-            ReleaseSysCache(tup);
-
-            return hint;
-        }
-    }
-    return InvalidOid;
-}
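arrowFieldGetPGTypeHint() resolves a field-level custom-metadata entry of the form pg_type=schema.typename back to a PostgreSQL type OID. A hypothetical example of the Arrow-side value that the lookup above would consume; the ArrowKeyValue initialization is illustrative, not the arrow_defs.h API:

    /* Hypothetical field metadata consumed by arrowFieldGetPGTypeHint() */
    ArrowKeyValue kv;

    memset(&kv, 0, sizeof(kv));
    kv.key        = "pg_type";              /* the key the loop looks for */
    kv.value      = "pg_catalog.macaddr";   /* optional "schema.typename" form */
    kv._value_len = strlen(kv.value);
    /*
     * The function splits the value at '.', resolves the schema with
     * get_namespace_oid(), then probes SearchSysCache2(TYPENAMENSP, ...)
     * for the type OID used as a mapping hint.
     */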
*__fcache - = dlist_container(arrowMetadataFieldCache, chain, - dlist_pop_head_node(&fcache->children)); - __releaseMetadataFieldCache(__fcache); - } - fcache->magic = ARROW_METADATA_CACHE_FREE_MAGIC; - dlist_push_tail(&arrow_metadata_cache->free_fcaches, - &fcache->chain); - - /* also back the owner block if all slabs become free */ - Assert(mc_block->n_actives > 0); - if (--mc_block->n_actives == 0) - { - char *pos = mc_block->data; - char *end = (char *)mc_block + ARROW_METADATA_BLOCKSZ; - - Assert(mc_block->unitsz == MAXALIGN(sizeof(arrowMetadataFieldCache))); - while (pos + mc_block->unitsz <= end) - { - arrowMetadataFieldCache *__fcache = (arrowMetadataFieldCache *)pos; - Assert(__fcache->owner == mc_block && - __fcache->magic == ARROW_METADATA_CACHE_FREE_MAGIC); - dlist_delete(&__fcache->chain); - pos += mc_block->unitsz; - } - Assert(!mc_block->chain.prev && - !mc_block->chain.next); /* must be active block */ - dlist_push_tail(&arrow_metadata_cache->free_blocks, - &mc_block->chain); - } -} - -static void -__releaseMetadataCache(arrowMetadataCache *mcache) -{ - while (mcache) - { - arrowMetadataCacheBlock *mc_block = mcache->owner; - arrowMetadataCache *__mcache_next = mcache->next; - - Assert(mcache->magic == ARROW_METADATA_CACHE_ACTIVE_MAGIC); - /* - * MEMO: Caller already detach the leader mcache from the hash- - * slot and the LRU-list. The follower mcaches should never be - * linked to hash-slot and LRU-list. - * So, we just put Assert() here. - */ - Assert(!mcache->chain.prev && !mcache->chain.next && - !mcache->lru_chain.prev && !mcache->lru_chain.next); - - /* also release arrowMetadataFieldCache */ - while (!dlist_is_empty(&mcache->fields)) - { - arrowMetadataFieldCache *fcache - = dlist_container(arrowMetadataFieldCache, chain, - dlist_pop_head_node(&mcache->fields)); - __releaseMetadataFieldCache(fcache); - } - mcache->magic = ARROW_METADATA_CACHE_FREE_MAGIC; - dlist_push_tail(&arrow_metadata_cache->free_mcaches, - &mcache->chain); - /* also back the owner block if all slabs become free */ - Assert(mc_block->n_actives > 0); - if (--mc_block->n_actives == 0) - { - char *pos = mc_block->data; - char *end = (char *)mc_block + ARROW_METADATA_BLOCKSZ; - - Assert(mc_block->unitsz == MAXALIGN(sizeof(arrowMetadataCache))); - while (pos + mc_block->unitsz <= end) - { - arrowMetadataCache *__mcache = (arrowMetadataCache *)pos; - - Assert(__mcache->owner == mc_block && - __mcache->magic == ARROW_METADATA_CACHE_FREE_MAGIC); - dlist_delete(&__mcache->chain); - pos += mc_block->unitsz; - } - Assert(!mc_block->chain.prev && - !mc_block->chain.next); /* must be active block */ - dlist_push_tail(&arrow_metadata_cache->free_blocks, - &mc_block->chain); - } - mcache = __mcache_next; - } -} - -static bool -__reclaimMetadataCache(void) -{ - SpinLockAcquire(&arrow_metadata_cache->lru_lock); - if (!dlist_is_empty(&arrow_metadata_cache->lru_list)) - { - arrowMetadataCache *mcache; - dlist_node *dnode; - struct timeval curr_tv; - int64_t elapsed; - - gettimeofday(&curr_tv, NULL); - dnode = dlist_tail_node(&arrow_metadata_cache->lru_list); - mcache = dlist_container(arrowMetadataCache, lru_chain, dnode); - elapsed = ((curr_tv.tv_sec - mcache->lru_tv.tv_sec) * 1000000 + - (curr_tv.tv_usec - mcache->lru_tv.tv_usec)); - if (elapsed > 30000000UL) /* > 30s */ - { - dlist_delete(&mcache->lru_chain); - memset(&mcache->lru_chain, 0, sizeof(dlist_node)); - SpinLockRelease(&arrow_metadata_cache->lru_lock); - dlist_delete(&mcache->chain); - memset(&mcache->chain, 0, sizeof(dlist_node)); - - 
__releaseMetadataCache(mcache); - return true; - } - } - SpinLockRelease(&arrow_metadata_cache->lru_lock); - return false; -} - -static arrowMetadataFieldCache * -__allocMetadataFieldCache(void) -{ - arrowMetadataFieldCache *fcache; - dlist_node *dnode; - - while (dlist_is_empty(&arrow_metadata_cache->free_fcaches)) - { - arrowMetadataCacheBlock *mc_block; - char *pos, *end; - - while (dlist_is_empty(&arrow_metadata_cache->free_blocks)) - { - if (!__reclaimMetadataCache()) - return NULL; - } - dnode = dlist_pop_head_node(&arrow_metadata_cache->free_blocks); - mc_block = dlist_container(arrowMetadataCacheBlock, chain, dnode); - memset(mc_block, 0, offsetof(arrowMetadataCacheBlock, data)); - mc_block->unitsz = MAXALIGN(sizeof(arrowMetadataFieldCache)); - for (pos = mc_block->data, end = (char *)mc_block + ARROW_METADATA_BLOCKSZ; - pos + mc_block->unitsz <= end; - pos += mc_block->unitsz) - { - fcache = (arrowMetadataFieldCache *)pos; - fcache->owner = mc_block; - fcache->magic = ARROW_METADATA_CACHE_FREE_MAGIC; - dlist_push_tail(&arrow_metadata_cache->free_fcaches, - &fcache->chain); - } - } - dnode = dlist_pop_head_node(&arrow_metadata_cache->free_fcaches); - fcache = dlist_container(arrowMetadataFieldCache, chain, dnode); - fcache->owner->n_actives++; - Assert(fcache->magic == ARROW_METADATA_CACHE_FREE_MAGIC); - memset(&fcache->chain, 0, (offsetof(arrowMetadataFieldCache, magic) - - offsetof(arrowMetadataFieldCache, chain))); - fcache->magic = ARROW_METADATA_CACHE_ACTIVE_MAGIC; - return fcache; -} - -static arrowMetadataCache * -__allocMetadataCache(void) -{ - arrowMetadataCache *mcache; - dlist_node *dnode; - - if (dlist_is_empty(&arrow_metadata_cache->free_mcaches)) - { - arrowMetadataCacheBlock *mc_block; - char *pos, *end; - - while (dlist_is_empty(&arrow_metadata_cache->free_blocks)) - { - if (!__reclaimMetadataCache()) - return NULL; - } - dnode = dlist_pop_head_node(&arrow_metadata_cache->free_blocks); - mc_block = dlist_container(arrowMetadataCacheBlock, chain, dnode); - memset(mc_block, 0, offsetof(arrowMetadataCacheBlock, data)); - mc_block->unitsz = MAXALIGN(sizeof(arrowMetadataCache)); - for (pos = mc_block->data, end = (char *)mc_block + ARROW_METADATA_BLOCKSZ; - pos + mc_block->unitsz <= end; - pos += mc_block->unitsz) - { - mcache = (arrowMetadataCache *)pos; - mcache->owner = mc_block; - mcache->magic = ARROW_METADATA_CACHE_FREE_MAGIC; - dlist_push_tail(&arrow_metadata_cache->free_mcaches, - &mcache->chain); - } - } - dnode = dlist_pop_head_node(&arrow_metadata_cache->free_mcaches); - mcache = dlist_container(arrowMetadataCache, chain, dnode); - mcache->owner->n_actives++; - Assert(mcache->magic == ARROW_METADATA_CACHE_FREE_MAGIC); - memset(&mcache->chain, 0, (offsetof(arrowMetadataCache, magic) - - offsetof(arrowMetadataCache, chain))); - mcache->magic = ARROW_METADATA_CACHE_ACTIVE_MAGIC; - return mcache; -} - -/* - * lookupArrowMetadataCache - * - * caller must hold "at least" shared lock on the arrow_metadata_cache->mutex. - * if exclusive lock is held, it may invalidate legacy cache if any. 
- */
-static inline uint32_t
-arrowMetadataHashIndex(struct stat *stat_buf)
-{
-    struct {
-        dev_t   st_dev;
-        ino_t   st_ino;
-    } hkey;
-    uint32_t    hash;
-
-    hkey.st_dev = stat_buf->st_dev;
-    hkey.st_ino = stat_buf->st_ino;
-    hash = hash_bytes((unsigned char *)&hkey, sizeof(hkey));
-    return hash % ARROW_METADATA_HASH_NSLOTS;
-}
-
-static arrowMetadataCache *
-lookupArrowMetadataCache(struct stat *stat_buf, bool has_exclusive)
-{
-    arrowMetadataCache *mcache;
-    uint32_t    hindex;
-    dlist_iter  iter;
-
-    hindex = arrowMetadataHashIndex(stat_buf);
-    dlist_foreach(iter, &arrow_metadata_cache->hash_slots[hindex])
-    {
-        mcache = dlist_container(arrowMetadataCache, chain, iter.cur);
-
-        if (stat_buf->st_dev == mcache->stat_buf.st_dev &&
-            stat_buf->st_ino == mcache->stat_buf.st_ino)
-        {
-            /*
-             * Is the metadata cache still valid?
-             */
-            if (stat_buf->st_mtim.tv_sec < mcache->stat_buf.st_mtim.tv_sec ||
-                (stat_buf->st_mtim.tv_sec == mcache->stat_buf.st_mtim.tv_sec &&
-                 stat_buf->st_mtim.tv_nsec <= mcache->stat_buf.st_mtim.tv_nsec))
-            {
-                /* ok, found */
-                SpinLockAcquire(&arrow_metadata_cache->lru_lock);
-                gettimeofday(&mcache->lru_tv, NULL);
-                dlist_move_head(&arrow_metadata_cache->lru_list,
-                                &mcache->lru_chain);
-                SpinLockRelease(&arrow_metadata_cache->lru_lock);
-                return mcache;
-            }
-            else if (has_exclusive)
-            {
-                /*
-                 * Unfortunately, the metadata cache is already invalid.
-                 * If the caller holds an exclusive lock, we can release
-                 * the invalid cache entry here.
-                 */
-                SpinLockAcquire(&arrow_metadata_cache->lru_lock);
-                dlist_delete(&mcache->lru_chain);
-                memset(&mcache->lru_chain, 0, sizeof(dlist_node));
-                SpinLockRelease(&arrow_metadata_cache->lru_lock);
-                dlist_delete(&mcache->chain);
-                memset(&mcache->chain, 0, sizeof(dlist_node));
-
-                __releaseMetadataCache(mcache);
-            }
-        }
-    }
-    return NULL;
-}
-
-/* ----------------------------------------------------------------
- *
- * buildArrowStatsBinary
- *
- * ...and, routines related to Arrow Min/Max statistics
- *
- * ----------------------------------------------------------------
- */
-typedef struct arrowFieldStatsBinary
-{
-    uint32  nrooms;     /* number of record-batches */
-    MinMaxStatDatum *stat_values;
-    int     nfields;    /* if List/Struct data type */
-    struct arrowFieldStatsBinary *subfields;
-} arrowFieldStatsBinary;
-
-typedef struct
-{
-    int     nitems;     /* number of record-batches */
-    int     nfields;    /* number of columns */
-    arrowFieldStatsBinary fields[FLEXIBLE_ARRAY_MEMBER];
-} arrowStatsBinary;
-
-static void
-__releaseArrowFieldStatsBinary(arrowFieldStatsBinary *bstats)
-{
-    if (bstats->subfields)
-    {
-        for (int j=0; j < bstats->nfields; j++)
-            __releaseArrowFieldStatsBinary(&bstats->subfields[j]);
-        pfree(bstats->subfields);
-    }
-    if (bstats->stat_values)
-        pfree(bstats->stat_values);
-}
-
-static void
-releaseArrowStatsBinary(arrowStatsBinary *arrow_bstats)
-{
-    if (arrow_bstats)
-    {
-        for (int j=0; j < arrow_bstats->nfields; j++)
-            __releaseArrowFieldStatsBinary(&arrow_bstats->fields[j]);
-        pfree(arrow_bstats);
-    }
-}
-
-static int128_t
-__atoi128(const char *tok, bool *p_isnull)
-{
-    int128_t    ival = 0;
-    bool        is_minus = false;
-
-    if (*tok == '-')
-    {
-        is_minus = true;
-        tok++;
-    }
-    while (isdigit(*tok))
-    {
-        ival = 10 * ival + (*tok - '0');
-        tok++;
-    }
-
-    if (*tok != '\0')
-        *p_isnull = true;
-    if (is_minus)
-    {
-        if (ival == 0)
-            *p_isnull = true;
-        ival = -ival;
-    }
-    return ival;
-}
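/*
 * For illustration only: the "min_values"/"max_values" custom-metadata keys
 * carry one token per record-batch, and __parseArrowFieldStatsBinary() below
 * splits them with strtok_r() and converts each token with __atoi128().
 * A minimal, self-contained sketch of that convention follows; main(), the
 * demo buffer, and the long-long narrowing are hypothetical, not part of
 * arrow_fdw:
 */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static __int128
atoi128_demo(const char *tok, bool *p_isnull)
{
    __int128    ival = 0;
    bool        is_minus = false;

    if (*tok == '-')
    {
        is_minus = true;
        tok++;
    }
    while (isdigit((unsigned char)*tok))
        ival = 10 * ival + (*tok++ - '0');
    if (*tok != '\0')
        *p_isnull = true;   /* non-numeric token -> treated as NULL */
    return is_minus ? -ival : ival;
}

int main(void)
{
    char    min_buf[] = "100,2000,-5";  /* one minimum per record-batch */
    char   *tok, *pos;
    int     rb_index = 0;

    for (tok = strtok_r(min_buf, ",", &pos); tok != NULL;
         tok = strtok_r(NULL, ",", &pos))
    {
        bool        isnull = false;
        long long   value = (long long)atoi128_demo(tok, &isnull);

        printf("record-batch %d: min=%lld%s\n",
               rb_index++, value, isnull ? " (null)" : "");
    }
    return 0;
}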
-
-static bool
-__parseArrowFieldStatsBinary(arrowFieldStatsBinary *bstats,
-                             ArrowField *field,
-                             const char *min_tokens,
-                             const char *max_tokens)
-{
-    MinMaxStatDatum *stat_values;
-    char       *min_buffer;
-    char       *max_buffer;
-    char       *tok1, *pos1;
-    char       *tok2, *pos2;
-    uint32_t    index;
-
-    /* parse the min_tokens/max_tokens */
-    min_buffer = alloca(strlen(min_tokens) + 1);
-    max_buffer = alloca(strlen(max_tokens) + 1);
-    strcpy(min_buffer, min_tokens);
-    strcpy(max_buffer, max_tokens);
-
-    stat_values = palloc0(sizeof(MinMaxStatDatum) * bstats->nrooms);
-    for (tok1 = strtok_r(min_buffer, ",", &pos1),
-         tok2 = strtok_r(max_buffer, ",", &pos2), index = 0;
-         tok1 != NULL && tok2 != NULL && index < bstats->nrooms;
-         tok1 = strtok_r(NULL, ",", &pos1),
-         tok2 = strtok_r(NULL, ",", &pos2), index++)
-    {
-        bool        __isnull = false;
-        int128_t    __min = __atoi128(__trim(tok1), &__isnull);
-        int128_t    __max = __atoi128(__trim(tok2), &__isnull);
-
-        if (__isnull)
-        {
-            stat_values[index].isnull = true;
-            continue;
-        }
-
-        switch (field->type.node.tag)
-        {
-            case ArrowNodeTag__Int:
-            case ArrowNodeTag__FloatingPoint:
-                stat_values[index].min.datum = (Datum)__min;
-                stat_values[index].max.datum = (Datum)__max;
-                break;
-
-            case ArrowNodeTag__Decimal:
-                __xpu_numeric_to_varlena((char *)&stat_values[index].min.numeric,
-                                         field->type.Decimal.scale,
-                                         __min);
-                __xpu_numeric_to_varlena((char *)&stat_values[index].max.numeric,
-                                         field->type.Decimal.scale,
-                                         __max);
-                break;
-
-            case ArrowNodeTag__Date:
-                switch (field->type.Date.unit)
-                {
-                    case ArrowDateUnit__Day:
-                        stat_values[index].min.datum = __min
-                            - (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
-                        stat_values[index].max.datum = __max
-                            - (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
-                        break;
-                    case ArrowDateUnit__MilliSecond:
-                        stat_values[index].min.datum = __min / (SECS_PER_DAY * 1000)
-                            - (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
-                        stat_values[index].max.datum = __max / (SECS_PER_DAY * 1000)
-                            - (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
-                        break;
-                    default:
-                        goto bailout;
-                }
-                break;
-
-            case ArrowNodeTag__Time:
-                switch (field->type.Time.unit)
-                {
-                    case ArrowTimeUnit__Second:
-                        stat_values[index].min.datum = __min * 1000000L;
-                        stat_values[index].max.datum = __max * 1000000L;
-                        break;
-                    case ArrowTimeUnit__MilliSecond:
-                        stat_values[index].min.datum = __min * 1000L;
-                        stat_values[index].max.datum = __max * 1000L;
-                        break;
-                    case ArrowTimeUnit__MicroSecond:
-                        stat_values[index].min.datum = __min;
-                        stat_values[index].max.datum = __max;
-                        break;
-                    case ArrowTimeUnit__NanoSecond:
-                        stat_values[index].min.datum = __min / 1000;
-                        stat_values[index].max.datum = __max / 1000;
-                        break;
-                    default:
-                        goto bailout;
-                }
-                break;
-
-            case ArrowNodeTag__Timestamp:
-                switch (field->type.Timestamp.unit)
-                {
-                    case ArrowTimeUnit__Second:
-                        stat_values[index].min.datum = __min * 1000000L;
-                        stat_values[index].max.datum = __max * 1000000L;
-                        break;
-                    case ArrowTimeUnit__MilliSecond:
-                        stat_values[index].min.datum = __min * 1000L;
-                        stat_values[index].max.datum = __max * 1000L;
-                        break;
-                    case ArrowTimeUnit__MicroSecond:
-                        stat_values[index].min.datum = __min;
-                        stat_values[index].max.datum = __max;
-                        break;
-                    case ArrowTimeUnit__NanoSecond:
-                        stat_values[index].min.datum = __min / 1000;
-                        stat_values[index].max.datum = __max / 1000;
-                        break;
-                    default:
-                        goto bailout;
-                }
-                break;
-            default:
-                goto bailout;
-        }
-    }
-    /* sanity checks */
-    if (!tok1 && !tok2 && index == bstats->nrooms)
-    {
-        bstats->stat_values = stat_values;
-        return true;
-    }
-bailout:
-    pfree(stat_values);
-    return false;
-}
-
-static bool
-__buildArrowFieldStatsBinary(arrowFieldStatsBinary *bstats,
-                             ArrowField *field,
-                             uint32 numRecordBatches)
-{
-    const char *min_tokens = NULL;
-    const char *max_tokens = NULL;
-    int         j, k;
-    bool        retval = false;
-
-    for (k=0; k < field->_num_custom_metadata; k++)
-    {
-        ArrowKeyValue *kv = &field->custom_metadata[k];
-
-        if (strcmp(kv->key, "min_values") == 0)
-            min_tokens = kv->value;
-        else if (strcmp(kv->key, "max_values") == 0)
-            max_tokens = kv->value;
-    }
-
-    bstats->nrooms = numRecordBatches;
-    if (min_tokens && max_tokens)
-    {
-        if (__parseArrowFieldStatsBinary(bstats, field,
-                                         min_tokens,
-                                         max_tokens))
-        {
-            retval = true;
-        }
-    }
-
-    if (field->_num_children > 0)
-    {
-        bstats->nfields = field->_num_children;
-        bstats->subfields = palloc0(sizeof(arrowFieldStatsBinary) * bstats->nfields);
-        for (j=0; j < bstats->nfields; j++)
-        {
-            if (__buildArrowFieldStatsBinary(&bstats->subfields[j],
-                                             &field->children[j],
-                                             numRecordBatches))
-                retval = true;
-        }
-    }
-    return retval;
-}
-
-static arrowStatsBinary *
-buildArrowStatsBinary(const ArrowFooter *footer, Bitmapset **p_stat_attrs)
-{
-    arrowStatsBinary *arrow_bstats;
-    int     nfields = footer->schema._num_fields;
-    bool    found = false;
-
-    arrow_bstats = palloc0(offsetof(arrowStatsBinary,
-                                    fields[nfields]));
-    arrow_bstats->nitems = footer->_num_recordBatches;
-    arrow_bstats->nfields = nfields;
-    for (int j=0; j < nfields; j++)
-    {
-        if (__buildArrowFieldStatsBinary(&arrow_bstats->fields[j],
-                                         &footer->schema.fields[j],
-                                         footer->_num_recordBatches))
-        {
-            if (p_stat_attrs)
-                *p_stat_attrs = bms_add_member(*p_stat_attrs, j+1);
-            found = true;
-        }
-    }
-    if (!found)
-    {
-        releaseArrowStatsBinary(arrow_bstats);
-        return NULL;
-    }
-    return arrow_bstats;
-}
-
-/*
- * applyArrowStatsBinary
- */
-static void
-__applyArrowFieldStatsBinary(RecordBatchFieldState *rb_field,
-                             arrowFieldStatsBinary *bstats,
-                             int rb_index)
-{
-    int     j;
-
-    if (bstats->stat_values)
-    {
-        memcpy(&rb_field->stat_datum,
-               &bstats->stat_values[rb_index], sizeof(MinMaxStatDatum));
-    }
-    else
-    {
-        rb_field->stat_datum.isnull = true;
-    }
-    Assert(rb_field->num_children == bstats->nfields);
-    for (j=0; j < rb_field->num_children; j++)
-    {
-        RecordBatchFieldState *__rb_field = &rb_field->children[j];
-        arrowFieldStatsBinary *__bstats = &bstats->subfields[j];
-
-        __applyArrowFieldStatsBinary(__rb_field, __bstats, rb_index);
-    }
-}
-
-static void
-applyArrowStatsBinary(RecordBatchState *rb_state, arrowStatsBinary *arrow_bstats)
-{
-    Assert(rb_state->nfields == arrow_bstats->nfields &&
-           rb_state->rb_index < arrow_bstats->nitems);
-    for (int j=0; j < rb_state->nfields; j++)
-    {
-        __applyArrowFieldStatsBinary(&rb_state->fields[j],
-                                     &arrow_bstats->fields[j],
-                                     rb_state->rb_index);
-    }
-}
-
-/*
- * execInitArrowStatsHint / execCheckArrowStatsHint / execEndArrowStatsHint
- *
- * ... are executor routines for min/max statistics.
- */
-static bool
-__buildArrowStatsOper(arrowStatsHint *as_hint,
-                      ScanState *ss,
-                      OpExpr *op,
-                      bool reverse)
-{
-    Index       scanrelid = ((Scan *)ss->ps.plan)->scanrelid;
-    Oid         opcode;
-    Var        *var;
-    Node       *arg;
-    Expr       *expr;
-    Oid         opfamily = InvalidOid;
-    StrategyNumber strategy = InvalidStrategy;
-    CatCList   *catlist;
-    int         i;
-
-    if (!reverse)
-    {
-        opcode = op->opno;
-        var = linitial(op->args);
-        arg = lsecond(op->args);
-    }
-    else
-    {
-        opcode = get_commutator(op->opno);
-        var = lsecond(op->args);
-        arg = linitial(op->args);
-    }
-    /* Is it VAR <OP> ARG form? */
-    if (!IsA(var, Var) || var->varno != scanrelid || !OidIsValid(opcode))
-        return false;
-    if (!bms_is_member(var->varattno, as_hint->stat_attrs))
-        return false;
-    if (contain_var_clause(arg) ||
-        contain_volatile_functions(arg))
-        return false;
-
-    catlist = SearchSysCacheList1(AMOPOPID, ObjectIdGetDatum(opcode));
-    for (i=0; i < catlist->n_members; i++)
-    {
-        HeapTuple   tuple = &catlist->members[i]->tuple;
-        Form_pg_amop amop = (Form_pg_amop) GETSTRUCT(tuple);
-
-        if (amop->amopmethod == BRIN_AM_OID)
-        {
-            opfamily = amop->amopfamily;
-            strategy = amop->amopstrategy;
-            break;
-        }
-    }
-    ReleaseSysCacheList(catlist);
-
-    if (strategy == BTLessStrategyNumber ||
-        strategy == BTLessEqualStrategyNumber)
-    {
-        /* if (VAR < ARG) --> (Min >= ARG), can be skipped */
-        /* if (VAR <= ARG) --> (Min > ARG), can be skipped */
-        opcode = get_negator(opcode);
-        if (!OidIsValid(opcode))
-            return false;
-        expr = make_opclause(opcode,
-                             op->opresulttype,
-                             op->opretset,
-                             (Expr *)makeVar(INNER_VAR,
-                                             var->varattno,
-                                             var->vartype,
-                                             var->vartypmod,
-                                             var->varcollid,
-                                             0),
-                             (Expr *)copyObject(arg),
-                             op->opcollid,
-                             op->inputcollid);
-        set_opfuncid((OpExpr *)expr);
-        as_hint->eval_quals = lappend(as_hint->eval_quals, expr);
-    }
-    else if (strategy == BTGreaterEqualStrategyNumber ||
-             strategy == BTGreaterStrategyNumber)
-    {
-        /* if (VAR > ARG) --> (Max <= ARG), can be skipped */
-        /* if (VAR >= ARG) --> (Max < ARG), can be skipped */
-        opcode = get_negator(opcode);
-        if (!OidIsValid(opcode))
-            return false;
-        expr = make_opclause(opcode,
-                             op->opresulttype,
-                             op->opretset,
-                             (Expr *)makeVar(OUTER_VAR,
-                                             var->varattno,
-                                             var->vartype,
-                                             var->vartypmod,
-                                             var->varcollid,
-                                             0),
-                             (Expr *)copyObject(arg),
-                             op->opcollid,
-                             op->inputcollid);
-        set_opfuncid((OpExpr *)expr);
-        as_hint->eval_quals = lappend(as_hint->eval_quals, expr);
-    }
-    else if (strategy == BTEqualStrategyNumber)
-    {
-        /* (VAR = ARG) --> (Min > ARG) || (Max < ARG), can be skipped */
-        opcode = get_opfamily_member(opfamily, var->vartype,
-                                     exprType((Node *)arg),
-                                     BTGreaterStrategyNumber);
-        expr = make_opclause(opcode,
-                             op->opresulttype,
-                             op->opretset,
-                             (Expr *)makeVar(INNER_VAR,
-                                             var->varattno,
-                                             var->vartype,
-                                             var->vartypmod,
-                                             var->varcollid,
-                                             0),
-                             (Expr *)copyObject(arg),
-                             op->opcollid,
-                             op->inputcollid);
-        set_opfuncid((OpExpr *)expr);
-        as_hint->eval_quals = lappend(as_hint->eval_quals, expr);
-
-        opcode = get_opfamily_member(opfamily, var->vartype,
-                                     exprType((Node *)arg),
-                                     BTLessStrategyNumber);
-        expr = make_opclause(opcode,
-                             op->opresulttype,
-                             op->opretset,
-                             (Expr *)makeVar(OUTER_VAR,
-                                             var->varattno,
-                                             var->vartype,
-                                             var->vartypmod,
-                                             var->varcollid,
-                                             0),
-                             (Expr *)copyObject(arg),
-                             op->opcollid,
-                             op->inputcollid);
-        set_opfuncid((OpExpr *)expr);
-        as_hint->eval_quals = lappend(as_hint->eval_quals, expr);
-    }
-    else
-    {
-        return false;
-    }
-    as_hint->load_attrs = bms_add_member(as_hint->load_attrs, var->varattno);
-
-    return true;
-}
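/*
 * How the rewritten quals above behave at scan time: execCheckArrowStatsHint()
 * below binds the per-batch minimums to the INNER slot and the maximums to
 * the OUTER slot, and a record-batch is skipped when the OR'ed conditions
 * hold.  A minimal, self-contained sketch of that decision (the demo values
 * and function names are hypothetical); note the equality case must use the
 * strict '<' against the maximum, since a batch with max == ARG may still
 * contain matching rows:
 */
#include <stdbool.h>
#include <stdio.h>

/* predicate: VAR < ARG -- skip when the negator (min >= ARG) holds */
static bool
skip_lt(long batch_min, long arg)
{
    return batch_min >= arg;
}

/* predicate: VAR = ARG -- skip when ARG lies outside [min, max] */
static bool
skip_eq(long batch_min, long batch_max, long arg)
{
    return batch_min > arg || batch_max < arg;
}

int main(void)
{
    /* a record-batch whose min/max statistics are 100 and 200 */
    printf("x < 100 : skip=%d\n", skip_lt(100, 100));        /* 1 */
    printf("x = 50  : skip=%d\n", skip_eq(100, 200, 50));    /* 1 */
    printf("x = 200 : skip=%d\n", skip_eq(100, 200, 200));   /* 0: may match */
    return 0;
}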
-
-static arrowStatsHint *
-execInitArrowStatsHint(ScanState *ss, List *outer_quals, Bitmapset *stat_attrs)
-{
-    Relation    relation = ss->ss_currentRelation;
-    TupleDesc   tupdesc = RelationGetDescr(relation);
-    arrowStatsHint *as_hint;
-    ExprContext *econtext;
-    Expr       *eval_expr;
-    ListCell   *lc;
-
-    as_hint = palloc0(sizeof(arrowStatsHint));
-    as_hint->stat_attrs = stat_attrs;
-    foreach (lc, outer_quals)
-    {
-        OpExpr *op = lfirst(lc);
-
-        if (IsA(op, OpExpr) && list_length(op->args) == 2 &&
-            (__buildArrowStatsOper(as_hint, ss, op, false) ||
-             __buildArrowStatsOper(as_hint, ss, op, true)))
-        {
-            as_hint->orig_quals = lappend(as_hint->orig_quals, op);
-        }
-    }
-    if (as_hint->eval_quals == NIL)
-        return NULL;
-    if (list_length(as_hint->eval_quals) == 1)
-        eval_expr = linitial(as_hint->eval_quals);
-    else
-        eval_expr = make_orclause(as_hint->eval_quals);
-
-    econtext = CreateExprContext(ss->ps.state);
-    econtext->ecxt_innertuple = MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual);
-    econtext->ecxt_outertuple = MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual);
-
-    as_hint->eval_state = ExecInitExpr(eval_expr, &ss->ps);
-    as_hint->econtext = econtext;
-
-    return as_hint;
-}
-
-static bool
-execCheckArrowStatsHint(arrowStatsHint *stats_hint,
-                        RecordBatchState *rb_state)
-{
-    ExprContext *econtext = stats_hint->econtext;
-    TupleTableSlot *min_values = econtext->ecxt_innertuple;
-    TupleTableSlot *max_values = econtext->ecxt_outertuple;
-    int         anum;
-    Datum       datum;
-    bool        isnull;
-
-    /* load the min/max statistics */
-    ExecStoreAllNullTuple(min_values);
-    ExecStoreAllNullTuple(max_values);
-    for (anum = bms_next_member(stats_hint->load_attrs, -1);
-         anum >= 0;
-         anum = bms_next_member(stats_hint->load_attrs, anum))
-    {
-        RecordBatchFieldState *rb_field = &rb_state->fields[anum-1];
-
-        Assert(anum > 0 && anum <= rb_state->nfields);
-        if (!rb_field->stat_datum.isnull)
-        {
-            min_values->tts_isnull[anum-1] = false;
-            max_values->tts_isnull[anum-1] = false;
-            if (rb_field->atttypid == NUMERICOID)
-            {
-                min_values->tts_values[anum-1]
-                    = PointerGetDatum(&rb_field->stat_datum.min.numeric);
-                max_values->tts_values[anum-1]
-                    = PointerGetDatum(&rb_field->stat_datum.max.numeric);
-            }
-            else
-            {
-                min_values->tts_values[anum-1] = rb_field->stat_datum.min.datum;
-                max_values->tts_values[anum-1] = rb_field->stat_datum.max.datum;
-            }
-        }
-    }
-    datum = ExecEvalExprSwitchContext(stats_hint->eval_state, econtext, &isnull);
-//  elog(INFO, "file [%s] rb_index=%u datum=%lu isnull=%d",
-//       FilePathName(rb_state->fdesc), rb_state->rb_index, datum, (int)isnull);
-    if (!isnull && DatumGetBool(datum))
-        return true;    /* ok, skip this record-batch */
-    return false;
-}
-
-static void
-execEndArrowStatsHint(arrowStatsHint *stats_hint)
-{
-    ExprContext *econtext = stats_hint->econtext;
-
-    ExecDropSingleTupleTableSlot(econtext->ecxt_innertuple);
-    ExecDropSingleTupleTableSlot(econtext->ecxt_outertuple);
-    econtext->ecxt_innertuple = NULL;
-    econtext->ecxt_outertuple = NULL;
-
-    FreeExprContext(econtext, true);
-}
-
-/* ----------------------------------------------------------------
- *
- * BuildArrowFileState
- *
- * It builds RecordBatchState based on the metadata-cache, or on the
- * raw Arrow file.
- * ----------------------------------------------------------------
- */
-static void
-__buildRecordBatchFieldStateByCache(RecordBatchFieldState *rb_field,
-                                    arrowMetadataFieldCache *fcache)
-{
-    rb_field->atttypid = fcache->atttypid;
-    rb_field->atttypmod = fcache->atttypmod;
-    rb_field->attopts = fcache->attopts;
-    rb_field->nitems = fcache->nitems;
-    rb_field->null_count = fcache->null_count;
-    rb_field->nullmap_offset = fcache->nullmap_offset;
-    rb_field->nullmap_length = fcache->nullmap_length;
-    rb_field->values_offset = fcache->values_offset;
-    rb_field->values_length = fcache->values_length;
-    rb_field->extra_offset = fcache->extra_offset;
-    rb_field->extra_length = fcache->extra_length;
-    memcpy(&rb_field->stat_datum,
-           &fcache->stat_datum, sizeof(MinMaxStatDatum));
-    if (fcache->num_children > 0)
-    {
-        dlist_iter  iter;
-        int         j = 0;
-
-        rb_field->num_children = fcache->num_children;
-        rb_field->children = palloc0(sizeof(RecordBatchFieldState) *
-                                     fcache->num_children);
-        dlist_foreach(iter, &fcache->children)
-        {
-            arrowMetadataFieldCache *__fcache
-                = dlist_container(arrowMetadataFieldCache, chain, iter.cur);
-            __buildRecordBatchFieldStateByCache(&rb_field->children[j++], __fcache);
-        }
-        Assert(j == rb_field->num_children);
-    }
-    else
-    {
-        Assert(dlist_is_empty(&fcache->children));
-    }
-}
-
-static ArrowFileState *
-__buildArrowFileStateByCache(const char *filename,
-                             arrowMetadataCache *mcache,
-                             Bitmapset **p_stat_attrs)
-{
-    ArrowFileState *af_state;
-
-    af_state = palloc0(sizeof(ArrowFileState));
-    af_state->filename = pstrdup(filename);
-    memcpy(&af_state->stat_buf, &mcache->stat_buf, sizeof(struct stat));
-
-    while (mcache)
-    {
-        RecordBatchState *rb_state;
-        dlist_iter  iter;
-        int         j = 0;
-
-        rb_state = palloc0(offsetof(RecordBatchState,
-                                    fields[mcache->nfields]));
-        rb_state->af_state = af_state;
-        rb_state->rb_index = mcache->rb_index;
-        rb_state->rb_offset = mcache->rb_offset;
-        rb_state->rb_length = mcache->rb_length;
-        rb_state->rb_nitems = mcache->rb_nitems;
-        rb_state->nfields = mcache->nfields;
-        dlist_foreach(iter, &mcache->fields)
-        {
-            arrowMetadataFieldCache *fcache;
-
-            fcache = dlist_container(arrowMetadataFieldCache, chain, iter.cur);
-            if (p_stat_attrs && !fcache->stat_datum.isnull)
-                *p_stat_attrs = bms_add_member(*p_stat_attrs, j+1);
-            __buildRecordBatchFieldStateByCache(&rb_state->fields[j++], fcache);
-        }
-        Assert(j == rb_state->nfields);
-        af_state->rb_list = lappend(af_state->rb_list, rb_state);
-
-        mcache = mcache->next;
-    }
-    return af_state;
-}
-
-/*
- * Routines to setup RecordBatchState by raw-file
- */
-typedef struct
-{
-    ArrowBuffer    *buffer_curr;
-    ArrowBuffer    *buffer_tail;
-    ArrowFieldNode *fnode_curr;
-    ArrowFieldNode *fnode_tail;
-} setupRecordBatchContext;
-
-static Oid
-__lookupCompositePGType(int nattrs, Oid *type_oids, Oid hint_oid)
-{
-    Relation    rel;
-    ScanKeyData skeys[3];
-    SysScanDesc sscan;
-    Oid         comp_oid = InvalidOid;
-
-    rel = table_open(RelationRelationId, AccessShareLock);
-    ScanKeyInit(&skeys[0],
-                Anum_pg_class_relkind,
-                BTEqualStrategyNumber, F_CHAREQ,
-                CharGetDatum(RELKIND_COMPOSITE_TYPE));
-    ScanKeyInit(&skeys[1],
-                Anum_pg_class_relnatts,
-                BTEqualStrategyNumber, F_INT2EQ,
-                Int16GetDatum(nattrs));
-    ScanKeyInit(&skeys[2],
-                Anum_pg_class_oid,
-                BTEqualStrategyNumber, F_OIDNE,
-                ObjectIdGetDatum(hint_oid));
-    sscan = systable_beginscan(rel, InvalidOid, false, NULL,
-                               OidIsValid(hint_oid) ?
3 : 2, skeys);
-    for (;;)
-    {
-        HeapTuple   htup;
-        TupleDesc   tupdesc;
-        int         j;
-
-        if (OidIsValid(hint_oid))
-        {
-            comp_oid = hint_oid;
-            hint_oid = InvalidOid;
-        }
-        else
-        {
-            htup = systable_getnext(sscan);
-            if (!HeapTupleIsValid(htup))
-                break;
-            comp_oid = ((Form_pg_class) GETSTRUCT(htup))->reltype;
-        }
-
-        if (pg_type_aclcheck(comp_oid,
-                             GetUserId(),
-                             ACL_USAGE) != ACLCHECK_OK)
-            continue;
-
-        tupdesc = lookup_rowtype_tupdesc_noerror(comp_oid, -1, true);
-        if (!tupdesc)
-            continue;
-        if (tupdesc->natts == nattrs)
-        {
-            for (j=0; j < tupdesc->natts; j++)
-            {
-                Form_pg_attribute attr = TupleDescAttr(tupdesc, j);
-
-                if (attr->atttypid != type_oids[j])
-                    break;
-            }
-            if (j == tupdesc->natts)
-            {
-                ReleaseTupleDesc(tupdesc);
-                goto found;
-            }
-        }
-        ReleaseTupleDesc(tupdesc);
-    }
-    comp_oid = InvalidOid;  /* not found */
-found:
-    systable_endscan(sscan);
-    table_close(rel, AccessShareLock);
-
-    return comp_oid;
-}
-
-static void
-__arrowFieldTypeToPGType(const ArrowField *field,
-                         Oid *p_type_oid,
-                         int32_t *p_type_mod,
-                         ArrowTypeOptions *p_attopts)
-{
-    const ArrowType *t = &field->type;
-    Oid         type_oid = InvalidOid;
-    int32_t     type_mod = -1;
-    Oid         hint_oid = arrowFieldGetPGTypeHint(field);
-    ArrowTypeOptions attopts;
-
-    memset(&attopts, 0, sizeof(ArrowTypeOptions));
-    switch (t->node.tag)
-    {
-        case ArrowNodeTag__Int:
-            attopts.tag = ArrowType__Int;
-            switch (t->Int.bitWidth)
-            {
-                case 8:
-                    attopts.unitsz = sizeof(int8_t);
-                    type_oid =
-                        GetSysCacheOid2(TYPENAMENSP,
-                                        Anum_pg_type_oid,
-                                        CStringGetDatum("int1"),
-                                        ObjectIdGetDatum(PG_CATALOG_NAMESPACE));
-                    break;
-                case 16:
-                    attopts.unitsz = sizeof(int16_t);
-                    type_oid = INT2OID;
-                    break;
-                case 32:
-                    attopts.unitsz = sizeof(int32_t);
-                    type_oid = INT4OID;
-                    break;
-                case 64:
-                    attopts.unitsz = sizeof(int64_t);
-                    type_oid = INT8OID;
-                    break;
-                default:
-                    elog(ERROR, "Arrow::Int bitWidth=%d is not supported",
-                         t->Int.bitWidth);
-            }
-            attopts.integer.bitWidth = t->Int.bitWidth;
-            attopts.integer.is_signed = t->Int.is_signed;
-            break;
-
-        case ArrowNodeTag__FloatingPoint:
-            attopts.tag = ArrowType__FloatingPoint;
-            switch (t->FloatingPoint.precision)
-            {
-                case ArrowPrecision__Half:
-                    attopts.unitsz = sizeof(float2_t);
-                    type_oid =
-                        GetSysCacheOid2(TYPENAMENSP,
-                                        Anum_pg_type_oid,
-                                        CStringGetDatum("float2"),
-                                        ObjectIdGetDatum(PG_CATALOG_NAMESPACE));
-                    break;
-                case ArrowPrecision__Single:
-                    attopts.unitsz = sizeof(float4_t);
-                    type_oid = FLOAT4OID;
-                    break;
-                case ArrowPrecision__Double:
-                    attopts.unitsz = sizeof(float8_t);
-                    type_oid = FLOAT8OID;
-                    break;
-                default:
-                    elog(ERROR, "Arrow::FloatingPoint unknown precision (%d)",
-                         (int)t->FloatingPoint.precision);
-            }
-            attopts.floating_point.precision = t->FloatingPoint.precision;
-            break;
-
-        case ArrowNodeTag__Bool:
-            attopts.tag = ArrowType__Bool;
-            attopts.unitsz = -1;    /* values is bitmap */
-            type_oid = BOOLOID;
-            break;
-
-        case ArrowNodeTag__Decimal:
-            if (t->Decimal.bitWidth != 128)
-                elog(ERROR, "Arrow::Decimal%u is not supported", t->Decimal.bitWidth);
-            attopts.tag = ArrowType__Decimal;
-            attopts.unitsz = sizeof(int128_t);
-            attopts.decimal.precision = t->Decimal.precision;
-            attopts.decimal.scale = t->Decimal.scale;
-            attopts.decimal.bitWidth = t->Decimal.bitWidth;
-            type_oid = NUMERICOID;
-            break;
-
-        case ArrowNodeTag__Date:
-            attopts.tag = ArrowType__Date;
-            switch (t->Date.unit)
-            {
-                case ArrowDateUnit__Day:
-                    attopts.unitsz = sizeof(int32_t);
-                    break;
-                case ArrowDateUnit__MilliSecond:
-                    attopts.unitsz = sizeof(int64_t);
-                    break;
-                default:
-                    elog(ERROR, "Arrow::Date 
unknown unit (%d)", - (int)t->Date.unit); - } - attopts.date.unit = t->Date.unit; - type_oid = DATEOID; - break; - - case ArrowNodeTag__Time: - attopts.tag = ArrowType__Time; - switch (t->Time.unit) - { - case ArrowTimeUnit__Second: - case ArrowTimeUnit__MilliSecond: - attopts.unitsz = sizeof(int32_t); - break; - case ArrowTimeUnit__MicroSecond: - case ArrowTimeUnit__NanoSecond: - attopts.unitsz = sizeof(int64_t); - break; - default: - elog(ERROR, "unknown Time::unit (%d)", - (int)t->Time.unit); - } - attopts.time.unit = t->Time.unit; - type_oid = TIMEOID; - break; - - case ArrowNodeTag__Timestamp: - attopts.tag = ArrowType__Timestamp; - switch (t->Timestamp.unit) - { - case ArrowTimeUnit__Second: - case ArrowTimeUnit__MilliSecond: - case ArrowTimeUnit__MicroSecond: - case ArrowTimeUnit__NanoSecond: - attopts.unitsz = sizeof(int64_t); - break; - default: - elog(ERROR, "unknown Timestamp::unit (%d)", - (int)t->Timestamp.unit); - } - attopts.timestamp.unit = t->Timestamp.unit; - type_oid = (t->Timestamp.timezone - ? TIMESTAMPTZOID - : TIMESTAMPOID); - break; - - case ArrowNodeTag__Interval: - attopts.tag = ArrowType__Interval; - switch (t->Interval.unit) - { - case ArrowIntervalUnit__Year_Month: - attopts.unitsz = sizeof(int32_t); - break; - case ArrowIntervalUnit__Day_Time: - attopts.unitsz = sizeof(int64_t); - break; - default: - elog(ERROR, "unknown Interval::unit (%d)", - (int)t->Interval.unit); - } - attopts.interval.unit = t->Interval.unit; - type_oid = INTERVALOID; - break; - - case ArrowNodeTag__FixedSizeBinary: - attopts.tag = ArrowType__FixedSizeBinary; - attopts.unitsz = t->FixedSizeBinary.byteWidth; - attopts.fixed_size_binary.byteWidth = t->FixedSizeBinary.byteWidth; - if (t->FixedSizeBinary.byteWidth <= 0 || - t->FixedSizeBinary.byteWidth > BLCKSZ) - elog(ERROR, "arrow_fdw: %s with byteWidth=%d is not supported", - t->node.tagName, - t->FixedSizeBinary.byteWidth); - if (hint_oid == MACADDROID && - t->FixedSizeBinary.byteWidth == 6) - { - type_oid = MACADDROID; - } - else if (hint_oid == INETOID && - (t->FixedSizeBinary.byteWidth == 4 || - t->FixedSizeBinary.byteWidth == 16)) - { - type_oid = INETOID; - } - else - { - type_oid = BPCHAROID; - type_mod = VARHDRSZ + t->FixedSizeBinary.byteWidth; - } - break; - - case ArrowNodeTag__Utf8: - attopts.tag = ArrowType__Utf8; - attopts.unitsz = sizeof(uint32_t); - type_oid = TEXTOID; - break; - - case ArrowNodeTag__LargeUtf8: - attopts.tag = ArrowType__LargeUtf8; - attopts.unitsz = sizeof(uint64_t); - type_oid = TEXTOID; - break; - - case ArrowNodeTag__Binary: - attopts.tag = ArrowType__Binary; - attopts.unitsz = sizeof(uint32_t); - type_oid = BYTEAOID; - break; - - case ArrowNodeTag__LargeBinary: - attopts.tag = ArrowType__LargeBinary; - attopts.unitsz = sizeof(uint64_t); - type_oid = BYTEAOID; - break; - - case ArrowNodeTag__List: - case ArrowNodeTag__LargeList: - if (field->_num_children != 1) - elog(ERROR, "Bug? List of arrow type is corrupted"); - else - { - Oid __type_oid = InvalidOid; - - attopts.tag = ArrowType__List; - attopts.unitsz = (t->node.tag == ArrowNodeTag__List - ? 
sizeof(uint32_t) - : sizeof(uint64_t)); - __arrowFieldTypeToPGType(&field->children[0], - &__type_oid, - NULL, - NULL); - type_oid = get_array_type(__type_oid); - if (!OidIsValid(type_oid)) - elog(ERROR, "arrow_fdw: no array type for '%s'", - format_type_be(__type_oid)); - } - break; - - case ArrowNodeTag__Struct: - { - Oid *__type_oids; - - attopts.tag = ArrowType__Struct; - attopts.unitsz = 0; /* only nullmap */ - __type_oids = alloca(sizeof(Oid) * (field->_num_children + 1)); - for (int j=0; j < field->_num_children; j++) - { - __arrowFieldTypeToPGType(&field->children[j], - &__type_oids[j], - NULL, - NULL); - } - type_oid = __lookupCompositePGType(field->_num_children, - __type_oids, - hint_oid); - if (!OidIsValid(type_oid)) - elog(ERROR, "arrow_fdw: no suitable composite type"); - } - break; - - default: - elog(ERROR, "Bug? ArrowSchema contains unsupported types"); - } - - if (p_type_oid) - *p_type_oid = type_oid; - if (p_type_mod) - *p_type_mod = type_mod; - if (p_attopts) - memcpy(p_attopts, &attopts, sizeof(ArrowTypeOptions)); -} - -static void -__buildRecordBatchFieldState(setupRecordBatchContext *con, - RecordBatchFieldState *rb_field, - ArrowField *field, int depth) -{ - ArrowFieldNode *fnode; - ArrowBuffer *buffer_curr; - size_t least_values_length = 0; - bool has_extra_buffer = false; - - if (con->fnode_curr >= con->fnode_tail) - elog(ERROR, "RecordBatch has less ArrowFieldNode than expected"); - fnode = con->fnode_curr++; - rb_field->atttypid = InvalidOid; - rb_field->atttypmod = -1; - rb_field->nitems = fnode->length; - rb_field->null_count = fnode->null_count; - rb_field->stat_datum.isnull = true; - __arrowFieldTypeToPGType(field, - &rb_field->atttypid, - &rb_field->atttypmod, - &rb_field->attopts); - /* assign buffers */ - switch (field->type.node.tag) - { - case ArrowNodeTag__Bool: - least_values_length = BITMAPLEN(rb_field->nitems); - break; - case ArrowNodeTag__Int: - case ArrowNodeTag__FloatingPoint: - case ArrowNodeTag__Decimal: - case ArrowNodeTag__Date: - case ArrowNodeTag__Time: - case ArrowNodeTag__Timestamp: - case ArrowNodeTag__Interval: - case ArrowNodeTag__FixedSizeBinary: - least_values_length = rb_field->attopts.unitsz * rb_field->nitems; - break; - - case ArrowNodeTag__Utf8: - case ArrowNodeTag__LargeUtf8: - case ArrowNodeTag__Binary: - case ArrowNodeTag__LargeBinary: - least_values_length = rb_field->attopts.unitsz * (rb_field->nitems + 1); - has_extra_buffer = true; - break; - - case ArrowNodeTag__List: - case ArrowNodeTag__LargeList: - if (depth > 0) - elog(ERROR, "nested array type is not supported"); - least_values_length = rb_field->attopts.unitsz * (rb_field->nitems + 1); - break; - - case ArrowNodeTag__Struct: - if (depth > 0) - elog(ERROR, "nested composite type is not supported"); - /* no values and extra buffer, only nullmap */ - break; - default: - elog(ERROR, "Bug? 
ArrowSchema contains unsupported types");
-    }
-
-    /* setup nullmap buffer */
-    buffer_curr = con->buffer_curr++;
-    if (buffer_curr >= con->buffer_tail)
-        elog(ERROR, "RecordBatch has less buffers than expected");
-    if (rb_field->null_count > 0)
-    {
-        rb_field->nullmap_offset = buffer_curr->offset;
-        rb_field->nullmap_length = buffer_curr->length;
-        if (rb_field->nullmap_length < BITMAPLEN(rb_field->nitems))
-            elog(ERROR, "nullmap length is smaller than expected");
-        if (rb_field->nullmap_offset != MAXALIGN(rb_field->nullmap_offset))
-            elog(ERROR, "nullmap is not aligned well");
-    }
-
-    /* setup values buffer */
-    if (least_values_length > 0)
-    {
-        buffer_curr = con->buffer_curr++;
-        if (buffer_curr >= con->buffer_tail)
-            elog(ERROR, "RecordBatch has less buffers than expected");
-        rb_field->values_offset = buffer_curr->offset;
-        rb_field->values_length = buffer_curr->length;
-        if (rb_field->values_length < least_values_length)
-            elog(ERROR, "values array is smaller than expected");
-        if (rb_field->values_offset != MAXALIGN(rb_field->values_offset))
-            elog(ERROR, "values array is not aligned well");
-    }
-
-    /* setup extra buffer */
-    if (has_extra_buffer)
-    {
-        Assert(least_values_length > 0);
-        buffer_curr = con->buffer_curr++;
-        if (buffer_curr >= con->buffer_tail)
-            elog(ERROR, "RecordBatch has less buffers than expected");
-        rb_field->extra_offset = buffer_curr->offset;
-        rb_field->extra_length = buffer_curr->length;
-        if (rb_field->extra_offset != MAXALIGN(rb_field->extra_offset))
-            elog(ERROR, "extra buffer is not aligned well");
-    }
-
-    /* child fields, if any */
-    if (field->_num_children > 0)
-    {
-        rb_field->children = palloc0(sizeof(RecordBatchFieldState) *
-                                     field->_num_children);
-        for (int j=0; j < field->_num_children; j++)
-        {
-            __buildRecordBatchFieldState(con,
-                                         &rb_field->children[j],
-                                         &field->children[j],
-                                         depth+1);
-        }
-    }
-    rb_field->num_children = field->_num_children;
-}
-
-static RecordBatchState *
-__buildRecordBatchStateOne(ArrowSchema *schema,
-                           ArrowFileState *af_state,
-                           int rb_index,
-                           ArrowBlock *block,
-                           ArrowRecordBatch *rbatch)
-{
-    setupRecordBatchContext con;
-    RecordBatchState *rb_state;
-    int         nfields = schema->_num_fields;
-
-    if (rbatch->compression)
-        elog(ERROR, "arrow_fdw: right now, compressed record-batches are not supported");
-
-    rb_state = palloc0(offsetof(RecordBatchState, fields[nfields]));
-    rb_state->af_state = af_state;
-    rb_state->rb_index = rb_index;
-    rb_state->rb_offset = block->offset + block->metaDataLength;
-    rb_state->rb_length = block->bodyLength;
-    rb_state->rb_nitems = rbatch->length;
-    rb_state->nfields = nfields;
-
-    memset(&con, 0, sizeof(setupRecordBatchContext));
-    con.buffer_curr = rbatch->buffers;
-    con.buffer_tail = rbatch->buffers + rbatch->_num_buffers;
-    con.fnode_curr = rbatch->nodes;
-    con.fnode_tail = rbatch->nodes + rbatch->_num_nodes;
-    for (int j=0; j < nfields; j++)
-    {
-        RecordBatchFieldState *rb_field = &rb_state->fields[j];
-        ArrowField *field = &schema->fields[j];
-
-        __buildRecordBatchFieldState(&con, rb_field, field, 0);
-    }
-    if (con.buffer_curr != con.buffer_tail ||
-        con.fnode_curr != con.fnode_tail)
-        elog(ERROR, "arrow_fdw: RecordBatch may be corrupted");
-    return rb_state;
-}
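/*
 * For reference: the number of buffers each Arrow field consumes in a
 * RecordBatch, in the same order that __buildRecordBatchFieldState() above
 * walks them (nullmap, then values, then the extra payload buffer).  A
 * minimal, self-contained sketch; main() and the demo column shapes are
 * hypothetical, not part of arrow_fdw:
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_BITMAPLEN(n)   (((n) + 7) / 8)

int main(void)
{
    long    nitems = 1000;

    /* fixed-width column (e.g. int32): optional nullmap + values */
    printf("int32: values >= %ld bytes (+%ld-byte nullmap if nulls)\n",
           (long)sizeof(int32_t) * nitems, DEMO_BITMAPLEN(nitems));
    /* varlena column (e.g. utf8): optional nullmap + (nitems+1) offsets + extra */
    printf("utf8 : offsets >= %ld bytes, plus the extra (payload) buffer\n",
           (long)sizeof(uint32_t) * (nitems + 1));
    return 0;
}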
-
-/*
- * readArrowFile
- */
-static bool
-readArrowFile(const char *filename, ArrowFileInfo *af_info, bool missing_ok)
-{
-    File    filp = PathNameOpenFile(filename, O_RDONLY | PG_BINARY);
-
-    if (filp < 0)
-    {
-        if (missing_ok && errno == ENOENT)
-            return false;
-        ereport(ERROR,
-                (errcode_for_file_access(),
-                 errmsg("could not open file \"%s\": %m", filename)));
-    }
-    readArrowFileDesc(FileGetRawDesc(filp), af_info);
-    FileClose(filp);
-    if (af_info->dictionaries != NULL)
-        elog(ERROR, "DictionaryBatch is not supported at '%s'", filename);
-    Assert(af_info->footer._num_dictionaries == 0);
-    return true;
-}
-
-static ArrowFileState *
-__buildArrowFileStateByFile(const char *filename, Bitmapset **p_stat_attrs)
-{
-    ArrowFileInfo af_info;
-    ArrowFileState *af_state;
-    arrowStatsBinary *arrow_bstats;
-
-    if (!readArrowFile(filename, &af_info, true))
-    {
-        elog(DEBUG2, "file '%s' is missing: %m", filename);
-        return NULL;
-    }
-    if (af_info.recordBatches == NULL)
-    {
-        elog(DEBUG2, "arrow file '%s' contains no RecordBatch", filename);
-        return NULL;
-    }
-    /* allocate ArrowFileState */
-    af_state = palloc0(sizeof(ArrowFileState));
-    af_state->filename = pstrdup(filename);
-    memcpy(&af_state->stat_buf, &af_info.stat_buf, sizeof(struct stat));
-
-    arrow_bstats = buildArrowStatsBinary(&af_info.footer, p_stat_attrs);
-    for (int i=0; i < af_info.footer._num_recordBatches; i++)
-    {
-        ArrowBlock *block = &af_info.footer.recordBatches[i];
-        ArrowRecordBatch *rbatch = &af_info.recordBatches[i].body.recordBatch;
-        RecordBatchState *rb_state;
-
-        rb_state = __buildRecordBatchStateOne(&af_info.footer.schema,
-                                              af_state, i, block, rbatch);
-        if (arrow_bstats)
-            applyArrowStatsBinary(rb_state, arrow_bstats);
-        af_state->rb_list = lappend(af_state->rb_list, rb_state);
-    }
-    releaseArrowStatsBinary(arrow_bstats);
-
-    return af_state;
-}
-
-
-static arrowMetadataFieldCache *
-__buildArrowMetadataFieldCache(RecordBatchFieldState *rb_field)
-{
-    arrowMetadataFieldCache *fcache;
-
-    fcache = __allocMetadataFieldCache();
-    if (!fcache)
-        return NULL;
-    fcache->atttypid = rb_field->atttypid;
-    fcache->atttypmod = rb_field->atttypmod;
-    memcpy(&fcache->attopts, &rb_field->attopts, sizeof(ArrowTypeOptions));
-    fcache->nitems = rb_field->nitems;
-    fcache->null_count = rb_field->null_count;
-    fcache->nullmap_offset = rb_field->nullmap_offset;
-    fcache->nullmap_length = rb_field->nullmap_length;
-    fcache->values_offset = rb_field->values_offset;
-    fcache->values_length = rb_field->values_length;
-    fcache->extra_offset = rb_field->extra_offset;
-    fcache->extra_length = rb_field->extra_length;
-    memcpy(&fcache->stat_datum,
-           &rb_field->stat_datum, sizeof(MinMaxStatDatum));
-    fcache->num_children = rb_field->num_children;
-    dlist_init(&fcache->children);
-    for (int j=0; j < rb_field->num_children; j++)
-    {
-        arrowMetadataFieldCache *__fcache;
-
-        __fcache = __buildArrowMetadataFieldCache(&rb_field->children[j]);
-        if (!__fcache)
-        {
-            __releaseMetadataFieldCache(fcache);
-            return NULL;
-        }
-        dlist_push_tail(&fcache->children, &__fcache->chain);
-    }
-    return fcache;
-}
-
-/*
- * __buildArrowMetadataCacheNoLock
- *
- * it builds arrowMetadataCache entries according to the supplied
- * ArrowFileState
- */
-static void
-__buildArrowMetadataCacheNoLock(ArrowFileState *af_state)
-{
-    arrowMetadataCache *mcache_head = NULL;
-    arrowMetadataCache *mcache_prev = NULL;
-    arrowMetadataCache *mcache;
-    uint32_t    hindex;
-    ListCell   *lc;
-
-    foreach (lc, af_state->rb_list)
-    {
-        RecordBatchState *rb_state = lfirst(lc);
-
-        mcache = __allocMetadataCache();
-        if (!mcache)
-        {
-            __releaseMetadataCache(mcache_head);
-            return;
-        }
-        memcpy(&mcache->stat_buf,
-               &af_state->stat_buf, sizeof(struct stat));
-        mcache->rb_index = rb_state->rb_index;
-        mcache->rb_offset = rb_state->rb_offset;
-        mcache->rb_length = rb_state->rb_length;
-        mcache->rb_nitems = rb_state->rb_nitems;
-        mcache->nfields = rb_state->nfields;
-        dlist_init(&mcache->fields);
-        if (!mcache_head)
-            mcache_head = mcache;
-        else
-            mcache_prev->next = mcache;
-
-        for (int j=0; j < rb_state->nfields; j++)
-        {
-            arrowMetadataFieldCache *fcache;
-
-            fcache = __buildArrowMetadataFieldCache(&rb_state->fields[j]);
-            if (!fcache)
-            {
-                __releaseMetadataCache(mcache_head);
-                return;
-            }
-            dlist_push_tail(&mcache->fields, &fcache->chain);
-        }
-        mcache_prev = mcache;
-    }
-    /* chain to the list */
-    hindex = arrowMetadataHashIndex(&af_state->stat_buf);
-    dlist_push_tail(&arrow_metadata_cache->hash_slots[hindex],
-                    &mcache_head->chain);
-    SpinLockAcquire(&arrow_metadata_cache->lru_lock);
-    gettimeofday(&mcache_head->lru_tv, NULL);
-    dlist_push_head(&arrow_metadata_cache->lru_list, &mcache_head->lru_chain);
-    SpinLockRelease(&arrow_metadata_cache->lru_lock);
-}
-
-static ArrowFileState *
-BuildArrowFileState(Relation frel, const char *filename, Bitmapset **p_stat_attrs)
-{
-    arrowMetadataCache *mcache;
-    ArrowFileState *af_state;
-    RecordBatchState *rb_state;
-    struct stat stat_buf;
-    TupleDesc   tupdesc;
-
-    if (stat(filename, &stat_buf) != 0)
-        elog(ERROR, "failed on stat('%s'): %m", filename);
-    LWLockAcquire(&arrow_metadata_cache->mutex, LW_SHARED);
-    mcache = lookupArrowMetadataCache(&stat_buf, false);
-    if (mcache)
-    {
-        /* found a valid metadata-cache */
-        af_state = __buildArrowFileStateByCache(filename, mcache,
-                                                p_stat_attrs);
-    }
-    else
-    {
-        LWLockRelease(&arrow_metadata_cache->mutex);
-
-        /* there is no valid metadata-cache, so build it from the raw file */
-        af_state = __buildArrowFileStateByFile(filename, p_stat_attrs);
-        if (!af_state)
-            return NULL;    /* file not found? */
-
-        LWLockAcquire(&arrow_metadata_cache->mutex, LW_EXCLUSIVE);
-        mcache = lookupArrowMetadataCache(&af_state->stat_buf, true);
-        if (!mcache)
-            __buildArrowMetadataCacheNoLock(af_state);
-    }
-    LWLockRelease(&arrow_metadata_cache->mutex);
-
-    /* compatibility checks */
-    rb_state = linitial(af_state->rb_list);
-    tupdesc = RelationGetDescr(frel);
-    if (tupdesc->natts != rb_state->nfields)
-        elog(ERROR, "arrow_fdw: foreign table '%s' is not compatible with '%s'",
-             RelationGetRelationName(frel), filename);
-    for (int j=0; j < tupdesc->natts; j++)
-    {
-        Form_pg_attribute attr = TupleDescAttr(tupdesc, j);
-        RecordBatchFieldState *rb_field = &rb_state->fields[j];
-
-        if (attr->atttypid != rb_field->atttypid)
-            elog(ERROR, "arrow_fdw: foreign table '%s' column '%s' (%s) is not compatible with the arrow field (%s) in '%s'",
-                 RelationGetRelationName(frel),
-                 NameStr(attr->attname),
-                 format_type_be(attr->atttypid),
-                 format_type_be(rb_field->atttypid),
-                 filename);
-    }
-    return af_state;
-}
-
-/*
- * baseRelIsArrowFdw
- */
-bool
-baseRelIsArrowFdw(RelOptInfo *baserel)
-{
-    if ((baserel->reloptkind == RELOPT_BASEREL ||
-         baserel->reloptkind == RELOPT_OTHER_MEMBER_REL) &&
-        baserel->rtekind == RTE_RELATION &&
-        OidIsValid(baserel->serverid) &&
-        baserel->fdwroutine &&
-        memcmp(baserel->fdwroutine,
-               &pgstrom_arrow_fdw_routine,
-               sizeof(FdwRoutine)) == 0)
-        return true;
-
-    return false;
-}
-
-/*
- * RelationIsArrowFdw
- */
-bool
-RelationIsArrowFdw(Relation frel)
-{
-    if (RelationGetForm(frel)->relkind == RELKIND_FOREIGN_TABLE)
-    {
-        FdwRoutine *routine = GetFdwRoutineForRelation(frel, false);
-
-        if (memcmp(routine, &pgstrom_arrow_fdw_routine, sizeof(FdwRoutine)) == 0)
-            return true;
-    }
-    return false;
-}
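/*
 * BuildArrowFileState() above takes the metadata-cache mutex in shared mode
 * first, and retries in exclusive mode only when the cache misses.  A
 * minimal, self-contained sketch of the same double-checked pattern with
 * POSIX rwlocks; the demo cache variables and lookup_or_build() are
 * hypothetical stand-ins for the LWLock-based logic above:
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t demo_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool demo_cached = false;
static int  demo_value;

static int
lookup_or_build(int key)
{
    int     value;

    pthread_rwlock_rdlock(&demo_lock);      /* shared lock: fast path */
    if (demo_cached)
    {
        value = demo_value;
        pthread_rwlock_unlock(&demo_lock);
        return value;
    }
    pthread_rwlock_unlock(&demo_lock);

    value = key * 2;                        /* "build from the raw file" */

    pthread_rwlock_wrlock(&demo_lock);      /* exclusive: re-check, then insert */
    if (!demo_cached)
    {
        demo_value = value;
        demo_cached = true;
    }
    pthread_rwlock_unlock(&demo_lock);
    return value;
}

int main(void)
{
    printf("%d\n", lookup_or_build(21));    /* builds, caches, prints 42 */
    return 0;
}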
-
-/*
- * GetOptimalGpusForArrowFdw
- */
-const Bitmapset *
-GetOptimalGpusForArrowFdw(PlannerInfo *root, RelOptInfo *baserel)
-{
-    List       *priv_list = (List *)baserel->fdw_private;
-    Bitmapset  *optimal_gpus = NULL;
-
-    if (baseRelIsArrowFdw(baserel) &&
-        IsA(priv_list, List) && list_length(priv_list) == 2)
-    {
-        List       *af_list = linitial(priv_list);
-        ListCell   *lc;
-
-        foreach (lc, af_list)
-        {
-            ArrowFileState *af_state = lfirst(lc);
-            const Bitmapset *__optimal_gpus;
-
-            __optimal_gpus = GetOptimalGpuForFile(af_state->filename);
-            if (lc == list_head(af_list))
-                optimal_gpus = bms_copy(__optimal_gpus);
-            else
-                optimal_gpus = bms_intersect(optimal_gpus, __optimal_gpus);
-        }
-    }
-    return optimal_gpus;
-}
-
-/*
- * GetOptimalDpuForArrowFdw
- */
-const DpuStorageEntry *
-GetOptimalDpuForArrowFdw(PlannerInfo *root, RelOptInfo *baserel)
-{
-    const DpuStorageEntry *ds_entry = NULL;
-    List       *priv_list = (List *)baserel->fdw_private;
-
-    if (baseRelIsArrowFdw(baserel) &&
-        IsA(priv_list, List) && list_length(priv_list) == 2)
-    {
-        List       *af_list = linitial(priv_list);
-        ListCell   *lc;
-
-        foreach (lc, af_list)
-        {
-            ArrowFileState *af_state = lfirst(lc);
-            const DpuStorageEntry *__ds_entry;
-
-            __ds_entry = GetOptimalDpuForFile(af_state->filename, NULL);
-            if (lc == list_head(af_list))
-                ds_entry = __ds_entry;
-            else if (ds_entry && ds_entry != __ds_entry)
-                ds_entry = NULL;
-        }
-    }
-    return ds_entry;
-}
-
-/*
- * arrowFdwExtractFilesList
- */
-static List *
-arrowFdwExtractFilesList(List *options_list,
-                         int *p_parallel_nworkers)
-{
-    ListCell   *lc;
-    List       *filesList = NIL;
-    char       *dir_path = NULL;
-    char       *dir_suffix = NULL;
-    int         parallel_nworkers = -1;
-
-    foreach (lc, options_list)
-    {
-        DefElem *defel = lfirst(lc);
-
-        Assert(IsA(defel->arg, String));
-        if (strcmp(defel->defname, "file") == 0)
-        {
-            char   *temp = strVal(defel->arg);
-
-            if (access(temp, R_OK) != 0)
-                elog(ERROR, "arrow_fdw: unable to access '%s': %m", temp);
-            filesList = lappend(filesList, makeString(pstrdup(temp)));
-        }
-        else if (strcmp(defel->defname, "files") == 0)
-        {
-            char   *temp = pstrdup(strVal(defel->arg));
-            char   *saveptr;
-            char   *tok;
-
-            for (tok = strtok_r(temp, ",", &saveptr);
-                 tok != NULL;
-                 tok = strtok_r(NULL, ",", &saveptr))
-            {
-                tok = __trim(tok);
-
-                if (*tok != '/')
-                    elog(ERROR, "arrow_fdw: file '%s' must be absolute path", tok);
-                if (access(tok, R_OK) != 0)
-                    elog(ERROR, "arrow_fdw: unable to access '%s': %m", tok);
-                filesList = lappend(filesList, makeString(pstrdup(tok)));
-            }
-            pfree(temp);
-        }
-        else if (strcmp(defel->defname, "dir") == 0)
-        {
-            dir_path = strVal(defel->arg);
-            if (*dir_path != '/')
-                elog(ERROR, "arrow_fdw: dir '%s' must be absolute path", dir_path);
-        }
-        else if (strcmp(defel->defname, "suffix") == 0)
-        {
-            dir_suffix = strVal(defel->arg);
-        }
-        else if (strcmp(defel->defname, "parallel_workers") == 0)
-        {
-            if (parallel_nworkers >= 0)
-                elog(ERROR, "'parallel_workers' appeared twice");
-            parallel_nworkers = atoi(strVal(defel->arg));
-        }
-        else
-            elog(ERROR, "arrow: unknown option (%s)", defel->defname);
-    }
-    if (dir_suffix && !dir_path)
-        elog(ERROR, "arrow: cannot use 'suffix' option without 'dir'");
-
-    if (dir_path)
-    {
-        struct dirent *dentry;
-        DIR    *dir;
-        char   *temp;
-
-        dir = AllocateDir(dir_path);
-        while ((dentry = ReadDir(dir, dir_path)) != NULL)
-        {
-            if (strcmp(dentry->d_name, ".") == 0 ||
-                strcmp(dentry->d_name, "..") == 0)
-                continue;
-            if (dir_suffix)
-            {
-                char   *pos = strrchr(dentry->d_name, '.');
-
-                if (!pos || strcmp(pos+1, dir_suffix) != 0)
-                    continue;
-            }
-            temp = psprintf("%s/%s", dir_path, dentry->d_name);
-            if (access(temp, R_OK) != 0)
-            {
-                elog(DEBUG1, "arrow_fdw: unable to read '%s', so skipped", temp);
-                continue;
-            }
-            filesList = lappend(filesList, makeString(temp));
-        }
-        FreeDir(dir);
-    }
-
-    if (p_parallel_nworkers)
-        *p_parallel_nworkers = parallel_nworkers;
-    return filesList;
-}
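/*
 * For reference, the options parsed by arrowFdwExtractFilesList() above are
 * the ones given in foreign-table DDL.  A hypothetical example (the server
 * name, column list, paths and worker count are illustrative only):
 *
 *   CREATE FOREIGN TABLE f_lineorder (...)
 *     SERVER arrow_srv
 *     OPTIONS (dir '/opt/arrow/lineorder', suffix 'arrow',
 *              parallel_workers '4');
 *
 * 'file' points to a single Arrow file, 'files' to a comma-separated list
 * of absolute paths, and 'dir' (optionally filtered by 'suffix') to a whole
 * directory.
 */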
-
-/* ----------------------------------------------------------------
- *
- * arrowFdwLoadRecordBatch() and related routines
- *
- * it sets up KDS (Arrow format) with an I/O vector, according to the
- * RecordBatchState
- *
- * ----------------------------------------------------------------
- */
-
-/*
- * arrowFdwSetupIOvector
- */
-typedef struct
-{
-    off_t       rb_offset;
-    off_t       f_offset;
-    off_t       m_offset;
-    size_t      kds_head_sz;
-    int32_t     depth;
-    int32_t     io_index;
-    strom_io_chunk ioc[FLEXIBLE_ARRAY_MEMBER];
-} arrowFdwSetupIOContext;
-
-static void
-__setupIOvectorField(arrowFdwSetupIOContext *con,
-                     off_t chunk_offset,
-                     size_t chunk_length,
-                     uint32_t *p_cmeta_offset,
-                     uint32_t *p_cmeta_length)
-{
-    off_t   f_pos = con->rb_offset + chunk_offset;
-    size_t  __length = MAXALIGN(chunk_length);
-
-    Assert(con->m_offset == MAXALIGN(con->m_offset));
-
-    if (f_pos == con->f_offset)
-    {
-        /* good, buffer is fully continuous */
-        *p_cmeta_offset = __kds_packed(con->kds_head_sz +
-                                       con->m_offset);
-        *p_cmeta_length = __kds_packed(__length);
-
-        con->m_offset += __length;
-        con->f_offset += __length;
-    }
-    else if (f_pos > con->f_offset &&
-             (f_pos & ~PAGE_MASK) == (con->f_offset & ~PAGE_MASK) &&
-             (f_pos - con->f_offset) == MAXALIGN(f_pos - con->f_offset))
-    {
-        /*
-         * we can also consolidate the i/o of two chunks, if the file
-         * position of the next chunk (f_pos) and the current file tail
-         * position (con->f_offset) locate within the same file page,
-         * and if the gap bytes on the file do not break the alignment.
-         */
-        size_t  __gap = (f_pos - con->f_offset);
-
-        /* put gap bytes */
-        Assert(__gap < PAGE_SIZE);
-        con->m_offset += __gap;
-        con->f_offset += __gap;
-
-        *p_cmeta_offset = __kds_packed(con->kds_head_sz +
-                                       con->m_offset);
-        *p_cmeta_length = __kds_packed(__length);
-
-        con->m_offset += __length;
-        con->f_offset += __length;
-    }
-    else
-    {
-        /*
-         * Elsewhere, we have no chance to consolidate this chunk with
-         * the previous i/o-chunk. So, make a new i/o-chunk.
-         */
-        off_t   f_base = TYPEALIGN_DOWN(PAGE_SIZE, f_pos);
-        off_t   gap = f_pos - f_base;
-        strom_io_chunk *ioc;
-
-        if (con->io_index < 0)
-            con->io_index = 0;  /* no previous i/o chunks */
-        else
-        {
-            off_t   f_tail = PAGE_ALIGN(con->f_offset);
-
-            ioc = &con->ioc[con->io_index++];
-            ioc->nr_pages = f_tail / PAGE_SIZE - ioc->fchunk_id;
-            con->m_offset += (f_tail - con->f_offset);  /* margin for alignment */
-        }
-        Assert(con->m_offset == PAGE_ALIGN(con->m_offset));
-        ioc = &con->ioc[con->io_index];
-        ioc->m_offset = con->m_offset;
-        ioc->fchunk_id = f_base / PAGE_SIZE;
-
-        con->m_offset += gap;
-        *p_cmeta_offset = __kds_packed(con->kds_head_sz +
-                                       con->m_offset);
-        *p_cmeta_length = __kds_packed(__length);
-        con->m_offset += __length;
-        con->f_offset = f_pos + __length;
-    }
-}
-
-static void
-arrowFdwSetupIOvectorField(arrowFdwSetupIOContext *con,
-                           RecordBatchFieldState *rb_field,
-                           kern_data_store *kds,
-                           kern_colmeta *cmeta)
-{
-    //int        index = cmeta - kds->colmeta;
-
-    if (rb_field->nullmap_length > 0)
-    {
-        Assert(rb_field->null_count > 0);
-        __setupIOvectorField(con,
-                             rb_field->nullmap_offset,
-                             rb_field->nullmap_length,
-                             &cmeta->nullmap_offset,
-                             &cmeta->nullmap_length);
-        //elog(INFO, "D%d att[%d] nullmap=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, rb_field->nullmap_offset, rb_field->nullmap_length, con->m_offset, con->f_offset);
-    }
-    if (rb_field->values_length > 0)
-    {
-        __setupIOvectorField(con,
-                             rb_field->values_offset,
-                             rb_field->values_length,
-                             &cmeta->values_offset,
-                             &cmeta->values_length);
-        //elog(INFO, "D%d att[%d] values=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, rb_field->values_offset, rb_field->values_length, con->m_offset, con->f_offset);
-    }
-    if (rb_field->extra_length > 0)
-    {
-        __setupIOvectorField(con,
-                             rb_field->extra_offset,
-                             rb_field->extra_length,
-                             &cmeta->extra_offset,
-                             &cmeta->extra_length);
-        //elog(INFO, "D%d att[%d] extra=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, rb_field->extra_offset, rb_field->extra_length, con->m_offset, con->f_offset);
-    }
-
-    /* nested sub-fields if composite types */
-    if (cmeta->atttypkind == TYPE_KIND__ARRAY ||
-        cmeta->atttypkind == TYPE_KIND__COMPOSITE)
-    {
-        kern_colmeta *subattr;
-        int     j;
-
-        Assert(rb_field->num_children == cmeta->num_subattrs);
-        con->depth++;
-        for (j=0, subattr = &kds->colmeta[cmeta->idx_subattrs];
-             j < cmeta->num_subattrs;
-             j++, subattr++)
-        {
-            RecordBatchFieldState *child = &rb_field->children[j];
-
-            arrowFdwSetupIOvectorField(con, child, kds, subattr);
-        }
-        con->depth--;
-    }
-}
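/*
 * The consolidation test in __setupIOvectorField() above merges a chunk into
 * the currently open I/O chunk when it is exactly contiguous, or when the gap
 * stays within the same file page and keeps MAXALIGN alignment.  A minimal,
 * self-contained sketch of just that decision (the demo macros, offsets and
 * can_consolidate() are hypothetical, with a 4KB page and 8-byte alignment):
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE   4096L
#define DEMO_PAGE_MASK   (DEMO_PAGE_SIZE - 1)
#define DEMO_MAXALIGN(x) (((x) + 7L) & ~7L)

/* may the chunk at f_pos join the I/O that currently ends at f_offset? */
static bool
can_consolidate(long f_offset, long f_pos)
{
    if (f_pos == f_offset)
        return true;                /* exactly contiguous */
    return f_pos > f_offset &&
           (f_pos & ~DEMO_PAGE_MASK) == (f_offset & ~DEMO_PAGE_MASK) &&
           (f_pos - f_offset) == DEMO_MAXALIGN(f_pos - f_offset);
}

int main(void)
{
    printf("%d\n", can_consolidate(8192, 8192));    /* 1: contiguous         */
    printf("%d\n", can_consolidate(8192, 8256));    /* 1: same page, gap 64  */
    printf("%d\n", can_consolidate(8192, 16384));   /* 0: different page     */
    return 0;
}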
-
-static strom_io_vector *
-arrowFdwSetupIOvector(RecordBatchState *rb_state,
-                      Bitmapset *referenced,
-                      kern_data_store *kds)
-{
-    arrowFdwSetupIOContext *con;
-    strom_io_vector *iovec;
-
-    Assert(kds->ncols <= kds->nr_colmeta &&
-           kds->ncols == rb_state->nfields);
-    con = alloca(offsetof(arrowFdwSetupIOContext,
-                          ioc[3 * kds->nr_colmeta]));
-    con->rb_offset = rb_state->rb_offset;
-    con->f_offset = ~0UL;   /* invalid offset */
-    con->m_offset = 0;
-    con->kds_head_sz = KDS_HEAD_LENGTH(kds);
-    con->depth = 0;
-    con->io_index = -1;     /* invalid index */
-    for (int j=0; j < kds->ncols; j++)
-    {
-        RecordBatchFieldState *rb_field = &rb_state->fields[j];
-        kern_colmeta *cmeta = &kds->colmeta[j];
-        int     attidx = j + 1 - FirstLowInvalidHeapAttributeNumber;
-
-        if (bms_is_member(attidx, referenced) ||
-            bms_is_member(-FirstLowInvalidHeapAttributeNumber, referenced))
-            arrowFdwSetupIOvectorField(con, rb_field, kds, cmeta);
-        else
-            cmeta->atttypkind = TYPE_KIND__NULL;    /* unreferenced */
-    }
-    if (con->io_index >= 0)
-    {
-        /* close the last I/O chunks */
-        strom_io_chunk *ioc = &con->ioc[con->io_index++];
-
-        ioc->nr_pages = (TYPEALIGN(PAGE_SIZE, con->f_offset) / PAGE_SIZE
-                         - ioc->fchunk_id);
-        con->m_offset = ioc->m_offset + PAGE_SIZE * ioc->nr_pages;
-    }
-    kds->length = con->m_offset;
-
-    iovec = palloc0(offsetof(strom_io_vector, ioc[con->io_index]));
-    iovec->nr_chunks = con->io_index;
-    if (iovec->nr_chunks > 0)
-        memcpy(iovec->ioc, con->ioc, sizeof(strom_io_chunk) * con->io_index);
-#if 0
-    /* for debug - dump the i/o vector */
-    {
-        elog(INFO, "nchunks = %d", iovec->nr_chunks);
-        for (int j=0; j < iovec->nr_chunks; j++)
-        {
-            strom_io_chunk *ioc = &iovec->ioc[j];
-
-            elog(INFO, "io[%d] [ m_offset=%lu, f_read=%lu...%lu, nr_pages=%u}",
-                 j,
-                 ioc->m_offset,
-                 ioc->fchunk_id * PAGE_SIZE,
-                 (ioc->fchunk_id + ioc->nr_pages) * PAGE_SIZE,
-                 ioc->nr_pages);
-        }
-
-        elog(INFO, "kds {length=%zu nitems=%u typeid=%u typmod=%u table_oid=%u}",
-             kds->length, kds->nitems,
-             kds->tdtypeid, kds->tdtypmod, kds->table_oid);
-        for (int j=0; j < kds->nr_colmeta; j++)
-        {
-            kern_colmeta *cmeta = &kds->colmeta[j];
-
-            elog(INFO, "%ccol[%d] nullmap=%lu,%lu values=%lu,%lu extra=%lu,%lu",
-                 j < kds->ncols ? ' ' : '*', j,
-                 __kds_unpack(cmeta->nullmap_offset),
-                 __kds_unpack(cmeta->nullmap_length),
-                 __kds_unpack(cmeta->values_offset),
-                 __kds_unpack(cmeta->values_length),
-                 __kds_unpack(cmeta->extra_offset),
-                 __kds_unpack(cmeta->extra_length));
-        }
-    }
-#endif
-    return iovec;
-}
-
-/*
- * arrowFdwLoadRecordBatch
- */
-static void
-__arrowKdsAssignAttrOptions(kern_data_store *kds,
-                            kern_colmeta *cmeta,
-                            RecordBatchFieldState *rb_field)
-{
-    memcpy(&cmeta->attopts,
-           &rb_field->attopts, sizeof(ArrowTypeOptions));
-    if (cmeta->atttypkind == TYPE_KIND__ARRAY)
-    {
-        Assert(cmeta->idx_subattrs >= kds->ncols &&
-               cmeta->num_subattrs == 1 &&
-               cmeta->idx_subattrs + cmeta->num_subattrs <= kds->nr_colmeta &&
-               rb_field->num_children == 1);
-        __arrowKdsAssignAttrOptions(kds,
-                                    &kds->colmeta[cmeta->idx_subattrs],
-                                    &rb_field->children[0]);
-    }
-    else if (cmeta->atttypkind == TYPE_KIND__COMPOSITE)
-    {
-        Assert(cmeta->idx_subattrs >= kds->ncols &&
-               cmeta->num_subattrs == rb_field->num_children &&
-               cmeta->idx_subattrs + cmeta->num_subattrs <= kds->nr_colmeta);
-        for (int j=0; j < cmeta->num_subattrs; j++)
-        {
-            __arrowKdsAssignAttrOptions(kds,
-                                        &kds->colmeta[cmeta->idx_subattrs + j],
-                                        &rb_field->children[j]);
-        }
-    }
-}
-
-static strom_io_vector *
-arrowFdwLoadRecordBatch(Relation relation,
-                        Bitmapset *referenced,
-                        RecordBatchState *rb_state,
-                        StringInfo chunk_buffer)
-{
-    TupleDesc   tupdesc = RelationGetDescr(relation);
-    size_t      head_sz = estimate_kern_data_store(tupdesc);
-    kern_data_store *kds;
-
-    /* setup KDS and I/O-vector */
-    enlargeStringInfo(chunk_buffer, head_sz);
-    kds = (kern_data_store *)(chunk_buffer->data +
-                              chunk_buffer->len);
-    setup_kern_data_store(kds, tupdesc, 0, KDS_FORMAT_ARROW);
-    kds->nitems = rb_state->rb_nitems;
-    kds->table_oid = RelationGetRelid(relation);
-    Assert(head_sz == KDS_HEAD_LENGTH(kds));
-    Assert(kds->ncols == rb_state->nfields);
-    for (int j=0; j < kds->ncols; j++)
-        __arrowKdsAssignAttrOptions(kds,
-                                    &kds->colmeta[j],
-                                    &rb_state->fields[j]);
-    chunk_buffer->len += head_sz;
-
-    return arrowFdwSetupIOvector(rb_state, referenced, kds);
-}
-
-static kern_data_store *
-arrowFdwFillupRecordBatch(Relation relation,
-                          Bitmapset *referenced,
-                          RecordBatchState *rb_state,
-                          StringInfo chunk_buffer)
-{
-    ArrowFileState *af_state = rb_state->af_state;
-    kern_data_store *kds;
-    strom_io_vector *iovec;
-    char       *base;
-    File        filp;
-
-    resetStringInfo(chunk_buffer);
-    iovec = arrowFdwLoadRecordBatch(relation,
-                                    referenced,
-                                    rb_state,
-                                    chunk_buffer);
-    kds = (kern_data_store *)chunk_buffer->data;
-    enlargeStringInfo(chunk_buffer, kds->length);
-    kds = (kern_data_store *)chunk_buffer->data;
-    filp = PathNameOpenFile(af_state->filename, O_RDONLY | PG_BINARY);
-    base = (char *)kds + KDS_HEAD_LENGTH(kds);
-    for (int i=0; i < iovec->nr_chunks; i++)
-    {
-        strom_io_chunk *ioc = &iovec->ioc[i];
-        char   *dest = base + ioc->m_offset;
-        off_t   f_pos = (size_t)ioc->fchunk_id * PAGE_SIZE;
-        size_t  len = (size_t)ioc->nr_pages * PAGE_SIZE;
-        ssize_t sz;
-
-        while (len > 0)
-        {
-            CHECK_FOR_INTERRUPTS();
-
-            sz = FileRead(filp, dest, len, f_pos,
-                          WAIT_EVENT_REORDER_BUFFER_READ);
-            if (sz > 0)
-            {
-                Assert(sz <= len);
-                dest += sz;
-                f_pos += sz;
-                len -= sz;
-            }
-            else if (sz == 0)
-            {
-                /*
-                 * Due to the page_sz alignment, we may try to read the file
-                 * over its tail. So, pread(2) may tell us it is unable to
-                 * read any more. This expected scenario happens only when
-                 * the remaining length is less than PAGE_SIZE.
-                 */
-                memset(dest, 0, len);
-                break;
-            }
-            else if (errno != EINTR)
-            {
-                elog(ERROR, "failed on FileRead('%s', pos=%lu, len=%lu): %m",
-                     af_state->filename, f_pos, len);
-            }
-        }
-    }
-    chunk_buffer->len += kds->length;
-    FileClose(filp);
-
-    pfree(iovec);
-
-    return kds;
-}
-
-/*
- * ArrowGetForeignRelSize
- */
-static size_t
-__recordBatchFieldLength(RecordBatchFieldState *rb_field)
-{
-    size_t  len = 0;
-
-    if (rb_field->null_count > 0)
-        len += rb_field->nullmap_length;
-    len += (rb_field->values_length +
-            rb_field->extra_length);
-    for (int j=0; j < rb_field->num_children; j++)
-        len += __recordBatchFieldLength(&rb_field->children[j]);
-    return len;
-}
-
-static void
-ArrowGetForeignRelSize(PlannerInfo *root,
-                       RelOptInfo *baserel,
-                       Oid foreigntableid)
-{
-    ForeignTable *ft = GetForeignTable(foreigntableid);
-    Relation    frel = table_open(foreigntableid, NoLock);
-    List       *filesList;
-    List       *results = NIL;
-    Bitmapset  *referenced = NULL;
-    ListCell   *lc1, *lc2;
-    size_t      totalLen = 0;
-    double      ntuples = 0.0;
-    int         parallel_nworkers;
-
-    /* columns to be referenced */
-    foreach (lc1, baserel->baserestrictinfo)
-    {
-        RestrictInfo *rinfo = lfirst(lc1);
-
-        pull_varattnos((Node *)rinfo->clause, baserel->relid, &referenced);
-    }
-    referenced = pickup_outer_referenced(root, baserel, referenced);
-
-    /* read the arrow-file metadata */
-    filesList = arrowFdwExtractFilesList(ft->options, &parallel_nworkers);
-    foreach (lc1, filesList)
-    {
-        ArrowFileState *af_state;
-        char   *fname = strVal(lfirst(lc1));
-
-        af_state = BuildArrowFileState(frel, fname, NULL);
-        if (!af_state)
-            continue;
-
-        /*
-         * Size calculation based on the record-batch metadata
-         */
-        foreach (lc2, af_state->rb_list)
-        {
-            RecordBatchState *rb_state = lfirst(lc2);
-
-            /* whole-row reference? */
-
-static void
-ArrowGetForeignRelSize(PlannerInfo *root,
-                       RelOptInfo *baserel,
-                       Oid foreigntableid)
-{
-    ForeignTable *ft = GetForeignTable(foreigntableid);
-    Relation    frel = table_open(foreigntableid, NoLock);
-    List       *filesList;
-    List       *results = NIL;
-    Bitmapset  *referenced = NULL;
-    ListCell   *lc1, *lc2;
-    size_t      totalLen = 0;
-    double      ntuples = 0.0;
-    int         parallel_nworkers;
-
-    /* columns to be referenced */
-    foreach (lc1, baserel->baserestrictinfo)
-    {
-        RestrictInfo *rinfo = lfirst(lc1);
-
-        pull_varattnos((Node *)rinfo->clause, baserel->relid, &referenced);
-    }
-    referenced = pickup_outer_referenced(root, baserel, referenced);
-
-    /* read the arrow-file metadata */
-    filesList = arrowFdwExtractFilesList(ft->options, &parallel_nworkers);
-    foreach (lc1, filesList)
-    {
-        ArrowFileState *af_state;
-        char       *fname = strVal(lfirst(lc1));
-
-        af_state = BuildArrowFileState(frel, fname, NULL);
-        if (!af_state)
-            continue;
-
-        /*
-         * Size calculation based on the record-batch metadata
-         */
-        foreach (lc2, af_state->rb_list)
-        {
-            RecordBatchState *rb_state = lfirst(lc2);
-
-            /* whole-row reference? */
-            if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, referenced))
-            {
-                totalLen += rb_state->rb_length;
-            }
-            else
-            {
-                int     j, k;
-
-                for (k = bms_next_member(referenced, -1);
-                     k >= 0;
-                     k = bms_next_member(referenced, k))
-                {
-                    j = k + FirstLowInvalidHeapAttributeNumber;
-                    if (j <= 0 || j > rb_state->nfields)
-                        continue;
-                    totalLen += __recordBatchFieldLength(&rb_state->fields[j-1]);
-                }
-            }
-            ntuples += rb_state->rb_nitems;
-        }
-        results = lappend(results, af_state);
-    }
-    table_close(frel, NoLock);
-
-    /* setup baserel */
-    baserel->rel_parallel_workers = parallel_nworkers;
-    baserel->fdw_private = list_make2(results, referenced);
-    baserel->pages = totalLen / BLCKSZ;
-    baserel->tuples = ntuples;
-    baserel->rows = ntuples *
-        clauselist_selectivity(root,
-                               baserel->baserestrictinfo,
-                               0,
-                               JOIN_INNER,
-                               NULL);
-}
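The attribute-number arithmetic above is easy to misread. A Bitmapset only holds non-negative members, so attribute numbers (which can be negative for system columns) are stored shifted by -FirstLowInvalidHeapAttributeNumber. A minimal sketch of the convention as it is used throughout this file; the function name is hypothetical:

#include "postgres.h"
#include "access/sysattr.h"
#include "nodes/bitmapset.h"

/* count the referenced user columns of a relation with 'nfields' columns */
static int
count_referenced_columns(const Bitmapset *referenced, int nfields)
{
    int     count = 0;
    int     k;

    for (k = bms_next_member(referenced, -1);
         k >= 0;
         k = bms_next_member(referenced, k))
    {
        int     attnum = k + FirstLowInvalidHeapAttributeNumber;

        if (attnum <= 0 || attnum > nfields)
            continue;       /* skip system columns and whole-row entries */
        count++;            /* fields[attnum - 1] is this column's field */
    }
    return count;
}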
-
-/*
- * cost_arrow_fdw_seqscan
- */
-static void
-cost_arrow_fdw_seqscan(Path *path,
-                       PlannerInfo *root,
-                       RelOptInfo *baserel,
-                       ParamPathInfo *param_info,
-                       int num_workers)
-{
-    Cost        startup_cost = 0.0;
-    Cost        disk_run_cost = 0.0;
-    Cost        cpu_run_cost = 0.0;
-    QualCost    qcost;
-    double      nrows;
-    double      spc_seq_page_cost;
-
-    if (param_info)
-        nrows = param_info->ppi_rows;
-    else
-        nrows = baserel->rows;
-
-    /* arrow_fdw.enabled */
-    if (!arrow_fdw_enabled)
-        startup_cost += disable_cost;
-
-    /*
-     * Storage costs
-     *
-     * XXX - reading fewer columns should take less disk cost thanks to
-     * the columnar format. Right now, we don't discount the cost of the
-     * pages that are not read.
-     */
-    get_tablespace_page_costs(baserel->reltablespace,
-                              NULL,
-                              &spc_seq_page_cost);
-    disk_run_cost = spc_seq_page_cost * baserel->pages;
-
-    /* CPU costs */
-    if (param_info)
-    {
-        cost_qual_eval(&qcost, param_info->ppi_clauses, root);
-        qcost.startup += baserel->baserestrictcost.startup;
-        qcost.per_tuple += baserel->baserestrictcost.per_tuple;
-    }
-    else
-        qcost = baserel->baserestrictcost;
-    startup_cost += qcost.startup;
-    cpu_run_cost = (cpu_tuple_cost + qcost.per_tuple) * baserel->tuples;
-
-    /* tlist evaluation costs */
-    startup_cost += path->pathtarget->cost.startup;
-    cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
-
-    /* adjust cost for CPU parallelism */
-    if (num_workers > 0)
-    {
-        double  leader_contribution;
-        double  parallel_divisor = (double) num_workers;
-
-        /* see get_parallel_divisor() */
-        leader_contribution = 1.0 - (0.3 * (double)num_workers);
-        parallel_divisor += Max(leader_contribution, 0.0);
-
-        /* The CPU cost is divided among all the workers. */
-        cpu_run_cost /= parallel_divisor;
-
-        /* Estimated row count per background worker process */
-        nrows = clamp_row_est(nrows / parallel_divisor);
-    }
-    path->rows = nrows;
-    path->startup_cost = startup_cost;
-    path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
-    path->parallel_workers = num_workers;
-}
-
-/*
- * ArrowGetForeignPaths
- */
-static void
-ArrowGetForeignPaths(PlannerInfo *root,
-                     RelOptInfo *baserel,
-                     Oid foreigntableid)
-{
-    ForeignPath *fpath;
-    ParamPathInfo *param_info;
-    Relids      required_outer = baserel->lateral_relids;
-
-    param_info = get_baserel_parampathinfo(root, baserel, required_outer);
-    fpath = create_foreignscan_path(root,
-                                    baserel,
-                                    NULL,   /* default pathtarget */
-                                    -1.0,   /* dummy */
-                                    -1.0,   /* dummy */
-                                    -1.0,   /* dummy */
-                                    NIL,    /* no pathkeys */
-                                    required_outer,
-                                    NULL,   /* no extra plan */
-                                    NIL);   /* no particular private */
-    cost_arrow_fdw_seqscan(&fpath->path,
-                           root,
-                           baserel,
-                           param_info, 0);
-    add_path(baserel, &fpath->path);
-
-    if (baserel->consider_parallel)
-    {
-        int     num_workers =
-            compute_parallel_worker(baserel,
-                                    baserel->pages, -1.0,
-                                    max_parallel_workers_per_gather);
-        if (num_workers == 0)
-            return;
-
-        fpath = create_foreignscan_path(root,
-                                        baserel,
-                                        NULL,   /* default pathtarget */
-                                        -1.0,   /* dummy */
-                                        -1.0,   /* dummy */
-                                        -1.0,   /* dummy */
-                                        NIL,    /* no pathkeys */
-                                        required_outer,
-                                        NULL,   /* no extra plan */
-                                        NIL);   /* no particular private */
-        fpath->path.parallel_aware = true;
-        cost_arrow_fdw_seqscan(&fpath->path,
-                               root,
-                               baserel,
-                               param_info,
-                               num_workers);
-        add_partial_path(baserel, (Path *)fpath);
-    }
-}
-
-/*
- * ArrowGetForeignPlan
- */
-static ForeignScan *
-ArrowGetForeignPlan(PlannerInfo *root,
-                    RelOptInfo *baserel,
-                    Oid foreigntableid,
-                    ForeignPath *best_path,
-                    List *tlist,
-                    List *scan_clauses,
-                    Plan *outer_plan)
-{
-    Bitmapset  *referenced = lsecond(baserel->fdw_private);
-    List       *ref_list = NIL;
-    int         k;
-
-    for (k = bms_next_member(referenced, -1);
-         k >= 0;
-         k = bms_next_member(referenced, k))
-    {
-        ref_list = lappend_int(ref_list, k);
-    }
-    return make_foreignscan(tlist,
-                            extract_actual_clauses(scan_clauses, false),
-                            baserel->relid,
-                            NIL,    /* no expressions to evaluate */
-                            ref_list, /* list of referenced attnums */
-                            NIL,    /* no custom tlist */
-                            NIL,    /* no remote quals */
-                            outer_plan);
-}
-
-/* ----------------------------------------------------------------
- *
- * Routines related to Arrow datum fetch
- *
- * ----------------------------------------------------------------
- */
-static void pg_datum_arrow_ref(kern_data_store *kds,
-                               kern_colmeta *cmeta,
-                               size_t index,
-                               Datum *p_datum,
-                               bool *p_isnull);
-
-static Datum
-pg_varlena32_arrow_ref(kern_data_store *kds,
-                       kern_colmeta *cmeta, size_t index)
-{
-    uint32_t   *offset = (uint32_t *)((char *)kds +
-                                      __kds_unpack(cmeta->values_offset));
-    char       *extra = (char *)kds + __kds_unpack(cmeta->extra_offset);
-    uint32_t    len;
-    struct varlena *res;
-
-    if (sizeof(uint32_t) * (index+2) > __kds_unpack(cmeta->values_length))
-        elog(ERROR, "corruption? varlena index out of range");
-    len = offset[index+1] - offset[index];
-    if (offset[index] > offset[index+1] ||
-        offset[index+1] > __kds_unpack(cmeta->extra_length))
-        elog(ERROR, "corruption? varlena points out of extra buffer");
-    if (len >= (1UL<<VARLENA_EXTSIZE_BITS))
-        elog(ERROR, "variable length datum is too long");
-    res = palloc(VARHDRSZ + len);
-    memcpy(VARDATA(res), extra + offset[index], len);
-    SET_VARSIZE(res, VARHDRSZ + len);
-
-    return PointerGetDatum(res);
-}
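pg_varlena32_arrow_ref above assumes the standard Arrow variable-length layout: an (nitems + 1)-entry offsets buffer plus a separate data ("extra") buffer, where value i occupies the byte range [offset[i], offset[i+1]). A tiny sketch of just that access, with a hypothetical helper name:

#include <stdint.h>
#include <string.h>

/* copy the i-th variable-length value into 'dest', return its length */
static size_t
arrow_varlena_fetch(const uint32_t *offset,   /* offsets buffer, nitems+1 */
                    const char *extra,        /* data buffer */
                    size_t index,
                    char *dest)
{
    uint32_t    len = offset[index+1] - offset[index];

    memcpy(dest, extra + offset[index], len);
    return len;
}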
-
-static Datum
-pg_varlena64_arrow_ref(kern_data_store *kds,
-                       kern_colmeta *cmeta, size_t index)
-{
-    uint64_t   *offset = (uint64_t *)((char *)kds +
-                                      __kds_unpack(cmeta->values_offset));
-    char       *extra = (char *)kds + __kds_unpack(cmeta->extra_offset);
-    uint64_t    len;
-    struct varlena *res;
-
-    if (sizeof(uint64_t) * (index+2) > __kds_unpack(cmeta->values_length))
-        elog(ERROR, "corruption? varlena index out of range");
-    len = offset[index+1] - offset[index];
-    if (offset[index] > offset[index+1] ||
-        offset[index+1] > __kds_unpack(cmeta->extra_length))
-        elog(ERROR, "corruption? varlena points out of extra buffer");
-    if (len >= (1UL<<VARLENA_EXTSIZE_BITS))
-        elog(ERROR, "variable length datum is too long");
-    res = palloc(VARHDRSZ + len);
-    memcpy(VARDATA(res), extra + offset[index], len);
-    SET_VARSIZE(res, VARHDRSZ + len);
-
-    return PointerGetDatum(res);
-}
-
-static Datum
-pg_bpchar_arrow_ref(kern_data_store *kds,
-                    kern_colmeta *cmeta, size_t index)
-{
-    char       *values = ((char *)kds + __kds_unpack(cmeta->values_offset));
-    size_t      length = __kds_unpack(cmeta->values_length);
-    int32_t     unitsz = cmeta->attopts.fixed_size_binary.byteWidth;
-    struct varlena *res;
-
-    if (unitsz <= 0)
-        elog(ERROR, "CHAR(%d) is not expected", unitsz);
-    if (unitsz * index >= length)
-        elog(ERROR, "corruption? bpchar points out of range");
-    res = palloc(VARHDRSZ + unitsz);
-    memcpy((char *)res + VARHDRSZ, values + unitsz * index, unitsz);
-    SET_VARSIZE(res, VARHDRSZ + unitsz);
-
-    return PointerGetDatum(res);
-}
-
-static Datum
-pg_bool_arrow_ref(kern_data_store *kds,
-                  kern_colmeta *cmeta, size_t index)
-{
-    uint8_t    *bitmap = (uint8_t *)kds + __kds_unpack(cmeta->values_offset);
-    size_t      length = __kds_unpack(cmeta->values_length);
-    uint8_t     mask = (1 << (index & 7));
-
-    if (sizeof(uint8_t) * index >= length)
-        elog(ERROR, "corruption? bool points out of range");
-    return BoolGetDatum((bitmap[index>>3] & mask) != 0 ? true : false);
-}
-
-static Datum
-pg_simple_arrow_ref(kern_data_store *kds,
-                    kern_colmeta *cmeta, size_t index)
-{
-    int32_t     unitsz = cmeta->attopts.unitsz;
-    char       *values = (char *)kds + __kds_unpack(cmeta->values_offset);
-    size_t      length = __kds_unpack(cmeta->values_length);
-    Datum       retval = 0;
-
-    Assert(unitsz > 0 && unitsz <= sizeof(Datum));
-    if (unitsz * index >= length)
-        elog(ERROR, "corruption? simple int8 points out of range");
-    memcpy(&retval, values + unitsz * index, unitsz);
-    return retval;
-}
-
-static Datum
-pg_numeric_arrow_ref(kern_data_store *kds,
-                     kern_colmeta *cmeta, size_t index)
-{
-    char       *result = palloc0(sizeof(struct NumericData));
-    char       *base = (char *)kds + __kds_unpack(cmeta->values_offset);
-    size_t      length = __kds_unpack(cmeta->values_length);
-    int         dscale = cmeta->attopts.decimal.scale;
-    int128_t    ival;
-
-    if (sizeof(int128_t) * index >= length)
-        elog(ERROR, "corruption? numeric points out of range");
-    ival = ((int128_t *)base)[index];
-    __xpu_numeric_to_varlena(result, dscale, ival);
-
-    return PointerGetDatum(result);
-}
-
-static Datum
-pg_date_arrow_ref(kern_data_store *kds,
-                  kern_colmeta *cmeta, size_t index)
-{
-    char       *base = (char *)kds + __kds_unpack(cmeta->values_offset);
-    size_t      length = __kds_unpack(cmeta->values_length);
-    DateADT     dt;
-
-    switch (cmeta->attopts.date.unit)
-    {
-        case ArrowDateUnit__Day:
-            if (sizeof(uint32) * index >= length)
-                elog(ERROR, "corruption? Date[day] points out of range");
-            dt = ((uint32 *)base)[index];
-            break;
-        case ArrowDateUnit__MilliSecond:
-            if (sizeof(uint64) * index >= length)
-                elog(ERROR, "corruption? Date[ms] points out of range");
-            dt = ((uint64 *)base)[index] / 1000;
-            break;
-        default:
-            elog(ERROR, "Bug? unexpected unit of Date type");
-    }
-    /* convert UNIX epoch to PostgreSQL epoch */
-    dt -= (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
-    return DateADTGetDatum(dt);
-}
-
-static Datum
-pg_time_arrow_ref(kern_data_store *kds,
-                  kern_colmeta *cmeta, size_t index)
-{
-    char       *base = (char *)kds + __kds_unpack(cmeta->values_offset);
-    size_t      length = __kds_unpack(cmeta->values_length);
-    TimeADT     tm;
-
-    switch (cmeta->attopts.time.unit)
-    {
-        case ArrowTimeUnit__Second:
-            if (sizeof(uint32) * index >= length)
-                elog(ERROR, "corruption? 
Time[sec] points out of range"); - tm = ((uint32 *)base)[index] * 1000000L; - break; - case ArrowTimeUnit__MilliSecond: - if (sizeof(uint32) * index >= length) - elog(ERROR, "corruption? Time[ms] points out of range"); - tm = ((uint32 *)base)[index] * 1000L; - break; - case ArrowTimeUnit__MicroSecond: - if (sizeof(uint64) * index >= length) - elog(ERROR, "corruption? Time[us] points out of range"); - tm = ((uint64 *)base)[index]; - break; - case ArrowTimeUnit__NanoSecond: - if (sizeof(uint64) * index >= length) - elog(ERROR, "corruption? Time[ns] points out of range"); - tm = ((uint64 *)base)[index] / 1000L; - break; - default: - elog(ERROR, "Bug? unexpected unit of Time type"); - break; - } - return TimeADTGetDatum(tm); -} - -static Datum -pg_timestamp_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) -{ - char *base = (char *)kds + __kds_unpack(cmeta->values_offset); - size_t length = __kds_unpack(cmeta->values_length); - Timestamp ts; - - switch (cmeta->attopts.timestamp.unit) - { - case ArrowTimeUnit__Second: - if (sizeof(uint64) * index >= length) - elog(ERROR, "corruption? Timestamp[sec] points out of range"); - ts = ((uint64 *)base)[index] * 1000000UL; - break; - case ArrowTimeUnit__MilliSecond: - if (sizeof(uint64) * index >= length) - elog(ERROR, "corruption? Timestamp[ms] points out of range"); - ts = ((uint64 *)base)[index] * 1000UL; - break; - case ArrowTimeUnit__MicroSecond: - if (sizeof(uint64) * index >= length) - elog(ERROR, "corruption? Timestamp[us] points out of range"); - ts = ((uint64 *)base)[index]; - break; - case ArrowTimeUnit__NanoSecond: - if (sizeof(uint64) * index >= length) - elog(ERROR, "corruption? Timestamp[ns] points out of range"); - ts = ((uint64 *)base)[index] / 1000UL; - break; - default: - elog(ERROR, "Bug? unexpected unit of Timestamp type"); - break; - } - /* convert UNIX epoch to PostgreSQL epoch */ - ts -= (POSTGRES_EPOCH_JDATE - - UNIX_EPOCH_JDATE) * USECS_PER_DAY; - return TimestampGetDatum(ts); -} - -static Datum -pg_interval_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) -{ - char *base = (char *)kds + __kds_unpack(cmeta->values_offset); - size_t length = __kds_unpack(cmeta->values_length); - Interval *iv = palloc0(sizeof(Interval)); - - switch (cmeta->attopts.interval.unit) - { - case ArrowIntervalUnit__Year_Month: - /* 32bit: number of months */ - if (sizeof(uint32) * index >= length) - elog(ERROR, "corruption? Interval[Year/Month] points out of range"); - iv->month = ((uint32 *)base)[index]; - break; - case ArrowIntervalUnit__Day_Time: - /* 32bit+32bit: number of days and milliseconds */ - if (2 * sizeof(uint32) * index >= length) - elog(ERROR, "corruption? Interval[Day/Time] points out of range"); - iv->day = ((int32 *)base)[2 * index]; - iv->time = ((int32 *)base)[2 * index + 1] * 1000; - break; - default: - elog(ERROR, "Bug? unexpected unit of Interval type"); - } - return PointerGetDatum(iv); -} - -static Datum -pg_macaddr_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) -{ - char *base = (char *)kds + __kds_unpack(cmeta->values_offset); - size_t length = __kds_unpack(cmeta->values_length); - - if (cmeta->attopts.fixed_size_binary.byteWidth != sizeof(macaddr)) - elog(ERROR, "Bug? wrong FixedSizeBinary::byteWidth(%d) for macaddr", - cmeta->attopts.fixed_size_binary.byteWidth); - if (sizeof(macaddr) * index >= length) - elog(ERROR, "corruption? 
Binary[macaddr] points out of range"); - - return PointerGetDatum(base + sizeof(macaddr) * index); -} - -static Datum -pg_inet_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) -{ - char *base = (char *)kds + __kds_unpack(cmeta->values_offset); - size_t length = __kds_unpack(cmeta->values_length); - inet *ip = palloc(sizeof(inet)); - - if (cmeta->attopts.fixed_size_binary.byteWidth == 4) - { - if (4 * index >= length) - elog(ERROR, "corruption? Binary[inet4] points out of range"); - ip->inet_data.family = PGSQL_AF_INET; - ip->inet_data.bits = 32; - memcpy(ip->inet_data.ipaddr, base + 4 * index, 4); - } - else if (cmeta->attopts.fixed_size_binary.byteWidth == 16) - { - if (16 * index >= length) - elog(ERROR, "corruption? Binary[inet6] points out of range"); - ip->inet_data.family = PGSQL_AF_INET6; - ip->inet_data.bits = 128; - memcpy(ip->inet_data.ipaddr, base + 16 * index, 16); - } - else - elog(ERROR, "Bug? wrong FixedSizeBinary::byteWidth(%d) for inet", - cmeta->attopts.fixed_size_binary.byteWidth); - - SET_INET_VARSIZE(ip); - return PointerGetDatum(ip); -} - -static Datum -pg_array_arrow_ref(kern_data_store *kds, - kern_colmeta *smeta, - uint32_t start, uint32_t end) -{ - ArrayType *res; - size_t sz; - uint32_t i, nitems = end - start; - bits8 *nullmap = NULL; - size_t usage, __usage; - - /* sanity checks */ - if (start > end) - elog(ERROR, "Bug? array index has reversed order [%u..%u]", start, end); - - /* allocation of the result buffer */ - if (smeta->nullmap_offset != 0) - sz = ARR_OVERHEAD_WITHNULLS(1, nitems); - else - sz = ARR_OVERHEAD_NONULLS(1); - - if (smeta->attlen > 0) - { - sz += TYPEALIGN(smeta->attalign, - smeta->attlen) * nitems; - } - else if (smeta->attlen == -1) - { - sz += 400; /* tentative allocation */ - } - else - elog(ERROR, "Bug? corrupted kernel column metadata"); - - res = palloc0(sz); - res->ndim = 1; - if (smeta->nullmap_offset != 0) - { - res->dataoffset = ARR_OVERHEAD_WITHNULLS(1, nitems); - nullmap = ARR_NULLBITMAP(res); - } - res->elemtype = smeta->atttypid; - ARR_DIMS(res)[0] = nitems; - ARR_LBOUND(res)[0] = 1; - usage = ARR_DATA_OFFSET(res); - for (i=0; i < nitems; i++) - { - Datum datum; - bool isnull; - - pg_datum_arrow_ref(kds, smeta, start+i, &datum, &isnull); - if (isnull) - { - if (!nullmap) - elog(ERROR, "Bug? element item should not be NULL"); - } - else if (smeta->attlen > 0) - { - if (nullmap) - nullmap[i>>3] |= (1<<(i&7)); - __usage = TYPEALIGN(smeta->attalign, usage); - while (__usage + smeta->attlen > sz) - { - sz += sz; - res = repalloc(res, sz); - } - if (__usage > usage) - memset((char *)res + usage, 0, __usage - usage); - memcpy((char *)res + __usage, &datum, smeta->attlen); - usage = __usage + smeta->attlen; - } - else if (smeta->attlen == -1) - { - int32_t vl_len = VARSIZE(datum); - - if (nullmap) - nullmap[i>>3] |= (1<<(i&7)); - __usage = TYPEALIGN(smeta->attalign, usage); - while (__usage + vl_len > sz) - { - sz += sz; - res = repalloc(res, sz); - } - if (__usage > usage) - memset((char *)res + usage, 0, __usage - usage); - memcpy((char *)res + __usage, DatumGetPointer(datum), vl_len); - usage = __usage + vl_len; - - pfree(DatumGetPointer(datum)); - } - else - elog(ERROR, "Bug? 
corrupted kernel column metadata");
-    }
-    SET_VARSIZE(res, usage);
-
-    return PointerGetDatum(res);
-}
-
-/*
- * pg_datum_arrow_ref
- */
-static void
-pg_datum_arrow_ref(kern_data_store *kds,
-                   kern_colmeta *cmeta,
-                   size_t index,
-                   Datum *p_datum,
-                   bool *p_isnull)
-{
-    Datum       datum = 0;
-    bool        isnull = false;
-
-    if (cmeta->nullmap_offset != 0)
-    {
-        size_t      nullmap_offset = __kds_unpack(cmeta->nullmap_offset);
-        uint8      *nullmap = (uint8 *)kds + nullmap_offset;
-
-        if (att_isnull(index, nullmap))
-        {
-            isnull = true;
-            goto out;
-        }
-    }
-
-    switch (cmeta->attopts.tag)
-    {
-        case ArrowType__Int:
-        case ArrowType__FloatingPoint:
-            datum = pg_simple_arrow_ref(kds, cmeta, index);
-            break;
-        case ArrowType__Bool:
-            datum = pg_bool_arrow_ref(kds, cmeta, index);
-            break;
-        case ArrowType__Decimal:
-            datum = pg_numeric_arrow_ref(kds, cmeta, index);
-            break;
-        case ArrowType__Date:
-            datum = pg_date_arrow_ref(kds, cmeta, index);
-            break;
-        case ArrowType__Time:
-            datum = pg_time_arrow_ref(kds, cmeta, index);
-            break;
-        case ArrowType__Timestamp:
-            datum = pg_timestamp_arrow_ref(kds, cmeta, index);
-            break;
-        case ArrowType__Interval:
-            datum = pg_interval_arrow_ref(kds, cmeta, index);
-            break;
-        case ArrowType__Utf8:
-        case ArrowType__Binary:
-            datum = pg_varlena32_arrow_ref(kds, cmeta, index);
-            break;
-        case ArrowType__LargeUtf8:
-        case ArrowType__LargeBinary:
-            datum = pg_varlena64_arrow_ref(kds, cmeta, index);
-            break;
-
-        case ArrowType__FixedSizeBinary:
-            switch (cmeta->atttypid)
-            {
-                case MACADDROID:
-                    datum = pg_macaddr_arrow_ref(kds, cmeta, index);
-                    break;
-                case INETOID:
-                    datum = pg_inet_arrow_ref(kds, cmeta, index);
-                    break;
-                case BPCHAROID:
-                    datum = pg_bpchar_arrow_ref(kds, cmeta, index);
-                    break;
-                default:
-                    elog(ERROR, "unknown FixedSizeBinary mapping");
-                    break;
-            }
-            break;
-
-        case ArrowType__List:
-            {
-                kern_colmeta *smeta;
-                uint32_t   *offset;
-
-                if (cmeta->num_subattrs != 1 ||
-                    cmeta->idx_subattrs < kds->ncols ||
-                    cmeta->idx_subattrs >= kds->nr_colmeta)
-                    elog(ERROR, "Bug? corrupted kernel column metadata");
-                if (sizeof(uint32_t) * (index+2) > __kds_unpack(cmeta->values_length))
-                    elog(ERROR, "Bug? array index is out of range");
-                smeta = &kds->colmeta[cmeta->idx_subattrs];
-                offset = (uint32_t *)((char *)kds + __kds_unpack(cmeta->values_offset));
-                datum = pg_array_arrow_ref(kds, smeta,
-                                           offset[index],
-                                           offset[index+1]);
-                isnull = false;
-            }
-            break;
-
-        case ArrowType__LargeList:
-            {
-                kern_colmeta *smeta;
-                uint64_t   *offset;
-
-                if (cmeta->num_subattrs != 1 ||
-                    cmeta->idx_subattrs < kds->ncols ||
-                    cmeta->idx_subattrs >= kds->nr_colmeta)
-                    elog(ERROR, "Bug? corrupted kernel column metadata");
-                if (sizeof(uint64_t) * (index+2) > __kds_unpack(cmeta->values_length))
-                    elog(ERROR, "Bug? array index is out of range");
-                smeta = &kds->colmeta[cmeta->idx_subattrs];
-                offset = (uint64_t *)((char *)kds + __kds_unpack(cmeta->values_offset));
-                datum = pg_array_arrow_ref(kds, smeta,
-                                           offset[index],
-                                           offset[index+1]);
-                isnull = false;
-            }
-            break;
-
-        case ArrowType__Struct:
-            {
-                TupleDesc   tupdesc = lookup_rowtype_tupdesc(cmeta->atttypid, -1);
-                Datum      *sub_values = alloca(sizeof(Datum) * tupdesc->natts);
-                bool       *sub_isnull = alloca(sizeof(bool) * tupdesc->natts);
-                HeapTuple   htup;
-
-                if (tupdesc->natts != cmeta->num_subattrs)
-                    elog(ERROR, "Struct definition is corrupted?");
-                if (cmeta->idx_subattrs < kds->ncols ||
-                    cmeta->idx_subattrs + cmeta->num_subattrs > kds->nr_colmeta)
-                    elog(ERROR, "Bug? strange kernel column metadata");
-                for (int j=0; j < tupdesc->natts; j++)
-                {
-                    kern_colmeta *sub_meta = &kds->colmeta[cmeta->idx_subattrs + j];
-
-                    pg_datum_arrow_ref(kds, sub_meta, index,
-                                       sub_values + j,
-                                       sub_isnull + j);
-                }
-                htup = heap_form_tuple(tupdesc, sub_values, sub_isnull);
-
-                ReleaseTupleDesc(tupdesc);
-
-                datum = PointerGetDatum(htup->t_data);
-                isnull = false;
-            }
-            break;
-        default:
-            /* TODO: custom data type support here */
-            elog(ERROR, "arrow_fdw: unknown or unsupported type");
-    }
-out:
-    *p_datum = datum;
-    *p_isnull = isnull;
-}
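Before dispatching on attopts.tag, pg_datum_arrow_ref above consults the Arrow validity bitmap: one bit per row, LSB first, where a set bit means the value is present. A sketch of that bit test, equivalent to the att_isnull() call in the function; the helper name is hypothetical:

#include <stdbool.h>
#include <stdint.h>

static bool
arrow_row_is_null(const uint8_t *nullmap, size_t index)
{
    /* bit set = value present; bit clear = NULL */
    return (nullmap[index >> 3] & (1 << (index & 7))) == 0;
}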
-
-/*
- * kds_arrow_fetch_tuple
- */
-bool
-kds_arrow_fetch_tuple(TupleTableSlot *slot,
-                      kern_data_store *kds,
-                      size_t index,
-                      const Bitmapset *referenced)
-{
-    int     j, k;
-
-    if (index >= kds->nitems)
-        return false;
-    ExecStoreAllNullTuple(slot);
-    for (k = bms_next_member(referenced, -1);
-         k >= 0;
-         k = bms_next_member(referenced, k))
-    {
-        j = k + FirstLowInvalidHeapAttributeNumber - 1;
-        if (j < 0)
-            continue;
-        pg_datum_arrow_ref(kds,
-                           &kds->colmeta[j],
-                           index,
-                           slot->tts_values + j,
-                           slot->tts_isnull + j);
-    }
-    return true;
-}
-
-/* ----------------------------------------------------------------
- *
- * Executor callbacks
- *
- * ----------------------------------------------------------------
- */
-
-/*
- * __arrowFdwExecInit
- */
-static ArrowFdwState *
-__arrowFdwExecInit(ScanState *ss,
-                   List *outer_quals,
-                   const Bitmapset *outer_refs,
-                   const Bitmapset **p_optimal_gpus,
-                   const DpuStorageEntry **p_ds_entry)
-{
-    Relation    frel = ss->ss_currentRelation;
-    TupleDesc   tupdesc = RelationGetDescr(frel);
-    ForeignTable *ft = GetForeignTable(RelationGetRelid(frel));
-    Bitmapset  *referenced = NULL;
-    Bitmapset  *stat_attrs = NULL;
-    Bitmapset  *optimal_gpus = NULL;
-    const DpuStorageEntry *ds_entry = NULL;
-    bool        whole_row_ref = false;
-    List       *filesList;
-    List       *af_states_list = NIL;
-    uint32_t    rb_nrooms = 0;
-    uint32_t    rb_nitems = 0;
-    ArrowFdwState *arrow_state;
-    ListCell   *lc1, *lc2;
-
-    Assert(RelationIsArrowFdw(frel));
-    /* expand 'referenced' if it has whole-row reference */
-    if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, outer_refs))
-        whole_row_ref = true;
-    for (int j=0; j < tupdesc->natts; j++)
-    {
-        Form_pg_attribute attr = TupleDescAttr(tupdesc, j);
-        int     k = attr->attnum - FirstLowInvalidHeapAttributeNumber;
-
-        if (attr->attisdropped)
-            continue;
-        if (whole_row_ref || bms_is_member(k, outer_refs))
-            referenced = bms_add_member(referenced, k);
-    }
-
-    /* setup ArrowFileState */
-    filesList = arrowFdwExtractFilesList(ft->options, NULL);
-    foreach (lc1, filesList)
-    {
-        char       *fname = strVal(lfirst(lc1));
-        ArrowFileState *af_state;
-
-        af_state = BuildArrowFileState(frel, fname, &stat_attrs);
-        if (af_state)
-        {
-            rb_nrooms += list_length(af_state->rb_list);
-            if (p_optimal_gpus)
-            {
-                const Bitmapset *__optimal_gpus = GetOptimalGpuForFile(fname);
-
-                if (af_states_list == NIL)
-                    optimal_gpus = bms_copy(__optimal_gpus);
-                else
-                    optimal_gpus = bms_intersect(optimal_gpus, __optimal_gpus);
-            }
-            if (p_ds_entry)
-            {
-                const DpuStorageEntry *ds_temp;
-
-                if (af_states_list == NIL)
-                    ds_entry = GetOptimalDpuForFile(fname, &af_state->dpu_path);
-                else if (ds_entry)
-                {
-                    ds_temp = GetOptimalDpuForFile(fname, &af_state->dpu_path);
-                    if (!DpuStorageEntryIsEqual(ds_entry, ds_temp))
-                        ds_entry = NULL;
-                }
-            }
-            af_states_list = lappend(af_states_list, af_state);
-        }
-    }
-
-    /* setup ArrowFdwState */
-    arrow_state = palloc0(offsetof(ArrowFdwState, rb_states[rb_nrooms]));
-    arrow_state->referenced = referenced;
-    if 
(arrow_fdw_stats_hint_enabled) - arrow_state->stats_hint = execInitArrowStatsHint(ss, outer_quals, stat_attrs); - arrow_state->rbatch_index = &arrow_state->__rbatch_index_local; - arrow_state->rbatch_nload = &arrow_state->__rbatch_nload_local; - arrow_state->rbatch_nskip = &arrow_state->__rbatch_nskip_local; - initStringInfo(&arrow_state->chunk_buffer); - arrow_state->curr_filp = -1; - arrow_state->curr_kds = NULL; - arrow_state->curr_index = 0; - arrow_state->af_states_list = af_states_list; - foreach (lc1, af_states_list) - { - ArrowFileState *af_state = lfirst(lc1); - - foreach (lc2, af_state->rb_list) - { - RecordBatchState *rb_state = lfirst(lc2); - - arrow_state->rb_states[rb_nitems++] = rb_state; - } - } - Assert(rb_nrooms == rb_nitems); - arrow_state->rb_nitems = rb_nitems; - - if (p_optimal_gpus) - *p_optimal_gpus = optimal_gpus; - if (p_ds_entry) - *p_ds_entry = ds_entry; - - return arrow_state; -} - -/* - * pgstromArrowFdwExecInit - */ -bool -pgstromArrowFdwExecInit(pgstromTaskState *pts, - List *outer_quals, - const Bitmapset *outer_refs) -{ - Relation frel = pts->css.ss.ss_currentRelation; - ArrowFdwState *arrow_state = NULL; - - if (RelationIsArrowFdw(frel)) - { - arrow_state = __arrowFdwExecInit(&pts->css.ss, - outer_quals, - outer_refs, - (pts->task_kind & DEVKIND__NVIDIA_GPU) != 0 - ? &pts->optimal_gpus : NULL, - (pts->task_kind & DEVKIND__NVIDIA_DPU) != 0 - ? &pts->ds_entry : NULL); - } - pts->arrow_state = arrow_state; - return (pts->arrow_state != NULL); -} - -/* - * ArrowBeginForeignScan - */ -static void -ArrowBeginForeignScan(ForeignScanState *node, int eflags) -{ - ForeignScan *fscan = (ForeignScan *)node->ss.ps.plan; - Bitmapset *referenced = NULL; - ListCell *lc; - - foreach (lc, fscan->fdw_private) - { - int k = lfirst_int(lc); - - referenced = bms_add_member(referenced, k); - } - node->fdw_state = __arrowFdwExecInit(&node->ss, - fscan->scan.plan.qual, - referenced, - NULL, /* no GPU */ - NULL); /* no DPU */ -} - -/* - * ExecArrowScanChunk - */ -static inline RecordBatchState * -__arrowFdwNextRecordBatch(ArrowFdwState *arrow_state) -{ - RecordBatchState *rb_state; - uint32_t rb_index; - -retry: - rb_index = pg_atomic_fetch_add_u32(arrow_state->rbatch_index, 1); - if (rb_index >= arrow_state->rb_nitems) - return NULL; /* no more chunks to load */ - rb_state = arrow_state->rb_states[rb_index]; - if (arrow_state->stats_hint) - { - if (execCheckArrowStatsHint(arrow_state->stats_hint, rb_state)) - { - pg_atomic_fetch_add_u32(arrow_state->rbatch_nskip, 1); - goto retry; - } - pg_atomic_fetch_add_u32(arrow_state->rbatch_nload, 1); - } - return rb_state; -} - -/* - * pgstromScanChunkArrowFdw - */ -XpuCommand * -pgstromScanChunkArrowFdw(pgstromTaskState *pts, - struct iovec *xcmd_iov, int *xcmd_iovcnt) -{ - ArrowFdwState *arrow_state = pts->arrow_state; - StringInfo chunk_buffer = &arrow_state->chunk_buffer; - RecordBatchState *rb_state; - ArrowFileState *af_state; - strom_io_vector *iovec; - XpuCommand *xcmd; - uint32_t kds_src_offset; - uint32_t kds_src_iovec; - uint32_t kds_src_pathname; - - rb_state = __arrowFdwNextRecordBatch(arrow_state); - if (!rb_state) - return NULL; - af_state = rb_state->af_state; - - /* XpuCommand header */ - resetStringInfo(chunk_buffer); - appendBinaryStringInfo(chunk_buffer, - pts->xcmd_buf.data, - pts->xcmd_buf.len); - /* kds_src + iovec */ - kds_src_offset = chunk_buffer->len; - iovec = arrowFdwLoadRecordBatch(pts->css.ss.ss_currentRelation, - arrow_state->referenced, - rb_state, - chunk_buffer); - kds_src_iovec = 
__appendBinaryStringInfo(chunk_buffer, - iovec, - offsetof(strom_io_vector, - ioc[iovec->nr_chunks])); - /* arrow filename */ - kds_src_pathname = chunk_buffer->len; - if (!pts->ds_entry) - appendStringInfoString(chunk_buffer, af_state->filename); - else - appendStringInfoString(chunk_buffer, af_state->dpu_path); - appendStringInfoChar(chunk_buffer, '\0'); - - /* assign offset of XpuCommand */ - xcmd = (XpuCommand *)chunk_buffer->data; - xcmd->length = chunk_buffer->len; - xcmd->u.task.kds_src_pathname = kds_src_pathname; - xcmd->u.task.kds_src_iovec = kds_src_iovec; - xcmd->u.task.kds_src_offset = kds_src_offset; - - xcmd_iov->iov_base = xcmd; - xcmd_iov->iov_len = xcmd->length; - *xcmd_iovcnt = 1; - - return xcmd; -} - -/* - * ArrowIterateForeignScan - */ -static TupleTableSlot * -ArrowIterateForeignScan(ForeignScanState *node) -{ - ArrowFdwState *arrow_state = node->fdw_state; - TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; - kern_data_store *kds; - - while ((kds = arrow_state->curr_kds) == NULL || - arrow_state->curr_index >= kds->nitems) - { - RecordBatchState *rb_state; - - arrow_state->curr_index = 0; - arrow_state->curr_kds = NULL; - rb_state = __arrowFdwNextRecordBatch(arrow_state); - if (!rb_state) - return NULL; - arrow_state->curr_kds - = arrowFdwFillupRecordBatch(node->ss.ss_currentRelation, - arrow_state->referenced, - rb_state, - &arrow_state->chunk_buffer); - } - Assert(kds && arrow_state->curr_index < kds->nitems); - if (kds_arrow_fetch_tuple(slot, kds, - arrow_state->curr_index++, - arrow_state->referenced)) - return slot; - return NULL; -} - -/* - * ArrowReScanForeignScan - */ -void -pgstromArrowFdwExecReset(ArrowFdwState *arrow_state) -{ - pg_atomic_write_u32(arrow_state->rbatch_index, 0); - if (arrow_state->curr_kds) - pfree(arrow_state->curr_kds); - arrow_state->curr_kds = NULL; - arrow_state->curr_index = 0; -} - -static void -ArrowReScanForeignScan(ForeignScanState *node) -{ - pgstromArrowFdwExecReset(node->fdw_state); -} - -/* - * ExecEndArrowScan - */ -void -pgstromArrowFdwExecEnd(ArrowFdwState *arrow_state) -{ - if (arrow_state->curr_filp >= 0) - FileClose(arrow_state->curr_filp); - if (arrow_state->stats_hint) - execEndArrowStatsHint(arrow_state->stats_hint); -} - -static void -ArrowEndForeignScan(ForeignScanState *node) -{ - pgstromArrowFdwExecEnd(node->fdw_state); -} - -/* - * ArrowIsForeignScanParallelSafe - */ -static bool -ArrowIsForeignScanParallelSafe(PlannerInfo *root, - RelOptInfo *rel, - RangeTblEntry *rte) -{ - return true; -} - -/* - * ArrowEstimateDSMForeignScan - */ -static Size -ArrowEstimateDSMForeignScan(ForeignScanState *node, - ParallelContext *pcxt) -{ - return offsetof(pgstromSharedState, inners); -} - -/* - * ArrowInitializeDSMForeignScan - */ -void -pgstromArrowFdwInitDSM(ArrowFdwState *arrow_state, - pgstromSharedState *ps_state) -{ - arrow_state->rbatch_index = &ps_state->arrow_rbatch_index; - arrow_state->rbatch_nload = &ps_state->arrow_rbatch_nload; - arrow_state->rbatch_nskip = &ps_state->arrow_rbatch_nskip; -} - -static void -ArrowInitializeDSMForeignScan(ForeignScanState *node, - ParallelContext *pcxt, - void *coordinate) -{ - pgstromSharedState *ps_state = (pgstromSharedState *)coordinate; - - memset(ps_state, 0, offsetof(pgstromSharedState, inners)); - pgstromArrowFdwInitDSM(node->fdw_state, ps_state); -} - -/* - * ArrowInitializeWorkerForeignScan - */ -void -pgstromArrowFdwAttachDSM(ArrowFdwState *arrow_state, - pgstromSharedState *ps_state) -{ - arrow_state->rbatch_index = &ps_state->arrow_rbatch_index; - 
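	/*
	 * Note: these pointers initially refer to backend-local atomic
	 * counters (see __arrowFdwExecInit); re-pointing them into the DSM
	 * segment means the leader and every parallel worker consume
	 * record-batches from one shared rbatch_index, and the nload/nskip
	 * statistics are accumulated in one place.
	 */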
arrow_state->rbatch_nload = &ps_state->arrow_rbatch_nload; - arrow_state->rbatch_nskip = &ps_state->arrow_rbatch_nskip; -} - -static void -ArrowInitializeWorkerForeignScan(ForeignScanState *node, - shm_toc *toc, - void *coordinate) -{ - pgstromSharedState *ps_state = (pgstromSharedState *)coordinate; - - pgstromArrowFdwAttachDSM(node->fdw_state, ps_state); -} - -/* - * ArrowShutdownForeignScan - */ -void -pgstromArrowFdwShutdown(ArrowFdwState *arrow_state) -{ - uint32 temp; - - temp = pg_atomic_read_u32(arrow_state->rbatch_index); - pg_atomic_write_u32(&arrow_state->__rbatch_index_local, temp); - arrow_state->rbatch_index = &arrow_state->__rbatch_index_local; - - temp = pg_atomic_read_u32(arrow_state->rbatch_nload); - pg_atomic_write_u32(&arrow_state->__rbatch_nload_local, temp); - arrow_state->rbatch_nload = &arrow_state->__rbatch_nload_local; - - temp = pg_atomic_read_u32(arrow_state->rbatch_nskip); - pg_atomic_write_u32(&arrow_state->__rbatch_nskip_local, temp); - arrow_state->rbatch_nskip = &arrow_state->__rbatch_nskip_local; - -} - -static void -ArrowShutdownForeignScan(ForeignScanState *node) -{ - pgstromArrowFdwShutdown(node->fdw_state); -} - -/* - * ArrowExplainForeignScan - */ -void -pgstromArrowFdwExplain(ArrowFdwState *arrow_state, - Relation frel, - ExplainState *es, - List *dcontext) -{ - TupleDesc tupdesc = RelationGetDescr(frel); - size_t *chunk_sz; - ListCell *lc1, *lc2; - int fcount = 0; - int j, k; - char label[100]; - StringInfoData buf; - - initStringInfo(&buf); - /* shows referenced columns */ - for (k = bms_next_member(arrow_state->referenced, -1); - k >= 0; - k = bms_next_member(arrow_state->referenced, k)) - { - j = k + FirstLowInvalidHeapAttributeNumber; - - if (j > 0) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, j-1); - const char *attname = NameStr(attr->attname); - - if (buf.len > 0) - appendStringInfoString(&buf, ", "); - appendStringInfoString(&buf, quote_identifier(attname)); - } - } - ExplainPropertyText("referenced", buf.data, es); - - /* shows stats hint if any */ - if (arrow_state->stats_hint) - { - arrowStatsHint *stats_hint = arrow_state->stats_hint; - - resetStringInfo(&buf); - foreach (lc1, stats_hint->orig_quals) - { - Node *qual = lfirst(lc1); - char *temp; - - temp = deparse_expression(qual, dcontext, es->verbose, false); - if (buf.len > 0) - appendStringInfoString(&buf, ", "); - appendStringInfoString(&buf, temp); - pfree(temp); - } - if (es->analyze) - appendStringInfo(&buf, " [loaded: %u, skipped: %u]", - pg_atomic_read_u32(arrow_state->rbatch_nload), - pg_atomic_read_u32(arrow_state->rbatch_nskip)); - ExplainPropertyText("Stats-Hint", buf.data, es); - } - - /* shows files on behalf of the foreign table */ - chunk_sz = alloca(sizeof(size_t) * tupdesc->natts); - memset(chunk_sz, 0, sizeof(size_t) * tupdesc->natts); - foreach (lc1, arrow_state->af_states_list) - { - ArrowFileState *af_state = lfirst(lc1); - size_t total_sz = af_state->stat_buf.st_size; - size_t read_sz = 0; - size_t sz; - - foreach (lc2, af_state->rb_list) - { - RecordBatchState *rb_state = lfirst(lc2); - - if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, - arrow_state->referenced)) - { - /* whole-row reference */ - read_sz += rb_state->rb_length; - } - else - { - for (k = bms_next_member(arrow_state->referenced, -1); - k >= 0; - k = bms_next_member(arrow_state->referenced, k)) - { - j = k + FirstLowInvalidHeapAttributeNumber - 1; - if (j < 0 || j >= tupdesc->natts) - continue; - sz = __recordBatchFieldLength(&rb_state->fields[j]); - read_sz += sz; - chunk_sz[j] += 
sz; - } - } - } - - /* file size and read size */ - if (es->format == EXPLAIN_FORMAT_TEXT) - { - resetStringInfo(&buf); - appendStringInfo(&buf, "%s (read: %s, size: %s)", - af_state->filename, - format_bytesz(read_sz), - format_bytesz(total_sz)); - snprintf(label, sizeof(label), "file%d", fcount); - ExplainPropertyText(label, buf.data, es); - } - else - { - snprintf(label, sizeof(label), "file%d", fcount); - ExplainPropertyText(label, af_state->filename, es); - - snprintf(label, sizeof(label), "file%d-read", fcount); - ExplainPropertyText(label, format_bytesz(read_sz), es); - - snprintf(label, sizeof(label), "file%d-size", fcount); - ExplainPropertyText(label, format_bytesz(total_sz), es); - } - fcount++; - } - - /* read-size per column (only verbose mode) */ - if (es->verbose && arrow_state->rb_nitems > 0 && - !bms_is_member(-FirstLowInvalidHeapAttributeNumber, - arrow_state->referenced)) - { - resetStringInfo(&buf); - for (k = bms_next_member(arrow_state->referenced, -1); - k >= 0; - k = bms_next_member(arrow_state->referenced, k)) - { - Form_pg_attribute attr; - - j = k + FirstLowInvalidHeapAttributeNumber - 1; - if (j < 0 || j >= tupdesc->natts) - continue; - attr = TupleDescAttr(tupdesc, j); - snprintf(label, sizeof(label), " %s", NameStr(attr->attname)); - ExplainPropertyText(label, format_bytesz(chunk_sz[j]), es); - } - } - pfree(buf.data); -} - -static void -ArrowExplainForeignScan(ForeignScanState *node, ExplainState *es) -{ - Relation frel = node->ss.ss_currentRelation; - List *dcontext; - - dcontext = set_deparse_context_plan(es->deparse_cxt, - node->ss.ps.plan, - NULL); - pgstromArrowFdwExplain(node->fdw_state, frel, es, dcontext); -} - -/* - * ArrowAnalyzeForeignTable - */ -static int -RecordBatchAcquireSampleRows(Relation relation, - RecordBatchState *rb_state, - HeapTuple *rows, - int nsamples) -{ - TupleDesc tupdesc = RelationGetDescr(relation); - kern_data_store *kds; - Bitmapset *referenced = NULL; - StringInfoData buffer; - Datum *values; - bool *isnull; - int count; - uint32_t index; - - /* ANALYZE needs to fetch all the attributes */ - referenced = bms_make_singleton(-FirstLowInvalidHeapAttributeNumber); - initStringInfo(&buffer); - kds = arrowFdwFillupRecordBatch(relation, - referenced, - rb_state, - &buffer); - values = alloca(sizeof(Datum) * tupdesc->natts); - isnull = alloca(sizeof(bool) * tupdesc->natts); - for (count = 0; count < nsamples; count++) - { - /* fetch a row randomly */ - index = (double)kds->nitems * drand48(); - Assert(index < kds->nitems); - - for (int j=0; j < kds->ncols; j++) - { - kern_colmeta *cmeta = &kds->colmeta[j]; - - pg_datum_arrow_ref(kds, - cmeta, - index, - values + j, - isnull + j); - } - rows[count] = heap_form_tuple(tupdesc, values, isnull); - } - pfree(buffer.data); - - return count; -} - -static int -ArrowAcquireSampleRows(Relation relation, - int elevel, - HeapTuple *rows, - int nrooms, - double *p_totalrows, - double *p_totaldeadrows) -{ - ForeignTable *ft = GetForeignTable(RelationGetRelid(relation)); - List *filesList = arrowFdwExtractFilesList(ft->options, NULL); - List *rb_state_list = NIL; - ListCell *lc1, *lc2; - int64 total_nrows = 0; - int64 count_nrows = 0; - int nsamples_min = nrooms / 100; - int nitems = 0; - - foreach (lc1, filesList) - { - ArrowFileState *af_state; - char *fname = strVal(lfirst(lc1)); - - af_state = BuildArrowFileState(relation, fname, NULL); - if (!af_state) - continue; - foreach (lc2, af_state->rb_list) - { - RecordBatchState *rb_state = lfirst(lc2); - - if (rb_state->rb_nitems == 0) - continue; 
/* not reasonable to sample, skipped */ - total_nrows += rb_state->rb_nitems; - rb_state_list = lappend(rb_state_list, rb_state); - } - } - nrooms = Min(nrooms, total_nrows); - - /* fetch samples for each record-batch */ - foreach (lc1, rb_state_list) - { - RecordBatchState *rb_state = lfirst(lc1); - int nsamples; - - count_nrows += rb_state->rb_nitems; - nsamples = (double)nrooms * ((double)count_nrows / - (double)total_nrows) - nitems; - if (nitems + nsamples > nrooms) - nsamples = nrooms - nitems; - if (nsamples > nsamples_min) - nitems += RecordBatchAcquireSampleRows(relation, - rb_state, - rows + nitems, - nsamples); - } - *p_totalrows = total_nrows; - *p_totaldeadrows = 0.0; - - return nitems; -} - -/* - * ArrowAnalyzeForeignTable - */ -static bool -ArrowAnalyzeForeignTable(Relation frel, - AcquireSampleRowsFunc *p_sample_rows_func, - BlockNumber *p_totalpages) -{ - ForeignTable *ft = GetForeignTable(RelationGetRelid(frel)); - List *filesList = arrowFdwExtractFilesList(ft->options, NULL); - ListCell *lc; - size_t totalpages = 0; - - foreach (lc, filesList) - { - const char *fname = strVal(lfirst(lc)); - struct stat stat_buf; - - if (stat(fname, &stat_buf) != 0) - { - elog(NOTICE, "failed on stat('%s') on behalf of '%s', skipped", - fname, get_rel_name(ft->relid)); - continue; - } - totalpages += (stat_buf.st_size + BLCKSZ - 1) / BLCKSZ; - } - if (totalpages > MaxBlockNumber) - totalpages = MaxBlockNumber; - - *p_sample_rows_func = ArrowAcquireSampleRows; - *p_totalpages = totalpages; - - return true; -} - -/* - * ArrowImportForeignSchema - */ -static List * -ArrowImportForeignSchema(ImportForeignSchemaStmt *stmt, Oid serverOid) -{ - ArrowSchema schema; - List *filesList; - ListCell *lc; - int j; - StringInfoData cmd; - - /* sanity checks */ - switch (stmt->list_type) - { - case FDW_IMPORT_SCHEMA_ALL: - break; - case FDW_IMPORT_SCHEMA_LIMIT_TO: - elog(ERROR, "arrow_fdw does not support LIMIT TO clause"); - break; - case FDW_IMPORT_SCHEMA_EXCEPT: - elog(ERROR, "arrow_fdw does not support EXCEPT clause"); - break; - default: - elog(ERROR, "arrow_fdw: Bug? 
unknown list-type"); - break; - } - filesList = arrowFdwExtractFilesList(stmt->options, NULL); - if (filesList == NIL) - ereport(ERROR, - (errmsg("No valid apache arrow files are specified"), - errhint("Use 'file' or 'dir' option to specify apache arrow files on behalf of the foreign table"))); - - /* read the schema */ - memset(&schema, 0, sizeof(ArrowSchema)); - foreach (lc, filesList) - { - ArrowFileInfo af_info; - const char *fname = strVal(lfirst(lc)); - - readArrowFile(fname, &af_info, false); - if (lc == list_head(filesList)) - { - copyArrowNode(&schema.node, &af_info.footer.schema.node); - } - else - { - /* compatibility checks */ - ArrowSchema *stemp = &af_info.footer.schema; - - if (schema.endianness != stemp->endianness || - schema._num_fields != stemp->_num_fields) - elog(ERROR, "file '%s' has incompatible schema definition", fname); - for (j=0; j < schema._num_fields; j++) - { - if (!arrowFieldTypeIsEqual(&schema.fields[j], - &stemp->fields[j])) - elog(ERROR, "file '%s' has incompatible schema definition", fname); - } - } - } - - /* makes a command to define foreign table */ - initStringInfo(&cmd); - appendStringInfo(&cmd, "CREATE FOREIGN TABLE %s (\n", - quote_identifier(stmt->remote_schema)); - for (j=0; j < schema._num_fields; j++) - { - ArrowField *field = &schema.fields[j]; - Oid type_oid; - int32 type_mod; - char *schema; - HeapTuple htup; - Form_pg_type __type; - - __arrowFieldTypeToPGType(field, &type_oid, &type_mod, NULL); - if (!OidIsValid(type_oid)) - elog(ERROR, "unable to map Arrow type on any PG type"); - htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type_oid)); - if (!HeapTupleIsValid(htup)) - elog(ERROR, "cache lookup failed for type %u", type_oid); - __type = (Form_pg_type) GETSTRUCT(htup); - schema = get_namespace_name(__type->typnamespace); - if (!schema) - elog(ERROR, "cache lookup failed for schema %u", __type->typnamespace); - if (j > 0) - appendStringInfo(&cmd, ",\n"); - if (type_mod < 0) - { - appendStringInfo(&cmd, " %s %s.%s", - quote_identifier(field->name), - quote_identifier(schema), - NameStr(__type->typname)); - } - else - { - Assert(type_mod >= VARHDRSZ); - appendStringInfo(&cmd, " %s %s.%s(%d)", - quote_identifier(field->name), - quote_identifier(schema), - NameStr(__type->typname), - type_mod - VARHDRSZ); - } - ReleaseSysCache(htup); - } - appendStringInfo(&cmd, - "\n" - ") SERVER %s\n" - " OPTIONS (", stmt->server_name); - foreach (lc, stmt->options) - { - DefElem *defel = lfirst(lc); - - if (lc != list_head(stmt->options)) - appendStringInfo(&cmd, ",\n "); - appendStringInfo(&cmd, "%s '%s'", - defel->defname, - strVal(defel->arg)); - } - appendStringInfo(&cmd, ")"); - - return list_make1(cmd.data); -} - -/* - * pgstrom_arrow_fdw_import_file - * - * NOTE: Due to historical reason, PostgreSQL does not allow to define - * columns more than MaxHeapAttributeNumber (1600) for foreign-tables also, - * not only heap-tables. This restriction comes from NULL-bitmap length - * in HeapTupleHeaderData and width of t_hoff. - * However, it is not a reasonable restriction for foreign-table, because - * it does not use heap-format internally. 
- */
-static void
-__insertPgAttributeTuple(Relation pg_attr_rel,
-                         CatalogIndexState pg_attr_index,
-                         Oid ftable_oid,
-                         AttrNumber attnum,
-                         ArrowField *field)
-{
-    Oid         type_oid;
-    int32       type_mod;
-    int16       type_len;
-    bool        type_byval;
-    char        type_align;
-    int32       type_ndims;
-    char        type_storage;
-    Datum       values[Natts_pg_attribute];
-    bool        isnull[Natts_pg_attribute];
-    HeapTuple   tup;
-    ObjectAddress myself, referenced;
-
-    __arrowFieldTypeToPGType(field, &type_oid, &type_mod, NULL);
-    get_typlenbyvalalign(type_oid,
-                         &type_len,
-                         &type_byval,
-                         &type_align);
-    type_ndims = (type_is_array(type_oid) ? 1 : 0);
-    type_storage = get_typstorage(type_oid);
-
-    memset(values, 0, sizeof(values));
-    memset(isnull, 0, sizeof(isnull));
-
-    values[Anum_pg_attribute_attrelid - 1] = ObjectIdGetDatum(ftable_oid);
-    values[Anum_pg_attribute_attname - 1] = CStringGetDatum(field->name);
-    values[Anum_pg_attribute_atttypid - 1] = ObjectIdGetDatum(type_oid);
-    values[Anum_pg_attribute_attstattarget - 1] = Int32GetDatum(-1);
-    values[Anum_pg_attribute_attlen - 1] = Int16GetDatum(type_len);
-    values[Anum_pg_attribute_attnum - 1] = Int16GetDatum(attnum);
-    values[Anum_pg_attribute_attndims - 1] = Int32GetDatum(type_ndims);
-    values[Anum_pg_attribute_attcacheoff - 1] = Int32GetDatum(-1);
-    values[Anum_pg_attribute_atttypmod - 1] = Int32GetDatum(type_mod);
-    values[Anum_pg_attribute_attbyval - 1] = BoolGetDatum(type_byval);
-    values[Anum_pg_attribute_attstorage - 1] = CharGetDatum(type_storage);
-    values[Anum_pg_attribute_attalign - 1] = CharGetDatum(type_align);
-    values[Anum_pg_attribute_attnotnull - 1] = BoolGetDatum(!field->nullable);
-    values[Anum_pg_attribute_attislocal - 1] = BoolGetDatum(true);
-    isnull[Anum_pg_attribute_attacl - 1] = true;
-    isnull[Anum_pg_attribute_attoptions - 1] = true;
-    isnull[Anum_pg_attribute_attfdwoptions - 1] = true;
-    isnull[Anum_pg_attribute_attmissingval - 1] = true;
-
-    tup = heap_form_tuple(RelationGetDescr(pg_attr_rel), values, isnull);
-    CatalogTupleInsertWithInfo(pg_attr_rel, tup, pg_attr_index);
-
-    /* add dependency */
-    myself.classId = RelationRelationId;
-    myself.objectId = ftable_oid;
-    myself.objectSubId = attnum;
-    referenced.classId = TypeRelationId;
-    referenced.objectId = type_oid;
-    referenced.objectSubId = 0;
-    recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
-
-    heap_freetuple(tup);
-}
-
-Datum
-pgstrom_arrow_fdw_import_file(PG_FUNCTION_ARGS)
-{
-    CreateForeignTableStmt stmt;
-    ArrowSchema schema;
-    List       *tableElts = NIL;
-    char       *ftable_name;
-    char       *file_name;
-    char       *namespace_name;
-    DefElem    *defel;
-    int         j, nfields;
-    Oid         ftable_oid;
-    ObjectAddress myself;
-    ArrowFileInfo af_info;
-
-    /* read schema of the file */
-    if (PG_ARGISNULL(0))
-        elog(ERROR, "foreign table name is not supplied");
-    ftable_name = text_to_cstring(PG_GETARG_TEXT_PP(0));
-
-    if (PG_ARGISNULL(1))
-        elog(ERROR, "arrow filename is not supplied");
-    file_name = text_to_cstring(PG_GETARG_TEXT_PP(1));
-    defel = makeDefElem("file", (Node *)makeString(file_name), -1);
-
-    if (PG_ARGISNULL(2))
-        namespace_name = NULL;
-    else
-        namespace_name = text_to_cstring(PG_GETARG_TEXT_PP(2));
-
-    readArrowFile(file_name, &af_info, false);
-    copyArrowNode(&schema.node, &af_info.footer.schema.node);
-    if (schema._num_fields > SHRT_MAX)
-        Elog("Arrow file '%s' has too many fields: %d",
-             file_name, schema._num_fields);
-
-    /* setup CreateForeignTableStmt */
-    memset(&stmt, 0, sizeof(CreateForeignTableStmt));
-    NodeSetTag(&stmt, T_CreateForeignTableStmt);
-    stmt.base.relation = 
makeRangeVar(namespace_name, ftable_name, -1); - - nfields = Min(schema._num_fields, 100); - for (j=0; j < nfields; j++) - { - ColumnDef *cdef; - Oid type_oid; - int32_t type_mod; - - __arrowFieldTypeToPGType(&schema.fields[j], - &type_oid, - &type_mod, - NULL); - cdef = makeColumnDef(schema.fields[j].name, - type_oid, - type_mod, - InvalidOid); - tableElts = lappend(tableElts, cdef); - } - stmt.base.tableElts = tableElts; - stmt.base.oncommit = ONCOMMIT_NOOP; - stmt.servername = "arrow_fdw"; - stmt.options = list_make1(defel); - - myself = DefineRelation(&stmt.base, - RELKIND_FOREIGN_TABLE, - InvalidOid, - NULL, - __FUNCTION__); - ftable_oid = myself.objectId; - CreateForeignTable(&stmt, ftable_oid); - - if (nfields < schema._num_fields) - { - Relation c_rel = table_open(RelationRelationId, RowExclusiveLock); - Relation a_rel = table_open(AttributeRelationId, RowExclusiveLock); - CatalogIndexState c_index = CatalogOpenIndexes(c_rel); - CatalogIndexState a_index = CatalogOpenIndexes(a_rel); - HeapTuple tup; - - tup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(ftable_oid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for relation %u", ftable_oid); - - for (j=nfields; j < schema._num_fields; j++) - { - __insertPgAttributeTuple(a_rel, - a_index, - ftable_oid, - j+1, - &schema.fields[j]); - } - /* update relnatts also */ - ((Form_pg_class) GETSTRUCT(tup))->relnatts = schema._num_fields; - CatalogTupleUpdate(c_rel, &tup->t_self, tup); - - CatalogCloseIndexes(a_index); - CatalogCloseIndexes(c_index); - table_close(a_rel, RowExclusiveLock); - table_close(c_rel, RowExclusiveLock); - - CommandCounterIncrement(); - } - PG_RETURN_VOID(); -} - -/* - * handler of Arrow_Fdw - */ -Datum -pgstrom_arrow_fdw_handler(PG_FUNCTION_ARGS) -{ - PG_RETURN_POINTER(&pgstrom_arrow_fdw_routine); -} - -/* - * validator of Arrow_Fdw - */ -Datum -pgstrom_arrow_fdw_validator(PG_FUNCTION_ARGS) -{ - List *options = untransformRelOptions(PG_GETARG_DATUM(0)); - Oid catalog = PG_GETARG_OID(1); - - if (catalog == ForeignTableRelationId) - { - List *filesList = arrowFdwExtractFilesList(options, NULL); - ListCell *lc; - - foreach (lc, filesList) - { - const char *fname = strVal(lfirst(lc)); - ArrowFileInfo af_info; - - readArrowFile(fname, &af_info, true); - } - } - else if (options != NIL) - { - const char *label; - - switch (catalog) - { - case ForeignDataWrapperRelationId: - label = "FOREIGN DATA WRAPPER"; - break; - case ForeignServerRelationId: - label = "SERVER"; - break; - case UserMappingRelationId: - label = "USER MAPPING"; - break; - case AttributeRelationId: - label = "attribute of FOREIGN TABLE"; - break; - default: - label = "????"; - break; - } - elog(ERROR, "Arrow_Fdw does not support any options for %s", label); - } - PG_RETURN_VOID(); -} - -/* - * pgstrom_arrow_fdw_precheck_schema - */ -Datum -pgstrom_arrow_fdw_precheck_schema(PG_FUNCTION_ARGS) -{ - EventTriggerData *trigdata; - Relation frel = NULL; - ListCell *lc; - bool check_schema_compatibility = false; - - if (!CALLED_AS_EVENT_TRIGGER(fcinfo)) - elog(ERROR, "%s: must be called as EventTrigger", __FUNCTION__); - trigdata = (EventTriggerData *) fcinfo->context; - if (strcmp(trigdata->event, "ddl_command_end") != 0) - elog(ERROR, "%s: must be called on ddl_command_end event", __FUNCTION__); - - if (strcmp(GetCommandTagName(trigdata->tag), - "CREATE FOREIGN TABLE") == 0) - { - CreateStmt *stmt = (CreateStmt *)trigdata->parsetree; - - frel = relation_openrv_extended(stmt->relation, NoLock, true); - if (frel && RelationIsArrowFdw(frel)) - 
check_schema_compatibility = true; - } - else if (strcmp(GetCommandTagName(trigdata->tag), - "ALTER FOREIGN TABLE") == 0 && - IsA(trigdata->parsetree, AlterTableStmt)) - { - AlterTableStmt *stmt = (AlterTableStmt *)trigdata->parsetree; - - frel = relation_openrv_extended(stmt->relation, NoLock, true); - if (frel && RelationIsArrowFdw(frel)) - { - foreach (lc, stmt->cmds) - { - AlterTableCmd *cmd = lfirst(lc); - - if (cmd->subtype == AT_AddColumn || - cmd->subtype == AT_DropColumn || - cmd->subtype == AT_AlterColumnType) - { - check_schema_compatibility = true; - break; - } - } - } - } - - if (check_schema_compatibility) - { - ForeignTable *ft = GetForeignTable(RelationGetRelid(frel)); - List *filesList = arrowFdwExtractFilesList(ft->options, NULL); - - foreach (lc, filesList) - { - const char *fname = strVal(lfirst(lc)); - - (void)BuildArrowFileState(frel, fname, NULL); - } - } - if (frel) - relation_close(frel, NoLock); - PG_RETURN_NULL(); -} - -/* - * pgstrom_request_arrow_fdw - */ -static void -pgstrom_request_arrow_fdw(void) -{ - size_t sz; - - if (shmem_request_next) - shmem_request_next(); - sz = TYPEALIGN(ARROW_METADATA_BLOCKSZ, - (size_t)arrow_metadata_cache_size_kb << 10); - RequestAddinShmemSpace(MAXALIGN(sizeof(arrowMetadataCacheHead)) + sz); -} - -/* - * pgstrom_startup_arrow_fdw - */ -static void -pgstrom_startup_arrow_fdw(void) -{ - bool found; - size_t sz; - char *buffer; - int i, n; - - if (shmem_startup_next) - (*shmem_startup_next)(); - - arrow_metadata_cache = ShmemInitStruct("arrowMetadataCache(head)", - MAXALIGN(sizeof(arrowMetadataCacheHead)), - &found); - Assert(!found); - - LWLockInitialize(&arrow_metadata_cache->mutex, LWLockNewTrancheId()); - SpinLockInit(&arrow_metadata_cache->lru_lock); - dlist_init(&arrow_metadata_cache->lru_list); - dlist_init(&arrow_metadata_cache->free_blocks); - dlist_init(&arrow_metadata_cache->free_mcaches); - dlist_init(&arrow_metadata_cache->free_fcaches); - for (i=0; i < ARROW_METADATA_HASH_NSLOTS; i++) - dlist_init(&arrow_metadata_cache->hash_slots[i]); - - /* slab allocator */ - sz = TYPEALIGN(ARROW_METADATA_BLOCKSZ, - (size_t)arrow_metadata_cache_size_kb << 10); - n = sz / ARROW_METADATA_BLOCKSZ; - buffer = ShmemInitStruct("arrowMetadataCache(body)", sz, &found); - Assert(!found); - for (i=0; i < n; i++) - { - arrowMetadataCacheBlock *mc_block = (arrowMetadataCacheBlock *)buffer; - - memset(mc_block, 0, offsetof(arrowMetadataCacheBlock, data)); - dlist_push_tail(&arrow_metadata_cache->free_blocks, &mc_block->chain); - - buffer += ARROW_METADATA_BLOCKSZ; - } -} - -/* - * pgstrom_init_arrow_fdw - */ -void -pgstrom_init_arrow_fdw(void) -{ - FdwRoutine *r = &pgstrom_arrow_fdw_routine; - - memset(r, 0, sizeof(FdwRoutine)); - NodeSetTag(r, T_FdwRoutine); - /* SCAN support */ - r->GetForeignRelSize = ArrowGetForeignRelSize; - r->GetForeignPaths = ArrowGetForeignPaths; - r->GetForeignPlan = ArrowGetForeignPlan; - r->BeginForeignScan = ArrowBeginForeignScan; - r->IterateForeignScan = ArrowIterateForeignScan; - r->ReScanForeignScan = ArrowReScanForeignScan; - r->EndForeignScan = ArrowEndForeignScan; - /* EXPLAIN support */ - r->ExplainForeignScan = ArrowExplainForeignScan; - /* ANALYZE support */ - r->AnalyzeForeignTable = ArrowAnalyzeForeignTable; - /* CPU Parallel support */ - r->IsForeignScanParallelSafe = ArrowIsForeignScanParallelSafe; - r->EstimateDSMForeignScan = ArrowEstimateDSMForeignScan; - r->InitializeDSMForeignScan = ArrowInitializeDSMForeignScan; - //r->ReInitializeDSMForeignScan = ArrowReInitializeDSMForeignScan; - 
r->InitializeWorkerForeignScan = ArrowInitializeWorkerForeignScan; - r->ShutdownForeignScan = ArrowShutdownForeignScan; - /* IMPORT FOREIGN SCHEMA support */ - r->ImportForeignSchema = ArrowImportForeignSchema; - - /* - * Turn on/off arrow_fdw - */ - DefineCustomBoolVariable("arrow_fdw.enabled", - "Enables the planner's use of Arrow_Fdw", - NULL, - &arrow_fdw_enabled, - true, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* - * Turn on/off min/max statistics hint - */ - DefineCustomBoolVariable("arrow_fdw.stats_hint_enabled", - "Enables min/max statistics hint, if any", - NULL, - &arrow_fdw_stats_hint_enabled, - true, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* - * Configurations for arrow_fdw metadata cache - */ - DefineCustomIntVariable("arrow_fdw.metadata_cache_size", - "size of shared metadata cache for arrow files", - NULL, - &arrow_metadata_cache_size_kb, - 512 * 1024, /* 512MB */ - 32 * 1024, /* 32MB */ - INT_MAX, - PGC_POSTMASTER, - GUC_NOT_IN_SAMPLE | GUC_UNIT_KB, - NULL, NULL, NULL); - /* shared memory size */ - shmem_request_next = shmem_request_hook; - shmem_request_hook = pgstrom_request_arrow_fdw; - shmem_startup_next = shmem_startup_hook; - shmem_startup_hook = pgstrom_startup_arrow_fdw; -} - - - - - - - - diff --git a/next/codegen.c b/next/codegen.c deleted file mode 100644 index 426e4f282..000000000 --- a/next/codegen.c +++ /dev/null @@ -1,2881 +0,0 @@ -/* - * codegen.c - * - * Routines for xPU code generator - * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. - */ -#include "pg_strom.h" - -/* -------- static variables --------*/ -#define DEVTYPE_INFO_NSLOTS 128 -#define DEVFUNC_INFO_NSLOTS 1024 -static MemoryContext devinfo_memcxt = NULL; -static List *devtype_info_slot[DEVTYPE_INFO_NSLOTS]; -static List *devtype_code_slot[DEVTYPE_INFO_NSLOTS]; /* by TypeOpCode */ -static List *devfunc_info_slot[DEVFUNC_INFO_NSLOTS]; -static List *devfunc_code_slot[DEVFUNC_INFO_NSLOTS]; /* by FuncOpCode */ - -#define TYPE_OPCODE(NAME,OID,EXTENSION,FLAGS) \ - static uint32_t devtype_##NAME##_hash(bool isnull, Datum value); -#include "xpu_opcodes.h" - -#define TYPE_OPCODE(NAME,OID,EXTENSION,FLAGS) \ - { EXTENSION, #NAME, TypeOpCode__##NAME, \ - DEVKIND__ANY | (FLAGS), \ - devtype_##NAME##_hash, \ - sizeof(xpu_##NAME##_t), \ - __alignof__(xpu_##NAME##_t), InvalidOid}, -static struct { - const char *type_extension; - const char *type_name; - TypeOpCode type_code; - uint32_t type_flags; - devtype_hashfunc_f type_hashfunc; - int type_sizeof; - int type_alignof; - Oid type_alias; -} devtype_catalog[] = { -#include "xpu_opcodes.h" - /* alias device data types */ - {NULL, "varchar", TypeOpCode__text, DEVKIND__ANY, - devtype_text_hash, sizeof(xpu_text_t), TEXTOID}, - {NULL, "cidr", TypeOpCode__inet, DEVKIND__ANY, - devtype_inet_hash, sizeof(xpu_inet_t), INETOID}, - {NULL, NULL, TypeOpCode__Invalid, 0, NULL, 0, InvalidOid} -}; - -static const char * -get_extension_name_by_object(Oid class_id, Oid object_id) -{ - Oid ext_oid = getExtensionOfObject(class_id, object_id); - - if (OidIsValid(ext_oid)) - return get_extension_name(ext_oid); - return NULL; -} - -static devtype_info * -build_basic_devtype_info(TypeCacheEntry *tcache, const char *ext_name) -{ - devtype_info *dtype = NULL; - HeapTuple htup; - Form_pg_type __type; - char type_name[NAMEDATALEN+1]; - Oid type_namespace; - int i; - - htup = 
SearchSysCache1(TYPEOID, ObjectIdGetDatum(tcache->type_id)); - if (!HeapTupleIsValid(htup)) - elog(ERROR, "cache lookup failed for type %u", tcache->type_id); - __type = (Form_pg_type) GETSTRUCT(htup); - strcpy(type_name, NameStr(__type->typname)); - type_namespace = __type->typnamespace; - ReleaseSysCache(htup); - /* built-in types must be in pg_catalog */ - if (!ext_name && type_namespace != PG_CATALOG_NAMESPACE) - return NULL; - for (i=0; devtype_catalog[i].type_name != NULL; i++) - { - const char *__ext_name = devtype_catalog[i].type_extension; - const char *__type_name = devtype_catalog[i].type_name; - - if ((ext_name - ? (__ext_name && strcmp(ext_name, __ext_name) == 0) - : (__ext_name == NULL)) && - strcmp(type_name, __type_name) == 0) - { - MemoryContext oldcxt; - Oid __type_alias = devtype_catalog[i].type_alias; - - /* check feasibility of type alias */ - if (OidIsValid(__type_alias)) - { - char castmethod; - - htup = SearchSysCache2(CASTSOURCETARGET, - ObjectIdGetDatum(tcache->type_id), - ObjectIdGetDatum(__type_alias)); - if (!HeapTupleIsValid(htup)) - elog(ERROR, "binary type cast %s to %s is not defined", - format_type_be(tcache->type_id), - format_type_be(__type_alias)); - castmethod = ((Form_pg_cast)GETSTRUCT(htup))->castmethod; - if (castmethod != COERCION_METHOD_BINARY) - elog(ERROR, "type cast %s to %s is not binary compatible (%c)", - format_type_be(tcache->type_id), - format_type_be(__type_alias), castmethod); - ReleaseSysCache(htup); - /* use type name of the alias */ - htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(__type_alias)); - if (!HeapTupleIsValid(htup)) - elog(ERROR, "cache lookup failed for type %u", __type_alias); - __type = (Form_pg_type) GETSTRUCT(htup); - strcpy(type_name, NameStr(__type->typname)); - ReleaseSysCache(htup); - } - oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - dtype = palloc0(offsetof(devtype_info, comp_subtypes[0])); - if (ext_name) - dtype->type_extension = pstrdup(ext_name); - dtype->type_code = devtype_catalog[i].type_code; - dtype->type_oid = tcache->type_id; - dtype->type_flags = devtype_catalog[i].type_flags; - dtype->type_length = tcache->typlen; - dtype->type_align = typealign_get_width(tcache->typalign); - dtype->type_byval = tcache->typbyval; - dtype->type_name = pstrdup(type_name); - dtype->type_extension = (ext_name ? 
pstrdup(ext_name) : NULL); - dtype->type_sizeof = devtype_catalog[i].type_sizeof; - dtype->type_alignof = devtype_catalog[i].type_alignof; - dtype->type_hashfunc = devtype_catalog[i].type_hashfunc; - /* type equality functions */ - dtype->type_eqfunc = get_opcode(tcache->eq_opr); - dtype->type_cmpfunc = tcache->cmp_proc; - MemoryContextSwitchTo(oldcxt); - - return dtype; - } - } - return NULL; /* not found */ -} - -static devtype_info * -build_composite_devtype_info(TypeCacheEntry *tcache, const char *ext_name) -{ - TupleDesc tupdesc = lookup_rowtype_tupdesc(tcache->type_id, -1); - devtype_info **subtypes = alloca(sizeof(devtype_info *) * tupdesc->natts); - devtype_info *dtype; - MemoryContext oldcxt; - uint32_t extra_flags = DEVKIND__ANY; - int j; - - for (j=0; j < tupdesc->natts; j++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, j); - - dtype = pgstrom_devtype_lookup(attr->atttypid); - if (!dtype) - { - ReleaseTupleDesc(tupdesc); - return NULL; - } - extra_flags &= dtype->type_flags; - subtypes[j] = dtype; - } - ReleaseTupleDesc(tupdesc); - - oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - dtype = palloc0(offsetof(devtype_info, - comp_subtypes[tupdesc->natts])); - if (ext_name) - dtype->type_extension = pstrdup(ext_name); - dtype->type_code = TypeOpCode__composite; - dtype->type_oid = tcache->type_id; - dtype->type_flags = extra_flags | DEVTYPE__USE_KVARS_SLOTBUF; - dtype->type_length = tcache->typlen; - dtype->type_align = typealign_get_width(tcache->typalign); - dtype->type_byval = tcache->typbyval; - dtype->type_name = "composite"; - dtype->type_sizeof = sizeof(xpu_composite_t); - dtype->type_alignof = __alignof__(xpu_composite_t); - dtype->type_hashfunc = NULL; //devtype_composite_hash; - dtype->type_eqfunc = get_opcode(tcache->eq_opr); - dtype->type_cmpfunc = tcache->cmp_proc; - dtype->comp_nfields = tupdesc->natts; - memcpy(dtype->comp_subtypes, subtypes, - sizeof(devtype_info *) * tupdesc->natts); - MemoryContextSwitchTo(oldcxt); - - return dtype; -} - -static devtype_info * -build_array_devtype_info(TypeCacheEntry *tcache, const char *ext_name) -{ - devtype_info *elem; - devtype_info *dtype; - MemoryContext oldcxt; - - elem = pgstrom_devtype_lookup(tcache->typelem); - if (!elem) - return NULL; - - oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - dtype = palloc0(offsetof(devtype_info, comp_subtypes[0])); - if (ext_name) - dtype->type_extension = pstrdup(ext_name); - dtype->type_code = TypeOpCode__array; - dtype->type_oid = tcache->type_id; - dtype->type_flags = elem->type_flags | DEVTYPE__USE_KVARS_SLOTBUF; - dtype->type_length = tcache->typlen; - dtype->type_align = typealign_get_width(tcache->typalign); - dtype->type_byval = tcache->typbyval; - dtype->type_name = "array"; - dtype->type_sizeof = sizeof(xpu_array_t); - dtype->type_alignof = __alignof__(xpu_array_t); - dtype->type_hashfunc = NULL; //devtype_array_hash; - /* type equality functions */ - dtype->type_eqfunc = get_opcode(tcache->eq_opr); - dtype->type_cmpfunc = tcache->cmp_proc; - - MemoryContextSwitchTo(oldcxt); - - return dtype; -} - -devtype_info * -pgstrom_devtype_lookup(Oid type_oid) -{ - devtype_info *dtype; - Datum hash; - uint32_t index; - ListCell *lc; - const char *ext_name; - TypeCacheEntry *tcache; - - hash = hash_any((unsigned char *)&type_oid, sizeof(Oid)); - index = hash % DEVTYPE_INFO_NSLOTS; - foreach (lc, devtype_info_slot[index]) - { - dtype = lfirst(lc); - - if (dtype->type_oid == type_oid) - { - Assert(dtype->hash == hash); - goto found; - } - } - /* try to build devtype_info entry */ - 
ext_name = get_extension_name_by_object(TypeRelationId, type_oid); - tcache = lookup_type_cache(type_oid, - TYPECACHE_EQ_OPR | - TYPECACHE_CMP_PROC); - /* if domain, move to the base type */ - while (tcache->nextDomain) - tcache = tcache->nextDomain; - - if (OidIsValid(tcache->typelem) && tcache->typlen == -1) - { - /* array type */ - dtype = build_array_devtype_info(tcache, ext_name); - } - else if (tcache->typtype == TYPTYPE_COMPOSITE) - { - /* composite type */ - if (!OidIsValid(tcache->typrelid)) - elog(ERROR, "Bug? wrong composite definition at %s", - format_type_be(type_oid)); - dtype = build_composite_devtype_info(tcache, ext_name); - } - else if (tcache->typtype == TYPTYPE_BASE || - tcache->typtype == TYPTYPE_RANGE) - { - /* base or range type */ - dtype = build_basic_devtype_info(tcache, ext_name); - } - else - { - /* not a supported type */ - dtype = NULL; - } - - /* make a negative entry, if not device executable */ - if (!dtype) - { - dtype = MemoryContextAllocZero(devinfo_memcxt, - sizeof(devtype_info)); - dtype->type_is_negative = true; - } - dtype->type_oid = type_oid; - dtype->hash = hash; - devtype_info_slot[index] = lappend_cxt(devinfo_memcxt, - devtype_info_slot[index], dtype); - if (!dtype->type_is_negative) - { - hash = hash_any((unsigned char *)&dtype->type_code, sizeof(TypeOpCode)); - index = hash % DEVTYPE_INFO_NSLOTS; - devtype_code_slot[index] = lappend_cxt(devinfo_memcxt, - devtype_code_slot[index], dtype); - } -found: - if (dtype->type_is_negative) - return NULL; - return dtype; -} - -/* - * devtype_lookup_by_opcode - */ -static devtype_info * -devtype_lookup_by_opcode(TypeOpCode type_code) -{ - Datum hash; - uint32_t index; - ListCell *lc; - - hash = hash_any((unsigned char *)&type_code, sizeof(TypeOpCode)); - index = hash % DEVTYPE_INFO_NSLOTS; - foreach (lc, devtype_code_slot[index]) - { - devtype_info *dtype = lfirst(lc); - - if (dtype->type_code == type_code) - return dtype; - } - return NULL; -} - -/* - * Built-in device type hash functions - */ -static uint32_t -devtype_bool_hash(bool isnull, Datum value) -{ - bool bval; - - if (isnull) - return 0; - bval = DatumGetBool(value) ? 
true : false; - return hash_any((unsigned char *)&bval, sizeof(bool)); -} - -static inline uint32_t -__devtype_simple_hash(bool isnull, Datum value, int sz) -{ - if (isnull) - return 0; - return hash_any((unsigned char *)&value, sz); -} - -static uint32_t -devtype_int1_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(int8_t)); -} - -static uint32_t -devtype_int2_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(int16_t)); -} - -static uint32_t -devtype_int4_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(int32_t)); -} - -static uint32_t -devtype_int8_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(int64_t)); -} - -static uint32_t -devtype_float2_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(float2_t)); -} - -static uint32_t -devtype_float4_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(float4_t)); -} - -static uint32_t -devtype_float8_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(float8_t)); -} - -static uint32_t -devtype_numeric_hash(bool isnull, Datum value) -{ - uint32_t len; - - if (isnull) - return 0; - len = VARSIZE_ANY_EXHDR(value); - if (len >= sizeof(uint16_t)) - { - NumericChoice *nc = (NumericChoice *)VARDATA_ANY(value); - NumericDigit *digits = NUMERIC_DIGITS(nc, nc->n_header); - int weight = NUMERIC_WEIGHT(nc, nc->n_header) + 1; - int i, ndigits = NUMERIC_NDIGITS(nc->n_header, len); - int128_t value = 0; - - for (i=0; i < ndigits; i++) - { - NumericDigit dig = digits[i]; - - value = value * PG_NBASE + dig; - if (value < 0) - elog(ERROR, "numeric value is out of range"); - } - if (NUMERIC_SIGN(nc->n_header) == NUMERIC_NEG) - value = -value; - weight = PG_DEC_DIGITS * (ndigits - weight); - /* see, set_normalized_numeric */ - if (value == 0) - weight = 0; - else - { - while (value % 10 == 0) - { - value /= 10; - weight--; - } - } - return (hash_any((unsigned char *)&weight, sizeof(int16_t)) ^ - hash_any((unsigned char *)&value, sizeof(int128_t))); - } - elog(ERROR, "corrupted numeric header"); -} - -static uint32_t -devtype_bytea_hash(bool isnull, Datum value) -{ - if (isnull) - return 0; - return hash_any((unsigned char *)VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value)); -} - -static uint32_t -devtype_text_hash(bool isnull, Datum value) -{ - if (isnull) - return 0; - return hash_any((unsigned char *)VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value)); -} - -static uint32_t -devtype_bpchar_hash(bool isnull, Datum value) -{ - if (!isnull) - { - char *s = VARDATA_ANY(value); - int sz = VARSIZE_ANY_EXHDR(value); - - sz = bpchartruelen(s, sz); - return hash_any((unsigned char *)s, sz); - } - return 0; -} - -static uint32_t -devtype_date_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(DateADT)); -} - -static uint32_t -devtype_time_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(TimeADT)); -} - -static uint32_t -devtype_timetz_hash(bool isnull, Datum value) -{ - if (!isnull) - { - TimeTzADT *tmtz = DatumGetTimeTzADTP(value); - - return (hash_any((unsigned char *)&tmtz->time, sizeof(TimeADT)) ^ - hash_any((unsigned char *)&tmtz->zone, sizeof(int32_t))); - } - return 0; -} - -static uint32_t -devtype_timestamp_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(Timestamp)); -} - -static uint32_t 
-devtype_timestamptz_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(TimestampTz)); -} - -static uint32_t -devtype_interval_hash(bool isnull, Datum value) -{ - if (!isnull) - { - Interval *iv = DatumGetIntervalP(value); - - return hash_any((unsigned char *)iv, sizeof(Interval)); - } - return 0; -} - -static uint32_t -devtype_money_hash(bool isnull, Datum value) -{ - return __devtype_simple_hash(isnull, value, sizeof(int64_t)); -} - -static uint32_t -devtype_uuid_hash(bool isnull, Datum value) -{ - if (!isnull) - { - pg_uuid_t *uuid = DatumGetUUIDP(value); - - return hash_any(uuid->data, UUID_LEN); - } - return 0; -} - -static uint32_t -devtype_macaddr_hash(bool isnull, Datum value) -{ - if (!isnull) - { - macaddr *maddr = DatumGetMacaddrP(value); - - return hash_any((unsigned char *)maddr, sizeof(macaddr)); - } - return 0; -} - -static uint32_t -devtype_inet_hash(bool isnull, Datum value) -{ - if (!isnull) - { - inet *in = DatumGetInetP(value); - int sz; - - if (in->inet_data.family == PGSQL_AF_INET) - sz = offsetof(inet_struct, ipaddr[4]); - else if (in->inet_data.family == PGSQL_AF_INET6) - sz = offsetof(inet_struct, ipaddr[16]); - else - elog(ERROR, "corrupted inet data"); - return hash_any((unsigned char *)&in->inet_data, sz); - } - return 0; -} - -/* - * Built-in device functions/operators - */ -#define FUNC_OPCODE(SQLNAME,FN_ARGS,FN_FLAGS,DEVNAME,FUNC_COST,EXTENSION) \ - { #SQLNAME, #FN_ARGS, FN_FLAGS, FuncOpCode__##DEVNAME, FUNC_COST, EXTENSION }, -static struct { - const char *func_name; - const char *func_args; - uint32_t func_flags; - FuncOpCode func_code; - int func_cost; - const char *func_extension; -} devfunc_catalog[] = { -#include "xpu_opcodes.h" - {NULL,NULL,0,FuncOpCode__Invalid,0,NULL} -}; - -static devfunc_info * -pgstrom_devfunc_build(Oid func_oid, int func_nargs, Oid *func_argtypes) -{ - const char *fextension; - const char *fname; - Oid fnamespace; - Oid frettype; - StringInfoData buf; - devfunc_info *dfunc = NULL; - devtype_info *dtype_rettype; - devtype_info **dtype_argtypes; - MemoryContext oldcxt; - int i, j, sz; - - initStringInfo(&buf); - fname = get_func_name(func_oid); - if (!fname) - elog(ERROR, "cache lookup failed on procedure '%u'", func_oid); - fnamespace = get_func_namespace(func_oid); - frettype = get_func_rettype(func_oid); - dtype_rettype = pgstrom_devtype_lookup(frettype); - if (!dtype_rettype) - goto bailout; - dtype_argtypes = alloca(sizeof(devtype_info *) * func_nargs); - for (j=0; j < func_nargs; j++) - { - dtype_argtypes[j] = pgstrom_devtype_lookup(func_argtypes[j]); - if (!dtype_argtypes[j]) - goto bailout; - } - /* we expect built-in functions are in pg_catalog namespace */ - fextension = get_extension_name_by_object(ProcedureRelationId, func_oid); - if (!fextension && fnamespace != PG_CATALOG_NAMESPACE) - goto bailout; - - for (i=0; devfunc_catalog[i].func_name != NULL; i++) - { - const char *__extension = devfunc_catalog[i].func_extension; - const char *__name = devfunc_catalog[i].func_name; - char *tok, *saveptr; - - if (fextension != NULL - ? 
(__extension == NULL || strcmp(fextension, __extension) != 0) - : (__extension != NULL)) - continue; - if (strcmp(fname, __name) != 0) - continue; - - resetStringInfo(&buf); - appendStringInfoString(&buf, devfunc_catalog[i].func_args); - for (tok = strtok_r(buf.data, "/", &saveptr), j=0; - tok != NULL && j < func_nargs; - tok = strtok_r(NULL, "/", &saveptr), j++) - { - devtype_info *dtype = dtype_argtypes[j]; - - tok = __trim(tok); - sz = strlen(tok); - if (sz > 4 && - tok[0] == '_' && tok[1] == '_' && - tok[sz-1] == '_' && tok[sz-2] == '_') - { - /* __TYPE__ means variable length argument! */ - tok[sz-1] = '\0'; - if (strcmp(tok+2, dtype->type_name) != 0) - break; - /* must be the last argument set */ - tok = strtok_r(NULL, "/", &saveptr); - if (tok) - break; - /* check whether the following arguments are identical */ - while (j < func_nargs) - { - if (dtype->type_oid != func_argtypes[j]) - break; - j++; - } - } - else - { - if (strcmp(tok, dtype->type_name) != 0) - break; - } - } - - /* Ok, found an entry */ - if (!tok && j == func_nargs) - { - oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - dfunc = palloc0(offsetof(devfunc_info, - func_argtypes[func_nargs])); - dfunc->func_code = devfunc_catalog[i].func_code; - if (fextension) - dfunc->func_extension = pstrdup(fextension); - dfunc->func_name = pstrdup(fname); - dfunc->func_oid = func_oid; - dfunc->func_rettype = dtype_rettype; - dfunc->func_flags = devfunc_catalog[i].func_flags; - dfunc->func_cost = devfunc_catalog[i].func_cost; - dfunc->func_nargs = func_nargs; - memcpy(dfunc->func_argtypes, dtype_argtypes, - sizeof(devtype_info *) * func_nargs); - MemoryContextSwitchTo(oldcxt); - break; - } - } -bailout: - pfree(buf.data); - return dfunc; -} - -typedef struct { - Oid func_oid; - int func_nargs; - Oid func_argtypes[1]; -} devfunc_cache_signature; - -static devfunc_info * -__pgstrom_devfunc_lookup(Oid func_oid, - int func_nargs, - Oid *func_argtypes, - Oid func_collid) -{ - devfunc_cache_signature *signature; - devtype_info *dtype = NULL; - devfunc_info *dfunc = NULL; - ListCell *lc; - uint32_t hash; - int i, j, sz; - - sz = offsetof(devfunc_cache_signature, func_argtypes[func_nargs]); - signature = alloca(sz); - memset(signature, 0, sz); - signature->func_oid = func_oid; - signature->func_nargs = func_nargs; - for (i=0; i < func_nargs; i++) - signature->func_argtypes[i] = func_argtypes[i]; - hash = hash_any((unsigned char *)signature, sz); - - i = hash % DEVFUNC_INFO_NSLOTS; - foreach (lc, devfunc_info_slot[i]) - { - dfunc = lfirst(lc); - if (dfunc->hash == hash && - dfunc->func_oid == func_oid && - dfunc->func_nargs == func_nargs) - { - for (j=0; j < func_nargs; j++) - { - dtype = dfunc->func_argtypes[j]; - if (dtype->type_oid != func_argtypes[j]) - break; - } - if (j == func_nargs) - goto found; - } - } - /* not found, build a new entry */ - dfunc = pgstrom_devfunc_build(func_oid, func_nargs, func_argtypes); - if (!dfunc) - { - MemoryContext oldcxt; - - oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - dfunc = palloc0(offsetof(devfunc_info, func_argtypes[func_nargs])); - dfunc->func_oid = func_oid; - dfunc->func_nargs = func_nargs; - dfunc->func_is_negative = true; - for (i=0; i < func_nargs; i++) - { - dtype = pgstrom_devtype_lookup(func_argtypes[i]); - if (!dtype) - { - dtype = palloc0(sizeof(devtype_info)); - dtype->type_oid = func_argtypes[i]; - dtype->type_is_negative = true; - } - dfunc->func_argtypes[i] = dtype; - } - MemoryContextSwitchTo(oldcxt); - } - dfunc->hash = hash; - devfunc_info_slot[i] = 
lappend_cxt(devinfo_memcxt, - devfunc_info_slot[i], dfunc); - if (!dfunc->func_is_negative) - { - hash = hash_any((unsigned char *)&dfunc->func_code, sizeof(FuncOpCode)); - i = hash % DEVFUNC_INFO_NSLOTS; - devfunc_code_slot[i] = lappend_cxt(devinfo_memcxt, - devfunc_code_slot[i], dfunc); - } -found: - if (dfunc->func_is_negative) - return NULL; - if (OidIsValid(func_collid) && !lc_collate_is_c(func_collid) && - (dfunc->func_flags & DEVFUNC__LOCALE_AWARE) != 0) - return NULL; - return dfunc; -} - -devfunc_info * -pgstrom_devfunc_lookup(Oid func_oid, - List *func_args, - Oid func_collid) -{ - int i, nargs = list_length(func_args); - Oid *argtypes; - ListCell *lc; - - i = 0; - argtypes = alloca(sizeof(Oid) * nargs); - foreach (lc, func_args) - { - Node *node = lfirst(lc); - - argtypes[i++] = exprType(node); - } - return __pgstrom_devfunc_lookup(func_oid, nargs, argtypes, func_collid); -} - -static devfunc_info * -devfunc_lookup_by_opcode(FuncOpCode func_code) -{ - Datum hash; - uint32_t index; - ListCell *lc; - - hash = hash_any((unsigned char *)&func_code, sizeof(FuncOpCode)); - index = hash % DEVFUNC_INFO_NSLOTS; - foreach (lc, devfunc_code_slot[index]) - { - devfunc_info *dfunc = lfirst(lc); - - if (dfunc->func_code == func_code) - return dfunc; - } - return NULL; -} - -/* - * lookup special purpose devfuncs - */ -devfunc_info * -devtype_lookup_equal_func(devtype_info *dtype, Oid coll_id) -{ - if (OidIsValid(dtype->type_eqfunc)) - { - Oid argtypes[2]; - - argtypes[0] = dtype->type_oid; - argtypes[1] = dtype->type_oid; - return __pgstrom_devfunc_lookup(dtype->type_eqfunc, 2, argtypes, coll_id); - } - return NULL; -} - -devfunc_info * -devtype_lookup_compare_func(devtype_info *dtype, Oid coll_id) -{ - if (OidIsValid(dtype->type_cmpfunc)) - { - Oid argtypes[2]; - - argtypes[0] = dtype->type_oid; - argtypes[1] = dtype->type_oid; - return __pgstrom_devfunc_lookup(dtype->type_cmpfunc, 2, argtypes, coll_id); - } - return NULL; -} - -/* ---------------------------------------------------------------- - * - * xPU pseudo code generator - * - * ---------------------------------------------------------------- - */ -#define __Elog(fmt,...) 
\ - do { \ - ereport(context->elevel, \ - (errcode(ERRCODE_INTERNAL_ERROR), \ - errmsg("(%s:%d) " fmt, __FUNCTION__, __LINE__, \ - ##__VA_ARGS__), \ - errdetail("problematic expression: %s", \ - nodeToString(context->top_expr)))); \ - return -1; \ - } while(0) - -static int codegen_expression_walker(codegen_context *context, - StringInfo buf, Expr *expr); - -void -codegen_context_init(codegen_context *context, uint32_t task_kind) -{ - memset(context, 0, sizeof(codegen_context)); - context->elevel = ERROR; - context->required_flags = (task_kind & DEVKIND__ANY); -} - -static void -__appendKernExpMagicAndLength(StringInfo buf, int head_pos) -{ - static uint64_t __zero = 0; - const kern_expression *kexp; - int padding = (INTALIGN(buf->len) - buf->len); - uint32_t magic; - - if (padding > 0) - appendBinaryStringInfo(buf, (char *)&__zero, padding); - kexp = (const kern_expression *)(buf->data + head_pos); - magic = (KERN_EXPRESSION_MAGIC - ^ ((uint32_t)kexp->exptype << 6) - ^ ((uint32_t)kexp->opcode << 14)); - appendBinaryStringInfo(buf, (char *)&magic, sizeof(uint32_t)); - ((kern_expression *)(buf->data + head_pos))->len = buf->len - head_pos; -} - -static int -codegen_const_expression(codegen_context *context, - StringInfo buf, Const *con) -{ - devtype_info *dtype; - char typtype; - - typtype = get_typtype(con->consttype); - if (typtype != TYPTYPE_BASE && - typtype != TYPTYPE_ENUM && - typtype != TYPTYPE_RANGE && - typtype != TYPTYPE_DOMAIN) - __Elog("unable to use type %s in Const expression (class: %c)", - format_type_be(con->consttype), typtype); - - dtype = pgstrom_devtype_lookup(con->consttype); - if (!dtype) - __Elog("type %s is not device supported", - format_type_be(con->consttype)); - if (buf) - { - kern_expression *kexp; - int pos, sz = 0; - - sz = offsetof(kern_expression, u.c.const_value); - if (!con->constisnull) - { - if (con->constbyval) - sz += con->constlen; - else if (con->constlen == -1) - sz += VARSIZE_ANY(con->constvalue); - else - elog(ERROR, "unsupported type length: %d", con->constlen); - } - kexp = alloca(sz); - memset(kexp, 0, sz); - kexp->exptype = dtype->type_code; - kexp->expflags = context->kexp_flags; - kexp->opcode = FuncOpCode__ConstExpr; - kexp->u.c.const_type = con->consttype; - kexp->u.c.const_isnull = con->constisnull; - if (!con->constisnull) - { - if (con->constbyval) - memcpy(kexp->u.c.const_value, - &con->constvalue, - con->constlen); - else - memcpy(kexp->u.c.const_value, - DatumGetPointer(con->constvalue), - VARSIZE_ANY(con->constvalue)); - } - pos = __appendBinaryStringInfo(buf, kexp, sz); - __appendKernExpMagicAndLength(buf, pos); - } - return 0; -} - -static int -codegen_param_expression(codegen_context *context, - StringInfo buf, Param *param) -{ - kern_expression kexp; - devtype_info *dtype; - char typtype; - int pos; - - if (param->paramkind != PARAM_EXTERN) - __Elog("Only PARAM_EXTERN is supported on device: %d", - (int)param->paramkind); - - typtype = get_typtype(param->paramtype); - if (typtype != TYPTYPE_BASE && - typtype != TYPTYPE_ENUM && - typtype != TYPTYPE_RANGE && - typtype != TYPTYPE_DOMAIN) - __Elog("unable to use type %s in Param expression (class: %c)", - format_type_be(param->paramtype), typtype); - - dtype = pgstrom_devtype_lookup(param->paramtype); - if (!dtype) - __Elog("type %s is not device supported", - format_type_be(param->paramtype)); - if (buf) - { - memset(&kexp, 0, sizeof(kexp)); - kexp.opcode = FuncOpCode__ParamExpr; - kexp.exptype = dtype->type_code; - kexp.expflags = context->kexp_flags; - kexp.u.p.param_id = 
param->paramid; - pos = __appendBinaryStringInfo(buf, &kexp, - SizeOfKernExprParam); - __appendKernExpMagicAndLength(buf, pos); - } - context->used_params = list_append_unique(context->used_params, param); - - return 0; -} - -static int -codegen_var_expression(codegen_context *context, - StringInfo buf, - Expr *expr, - int kvar_slot_id) -{ - Oid type_oid = exprType((Node *)expr); - devtype_info *dtype; - - dtype = pgstrom_devtype_lookup(type_oid); - if (!dtype) - __Elog("type %s is not device supported", format_type_be(type_oid)); - - if (buf) - { - kern_expression kexp; - int pos; - - memset(&kexp, 0, sizeof(kexp)); - kexp.exptype = dtype->type_code; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__VarExpr; - kexp.u.v.var_typlen = dtype->type_length; - kexp.u.v.var_typbyval = dtype->type_byval; - kexp.u.v.var_typalign = dtype->type_align; - kexp.u.v.var_slot_id = kvar_slot_id; - pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExprVar); - __appendKernExpMagicAndLength(buf, pos); - } - return 0; -} - -static int -__codegen_func_expression(codegen_context *context, - StringInfo buf, - Oid func_oid, - List *func_args, - Oid func_collid) -{ - devfunc_info *dfunc; - devtype_info *dtype; - kern_expression kexp; - int pos = -1; - ListCell *lc; - - dfunc = pgstrom_devfunc_lookup(func_oid, func_args, func_collid); - if (!dfunc || - (dfunc->func_flags & context->required_flags) != context->required_flags) - __Elog("function %s is not supported on the target device", - format_procedure(func_oid)); - dtype = dfunc->func_rettype; - context->device_cost += dfunc->func_cost; - - memset(&kexp, 0, sizeof(kexp)); - kexp.exptype = dtype->type_code; - kexp.expflags = context->kexp_flags; - kexp.opcode = dfunc->func_code; - kexp.nr_args = list_length(func_args); - kexp.args_offset = SizeOfKernExpr(0); - if (buf) - pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExpr(0)); - foreach (lc, func_args) - { - Expr *arg = lfirst(lc); - - if (codegen_expression_walker(context, buf, arg) < 0) - return -1; - } - if (buf) - __appendKernExpMagicAndLength(buf, pos); - return 0; -} - -static int -codegen_func_expression(codegen_context *context, - StringInfo buf, FuncExpr *func) -{ - return __codegen_func_expression(context, - buf, - func->funcid, - func->args, - func->funccollid); -} - -static int -codegen_oper_expression(codegen_context *context, - StringInfo buf, OpExpr *oper) -{ - return __codegen_func_expression(context, - buf, - get_opcode(oper->opno), - oper->args, - oper->opcollid); -} - -static int -codegen_bool_expression(codegen_context *context, - StringInfo buf, BoolExpr *b) -{ - kern_expression kexp; - int pos = -1; - ListCell *lc; - - memset(&kexp, 0, sizeof(kexp)); - switch (b->boolop) - { - case AND_EXPR: - kexp.opcode = FuncOpCode__BoolExpr_And; - kexp.nr_args = list_length(b->args); - if (kexp.nr_args < 2) - __Elog("BoolExpr(AND) must have 2 or more arguments"); - break; - case OR_EXPR: - kexp.opcode = FuncOpCode__BoolExpr_Or; - kexp.nr_args = list_length(b->args); - if (kexp.nr_args < 2) - __Elog("BoolExpr(OR) must have 2 or more arguments"); - break; - case NOT_EXPR: - kexp.opcode = FuncOpCode__BoolExpr_Not; - kexp.nr_args = list_length(b->args); - if (kexp.nr_args != 1) - __Elog("BoolExpr(NOT) must have exactly one argument"); - break; - default: - __Elog("BoolExpr has unknown bool operation (%d)", (int)b->boolop); - } - kexp.exptype = TypeOpCode__bool; - kexp.expflags = context->kexp_flags; - kexp.args_offset = SizeOfKernExpr(0); - if (buf) - pos = 
__appendBinaryStringInfo(buf, &kexp, SizeOfKernExpr(0)); - foreach (lc, b->args) - { - Expr *arg = lfirst(lc); - - if (codegen_expression_walker(context, buf, arg) < 0) - return -1; - } - if (buf) - __appendKernExpMagicAndLength(buf, pos); - return 0; -} - -static int -codegen_nulltest_expression(codegen_context *context, - StringInfo buf, NullTest *nt) -{ - kern_expression kexp; - int pos = -1; - - memset(&kexp, 0, sizeof(kexp)); - switch (nt->nulltesttype) - { - case IS_NULL: - kexp.opcode = FuncOpCode__NullTestExpr_IsNull; - break; - case IS_NOT_NULL: - kexp.opcode = FuncOpCode__NullTestExpr_IsNotNull; - break; - default: - __Elog("NullTest has unknown NullTestType (%d)", (int)nt->nulltesttype); - } - kexp.exptype = TypeOpCode__bool; - kexp.expflags = context->kexp_flags; - kexp.nr_args = 1; - kexp.args_offset = SizeOfKernExpr(0); - if (buf) - pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExpr(0)); - if (codegen_expression_walker(context, buf, nt->arg) < 0) - return -1; - if (buf) - __appendKernExpMagicAndLength(buf, pos); - return 0; -} - -static int -codegen_booleantest_expression(codegen_context *context, - StringInfo buf, BooleanTest *bt) -{ - kern_expression kexp; - int pos = -1; - - memset(&kexp, 0, sizeof(kexp)); - switch (bt->booltesttype) - { - case IS_TRUE: - kexp.opcode = FuncOpCode__BoolTestExpr_IsTrue; - break; - case IS_NOT_TRUE: - kexp.opcode = FuncOpCode__BoolTestExpr_IsNotTrue; - break; - case IS_FALSE: - kexp.opcode = FuncOpCode__BoolTestExpr_IsFalse; - break; - case IS_NOT_FALSE: - kexp.opcode = FuncOpCode__BoolTestExpr_IsNotFalse; - break; - case IS_UNKNOWN: - kexp.opcode = FuncOpCode__BoolTestExpr_IsUnknown; - break; - case IS_NOT_UNKNOWN: - kexp.opcode = FuncOpCode__BoolTestExpr_IsNotUnknown; - break; - default: - __Elog("BooleanTest has unknown BoolTestType (%d)", - (int)bt->booltesttype); - } - kexp.exptype = TypeOpCode__bool; - kexp.expflags = context->kexp_flags; - kexp.nr_args = 1; - kexp.args_offset = SizeOfKernExpr(0); - if (buf) - pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExpr(0)); - if (codegen_expression_walker(context, buf, bt->arg) < 0) - return -1; - if (buf) - __appendKernExpMagicAndLength(buf, pos); - return 0; -} - -/* - * is_expression_equals_tlist - * - * It checks whether the supplied expression exactly matches any entry of - * the target-list. If found, it returns its depth and resno. - */ -static int -is_expression_equals_tlist(codegen_context *context, Expr *expr) -{ - ListCell *lc1, *lc2; - int depth = 0; - int resno; - int slot_id; - devtype_info *dtype = NULL; - - foreach (lc1, context->input_rels_tlist) - { - Node *node = lfirst(lc1); - - if (IsA(node, Integer)) - { - Index varno = intVal(node); - Var *var = (Var *)expr; - - if (IsA(var, Var) && var->varno == varno) - { - resno = var->varattno; - dtype = pgstrom_devtype_lookup(var->vartype); - goto found; - } - } - else if (IsA(node, PathTarget)) - { - PathTarget *reltarget = (PathTarget *)node; - - resno = 1; - foreach (lc2, reltarget->exprs) - { - if (equal(expr, lfirst(lc2))) - { - dtype = pgstrom_devtype_lookup(exprType((Node *)expr)); - goto found; - } - resno++; - } - } - else - { - elog(ERROR, "Bug? 
unexpected input_rels_tlist"); - } - depth++; - } - return -1; /* not found */ - -found: - slot_id = 0; - forboth (lc1, context->kvars_depth, - lc2, context->kvars_resno) - { - if (depth == lfirst_int(lc1) && - resno == lfirst_int(lc2)) - { - return slot_id; - } - slot_id++; - } - context->kvars_depth = lappend_int(context->kvars_depth, depth); - context->kvars_resno = lappend_int(context->kvars_resno, resno); - if (dtype && (dtype->type_flags & DEVTYPE__USE_KVARS_SLOTBUF) != 0) - context->kvars_types = lappend_oid(context->kvars_types, dtype->type_oid); - else - context->kvars_types = lappend_oid(context->kvars_types, InvalidOid); - context->kvars_exprs = lappend(context->kvars_exprs, expr); - - return slot_id; -} - -static int -codegen_expression_walker(codegen_context *context, - StringInfo buf, Expr *expr) -{ - int slot_id; - - if (!expr) - return 0; - /* check simple var references */ - slot_id = is_expression_equals_tlist(context, expr); - if (slot_id >= 0) - return codegen_var_expression(context, buf, expr, slot_id); - - switch (nodeTag(expr)) - { - case T_Const: - return codegen_const_expression(context, buf, (Const *)expr); - case T_Param: - return codegen_param_expression(context, buf, (Param *)expr); - case T_FuncExpr: - return codegen_func_expression(context, buf, (FuncExpr *)expr); - case T_OpExpr: - case T_DistinctExpr: - return codegen_oper_expression(context, buf, (OpExpr *)expr); - case T_BoolExpr: - return codegen_bool_expression(context, buf, (BoolExpr *)expr); - case T_NullTest: - return codegen_nulltest_expression(context, buf, (NullTest *)expr); - case T_BooleanTest: - return codegen_booleantest_expression(context, buf, (BooleanTest *)expr); - case T_CoalesceExpr: - case T_MinMaxExpr: - case T_RelabelType: - case T_CoerceViaIO: - case T_CoerceToDomain: - case T_CaseExpr: - case T_CaseTestExpr: - case T_ScalarArrayOpExpr: - default: - __Elog("not a supported expression type: %s", nodeToString(expr)); - } - return -1; -} -#undef __Elog - -/* - * codegen_build_loadvars - */ -static int -kern_vars_defitem_comp(const void *__a, const void *__b) -{ - const kern_vars_defitem *a = __a; - const kern_vars_defitem *b = __b; - - if (a->var_resno < b->var_resno) - return -1; - if (a->var_resno > b->var_resno) - return 1; - return 0; -} - -static kern_expression * -__codegen_build_loadvars_one(codegen_context *context, int depth) -{ - kern_expression kexp; - StringInfoData buf; - int slot_id = 0; - int nloads = 0; - int nslots = list_length(context->kvars_depth); - uint32_t kvars_offset; - ListCell *lc1, *lc2, *lc3; - - initStringInfo(&buf); - buf.len = offsetof(kern_expression, u.load.kvars); - kvars_offset = (sizeof(kern_variable) * nslots + - sizeof(int) * nslots); - forthree (lc1, context->kvars_depth, - lc2, context->kvars_resno, - lc3, context->kvars_types) - { - kern_vars_defitem vitem; - int __depth = lfirst_int(lc1); - int __resno = lfirst_int(lc2); - Oid __type_oid = lfirst_oid(lc3); - - vitem.var_resno = __resno; - vitem.var_slot_id = slot_id++; - if (__depth == depth) - { - if (!OidIsValid(__type_oid)) - vitem.var_slot_off = 0; - else - { - devtype_info *dtype = pgstrom_devtype_lookup(__type_oid); - - Assert(dtype != NULL); - kvars_offset = TYPEALIGN(dtype->type_alignof, kvars_offset); - vitem.var_slot_off = kvars_offset; - kvars_offset += dtype->type_sizeof; - } - appendBinaryStringInfo(&buf, (char *)&vitem, - sizeof(kern_vars_defitem)); - nloads++; - } - } - if (nloads == 0) - { - pfree(buf.data); - return NULL; - } - qsort(buf.data + offsetof(kern_expression, 
u.load.kvars), - nloads, - sizeof(kern_vars_defitem), - kern_vars_defitem_comp); - - memset(&kexp, 0, sizeof(kexp)); - kexp.exptype = TypeOpCode__int4; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__LoadVars; - kexp.args_offset = MAXALIGN(offsetof(kern_expression, - u.load.kvars[nloads])); - kexp.u.load.depth = depth; - kexp.u.load.nloads = nloads; - memcpy(buf.data, &kexp, offsetof(kern_expression, u.load.kvars)); - __appendKernExpMagicAndLength(&buf, 0); - - return (kern_expression *)buf.data; -} - -bytea * -codegen_build_scan_loadvars(codegen_context *context) -{ - kern_expression *kexp = __codegen_build_loadvars_one(context, 0); - char *xpucode = NULL; - - if (kexp) - { - xpucode = palloc(VARHDRSZ + kexp->len); - memcpy(xpucode + VARHDRSZ, kexp, kexp->len); - SET_VARSIZE(xpucode, VARHDRSZ + kexp->len); - } - return (bytea *)xpucode; -} - -bytea * -codegen_build_join_loadvars(codegen_context *context) -{ - kern_expression *kexp; - StringInfoData buf; - int max_depth = -1; - uint32_t sz; - char *result = NULL; - ListCell *lc; - - foreach (lc, context->kvars_depth) - { - int depth = lfirst_int(lc); - - if (depth >= 0) - max_depth = Max(max_depth, depth); - } - if (max_depth < 1) - return NULL; - sz = MAXALIGN(offsetof(kern_expression, u.pack.offset[max_depth+1])); - kexp = alloca(sz); - memset(kexp, 0, sz); - kexp->exptype = TypeOpCode__int4; - kexp->expflags = context->kexp_flags; - kexp->opcode = FuncOpCode__Packed; - kexp->args_offset = sz; - kexp->u.pack.npacked = max_depth; - - initStringInfo(&buf); - buf.len = sz; - for (int i=0; i < max_depth; i++) - { - kern_expression *karg = __codegen_build_loadvars_one(context, i+1); - - if (karg) - { - kexp->u.pack.offset[i] - = __appendBinaryStringInfo(&buf, karg, karg->len); - kexp->nr_args++; - pfree(karg); - } - } - - if (kexp->nr_args > 0) - { - memcpy(buf.data, kexp, sz); - __appendKernExpMagicAndLength(&buf, 0); - result = palloc(VARHDRSZ + buf.len); - memcpy(result + VARHDRSZ, buf.data, buf.len); - SET_VARSIZE(result, VARHDRSZ + buf.len); - } - pfree(buf.data); - return (bytea *)result; -} - -/* - * codegen_build_scan_quals - */ -bytea * -codegen_build_scan_quals(codegen_context *context, List *dev_quals) -{ - StringInfoData buf; - Expr *expr; - char *result = NULL; - - Assert(context->elevel >= ERROR); - if (dev_quals == NIL) - return NULL; - if (list_length(dev_quals) == 1) - expr = linitial(dev_quals); - else - expr = make_andclause(dev_quals); - - initStringInfo(&buf); - if (codegen_expression_walker(context, &buf, expr) == 0) - { - result = palloc(VARHDRSZ + buf.len); - memcpy(result + VARHDRSZ, buf.data, buf.len); - SET_VARSIZE(result, VARHDRSZ+buf.len); - } - pfree(buf.data); - - return (bytea *)result; -} - -/* - * __try_inject_projection_expression - */ -static int -__try_inject_projection_expression(codegen_context *context, - StringInfo buf, - Expr *expr, - bool write_kexp_if_exists, - bool *p_inject_new) -{ - ListCell *lc1, *lc2; - int slot_id; - - /* - * When 'expr' is simple Var-reference on the input relations, - * we don't need to inject expression node here. - */ - slot_id = is_expression_equals_tlist(context, expr); - if (slot_id >= 0) - { - if (write_kexp_if_exists) - codegen_var_expression(context, buf, expr, slot_id); - *p_inject_new = false; - return slot_id; - } - - /* - * Try to find out the expression which already has kvars-slot. - * If exists, we can reuse it. 
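- * (A negative depth in kvars_depth means the value is not loaded from an - * input relation: depth -1 marks a device-computed value stored into its - * kvars-slot by a SaveExpr node, whose resno indexes tlist_dev; depth -2 - * is used for grouping keys loaded back from the kds_final buffer.)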
- */ - slot_id = 0; - forboth (lc1, context->kvars_depth, - lc2, context->kvars_resno) - { - int depth = lfirst_int(lc1); - int resno = lfirst_int(lc2); - - if (depth < 0 && - resno > 0 && - resno <= list_length(context->tlist_dev)) - { - TargetEntry *tle = list_nth(context->tlist_dev, resno-1); - - if (equal(tle->expr, expr)) - { - if (write_kexp_if_exists) - codegen_var_expression(context, buf, expr, slot_id); - *p_inject_new = false; - return slot_id; - } - } - slot_id++; - } - - /* - * Try to assign a new kvars-slot, if 'expr' exists on tlist_dev. - */ - foreach (lc1, context->tlist_dev) - { - TargetEntry *tle = lfirst(lc1); - - if (equal(tle->expr, expr)) - { - kern_expression kexp; - devtype_info *dtype; - Oid type_oid; - int pos; - - slot_id = list_length(context->kvars_depth); - context->kvars_depth = lappend_int(context->kvars_depth, -1); - context->kvars_resno = lappend_int(context->kvars_resno, tle->resno); - context->kvars_types = lappend_oid(context->kvars_types, InvalidOid); - - type_oid = exprType((Node *)expr); - dtype = pgstrom_devtype_lookup(type_oid); - if (!dtype) - elog(ERROR, "type %s is not device supported", - format_type_be(type_oid)); - memset(&kexp, 0, sizeof(kexp)); - kexp.exptype = dtype->type_code; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__SaveExpr; - kexp.nr_args = 1; - kexp.args_offset = MAXALIGN(offsetof(kern_expression, - u.save.data)); - kexp.u.save.slot_id = slot_id; - pos = __appendBinaryStringInfo(buf, &kexp, kexp.args_offset); - codegen_expression_walker(context, buf, expr); - __appendKernExpMagicAndLength(buf, pos); - - *p_inject_new = true; - - return slot_id; - } - } - return -1; /* not found */ -} - -/* - * codegen_build_projection - */ -bytea * -codegen_build_projection(codegen_context *context) -{ - kern_expression *kexp; - StringInfoData arg; - StringInfoData buf; - bool meet_resjunk = false; - ListCell *lc; - int nexprs = 0; - int nattrs = 0; - int n, sz, pos; - char *result; - - n = list_length(context->tlist_dev); - sz = MAXALIGN(offsetof(kern_expression, u.pagg.desc[n])); - kexp = alloca(sz); - memset(kexp, 0, sz); - - initStringInfo(&arg); - foreach (lc, context->tlist_dev) - { - TargetEntry *tle = lfirst(lc); - int slot_id; - bool inject_new; - Oid type_oid; - devtype_info *dtype; - kern_projection_desc *desc; - - if (tle->resjunk) - { - meet_resjunk = true; - continue; - } - else if (meet_resjunk) - elog(ERROR, "Bug? a valid TLE after junk TLEs"); - - slot_id = __try_inject_projection_expression(context, - &arg, - tle->expr, - false, - &inject_new); - if (slot_id < 0) - elog(ERROR, "Bug? 
expression is missing on tlist_dev: %s", - nodeToString(tle->expr)); - if (inject_new) - nexprs++; - - type_oid = exprType((Node *)tle->expr); - dtype = pgstrom_devtype_lookup(type_oid); - if (!dtype) - elog(ERROR, "type %s is not device supported", - format_type_be(type_oid)); - - desc = &kexp->u.proj.desc[nattrs++]; - desc->slot_id = slot_id; - } - kexp->exptype = TypeOpCode__int4; - kexp->expflags = context->kexp_flags; - kexp->opcode = FuncOpCode__Projection; - kexp->nr_args = nexprs; - kexp->args_offset = MAXALIGN(offsetof(kern_expression, - u.proj.desc[nattrs])); - kexp->u.proj.nattrs = nattrs; - initStringInfo(&buf); - pos = __appendBinaryStringInfo(&buf, kexp, kexp->args_offset); - if (nexprs > 0) - __appendBinaryStringInfo(&buf, arg.data, arg.len); - __appendKernExpMagicAndLength(&buf, pos); - - result = palloc(VARHDRSZ + buf.len); - memcpy(result + VARHDRSZ, buf.data, buf.len); - SET_VARSIZE(result, VARHDRSZ + buf.len); - - pfree(arg.data); - pfree(buf.data); - - return (bytea *)result; -} - -/* - * __codegen_build_joinquals - */ -static kern_expression * -__codegen_build_joinquals(codegen_context *context, - List *join_quals, - List *other_quals) -{ - StringInfoData buf; - kern_expression kexp; - ListCell *lc; - uint32_t kexp_flags__saved; - - if (join_quals == NIL && other_quals == NIL) - return NULL; - - initStringInfo(&buf); - memset(&kexp, 0, sizeof(kexp)); - kexp.exptype = TypeOpCode__bool; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__JoinQuals; - kexp.nr_args = list_length(join_quals) + list_length(other_quals); - kexp.args_offset = SizeOfKernExpr(0); - __appendBinaryStringInfo(&buf, &kexp, SizeOfKernExpr(0)); - - foreach (lc, join_quals) - { - Expr *qual = lfirst(lc); - - if (exprType((Node *)qual) != BOOLOID) - elog(ERROR, "Bub? JOIN quals must be boolean"); - if (codegen_expression_walker(context, &buf, qual) < 0) - return NULL; - } - - kexp_flags__saved = context->kexp_flags; - context->kexp_flags |= KEXP_FLAG__IS_PUSHED_DOWN; - foreach (lc, other_quals) - { - Expr *qual = lfirst(lc); - - if (exprType((Node *)qual) != BOOLOID) - elog(ERROR, "Bub? 
JOIN quals must be boolean"); - if (codegen_expression_walker(context, &buf, qual) < 0) - return NULL; - } - context->kexp_flags = kexp_flags__saved; - __appendKernExpMagicAndLength(&buf, 0); - - return (kern_expression *)buf.data; -} - -/* - * codegen_build_packed_joinquals - */ -bytea * -codegen_build_packed_joinquals(codegen_context *context, - List *stacked_join_quals, - List *stacked_other_quals) -{ - kern_expression *kexp; - StringInfoData buf; - int i, nrels; - size_t sz; - ListCell *lc1, *lc2; - char *result = NULL; - - nrels = list_length(stacked_join_quals); - sz = MAXALIGN(offsetof(kern_expression, u.pack.offset[nrels])); - kexp = alloca(sz); - memset(kexp, 0, sz); - kexp->exptype = TypeOpCode__int4; - kexp->expflags = context->kexp_flags; - kexp->opcode = FuncOpCode__Packed; - kexp->args_offset = sz; - kexp->u.pack.npacked = nrels; - - initStringInfo(&buf); - buf.len = sz; - - i = 0; - forboth (lc1, stacked_join_quals, - lc2, stacked_other_quals) - { - List *join_quals = lfirst(lc1); - List *other_quals = lfirst(lc2); - kern_expression *karg; - - karg = __codegen_build_joinquals(context, - join_quals, - other_quals); - if (karg) - { - kexp->u.pack.offset[i] - = __appendBinaryStringInfo(&buf, karg, karg->len); - kexp->nr_args++; - pfree(karg); - } - i++; - } - Assert(nrels == i); - - if (kexp->nr_args > 0) - { - memcpy(buf.data, kexp, sz); - __appendKernExpMagicAndLength(&buf, 0); - result = palloc(VARHDRSZ + buf.len); - memcpy(result + VARHDRSZ, buf.data, buf.len); - SET_VARSIZE(result, VARHDRSZ + buf.len); - } - pfree(buf.data); - return (bytea *)result; -} - -/* - * codegen_build_packed_hashkeys - */ -static kern_expression * -__codegen_build_hash_value(codegen_context *context, - List *hash_keys) -{ - kern_expression *kexp; - StringInfoData buf; - size_t sz = MAXALIGN(SizeOfKernExpr(0)); - ListCell *lc; - - if (hash_keys == NIL) - return NULL; - - kexp = alloca(sz); - memset(kexp, 0, sz); - kexp->exptype = TypeOpCode__int4; - kexp->expflags = context->kexp_flags; - kexp->opcode = FuncOpCode__HashValue; - kexp->nr_args = list_length(hash_keys); - kexp->args_offset = sz; - - initStringInfo(&buf); - buf.len = sz; - foreach (lc, hash_keys) - { - Expr *expr = lfirst(lc); - - codegen_expression_walker(context, &buf, expr); - } - memcpy(buf.data, kexp, sz); - __appendKernExpMagicAndLength(&buf, 0); - - return (kern_expression *)buf.data; -} - -bytea * -codegen_build_packed_hashkeys(codegen_context *context, - List *stacked_hash_keys) -{ - kern_expression *kexp; - StringInfoData buf; - int i, nrels; - size_t sz; - ListCell *lc; - char *result = NULL; - - nrels = list_length(stacked_hash_keys); - sz = MAXALIGN(offsetof(kern_expression, u.pack.offset[nrels])); - kexp = alloca(sz); - memset(kexp, 0, sz); - kexp->exptype = TypeOpCode__int4; - kexp->expflags = context->kexp_flags; - kexp->opcode = FuncOpCode__Packed; - kexp->args_offset = sz; - kexp->u.pack.npacked = nrels; - - initStringInfo(&buf); - buf.len = sz; - - i = 0; - foreach (lc, stacked_hash_keys) - { - List *hash_keys = lfirst(lc); - kern_expression *karg; - - karg = __codegen_build_hash_value(context, hash_keys); - if (karg) - { - kexp->u.pack.offset[i] - = __appendBinaryStringInfo(&buf, karg, karg->len); - kexp->nr_args++; - } - i++; - } - Assert(i == nrels); - - if (kexp->nr_args > 0) - { - memcpy(buf.data, kexp, sz); - __appendKernExpMagicAndLength(&buf, 0); - result = palloc(VARHDRSZ + buf.len); - memcpy(result + VARHDRSZ, buf.data, buf.len); - SET_VARSIZE(result, VARHDRSZ + buf.len); - } - pfree(buf.data); - - return 
(bytea *)result; -} - -/* - * __try_inject_groupby_expression - */ -static int -__try_inject_groupby_expression(codegen_context *context, - StringInfo buf, - Expr *expr, - bool *p_found) -{ - int slot_id; - bool found = false; - - slot_id = is_expression_equals_tlist(context, expr); - if (slot_id >= 0) - { - found = true; - } - else - { - ListCell *lc1, *lc2; - - slot_id = 0; - forboth (lc1, context->kvars_depth, - lc2, context->kvars_resno) - { - int depth = lfirst_int(lc1); - int resno = lfirst_int(lc2); - - if (depth >= 0) - continue; - if (resno > 0 && - resno <= list_length(context->tlist_dev)) - { - TargetEntry *tle = list_nth(context->tlist_dev, resno-1); - - if (equal(expr, tle->expr)) - { - found = true; - break; - } - } - slot_id++; - } - - if (!found) - { - TargetEntry *tle; - kern_expression kexp; - StringInfoData temp; - devtype_info *dtype; - Oid type_oid; - - /* inject expression */ - slot_id = list_length(context->kvars_depth); - tle = makeTargetEntry(copyObject(expr), - list_length(context->tlist_dev) + 1, - psprintf("slot_%u", slot_id), - true); - context->kvars_depth = lappend_int(context->kvars_depth, -1); - context->kvars_resno = lappend_int(context->kvars_resno, tle->resno); - context->kvars_types = lappend_oid(context->kvars_types, InvalidOid); - context->tlist_dev = lappend(context->tlist_dev, tle); - - /* SaveExpr */ - type_oid = exprType((Node *)expr); - dtype = pgstrom_devtype_lookup(type_oid); - if (!dtype) - elog(ERROR, "type %s is not device supported", - format_type_be(type_oid)); - - initStringInfo(&temp); - memset(&kexp, 0, sizeof(kexp)); - kexp.exptype = dtype->type_code; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__SaveExpr; - kexp.nr_args = 1; - kexp.args_offset = MAXALIGN(offsetof(kern_expression, - u.save.data)); - kexp.u.save.slot_id = slot_id; - __appendBinaryStringInfo(&temp, &kexp, kexp.args_offset); - codegen_expression_walker(context, &temp, expr); - __appendKernExpMagicAndLength(&temp, 0); - __appendBinaryStringInfo(buf, temp.data, temp.len); - pfree(temp.data); - } - } - *p_found = found; - - return slot_id; -} - -/* - * __codegen_groupby_expression - */ -static int -__codegen_groupby_expression(codegen_context *context, - StringInfo buf, - Expr *expr) -{ - int slot_id; - bool found; - - slot_id = __try_inject_groupby_expression(context, buf, expr, &found); - if (found) - codegen_var_expression(context, buf, expr, slot_id); - return slot_id; -} - -/* - * codegen_build_groupby_keyhash - */ -static List * -codegen_build_groupby_keyhash(codegen_context *context, - pgstromPlanInfo *pp_info) -{ - StringInfoData buf; - List *groupby_keys_input_slot = NIL; - kern_expression kexp; - char *xpucode; - ListCell *cell; - - Assert(pp_info->groupby_keys != NIL); - - /* - * Add variable slots to reference grouping-keys from the input and - * kds_final buffer. 
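- * Each grouping key therefore obtains two slots: an input-side slot, - * hashed by the HashValue expression built here, and a final-side slot - * (depth -2) loaded from kds_final by codegen_build_groupby_keyload(); - * codegen_build_groupby_keycomp() then emits the pairwise equality - * checks between the two.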
- */ - initStringInfo(&buf); - memset(&kexp, 0, sizeof(kexp)); - kexp.exptype = TypeOpCode__int4; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__HashValue; - kexp.nr_args = list_length(pp_info->groupby_keys); - kexp.args_offset = MAXALIGN(SizeOfKernExpr(0)); - __appendBinaryStringInfo(&buf, &kexp, kexp.args_offset); - foreach (cell, pp_info->groupby_keys) - { - Expr *key = lfirst(cell); - int slot_id = __codegen_groupby_expression(context, &buf, key); - - groupby_keys_input_slot = lappend_int(groupby_keys_input_slot, slot_id); - } - __appendKernExpMagicAndLength(&buf, 0); - - xpucode = palloc(VARHDRSZ + buf.len); - memcpy(xpucode + VARHDRSZ, buf.data, buf.len); - SET_VARSIZE(xpucode, VARHDRSZ + buf.len); - pp_info->kexp_groupby_keyhash = (bytea *)xpucode; - - return groupby_keys_input_slot; -} - -/* - * codegen_build_groupby_keyload - */ -static List * -codegen_build_groupby_keyload(codegen_context *context, - pgstromPlanInfo *pp_info) -{ - kern_expression *kexp; - List *groupby_keys_final_slot = NIL; - char *xpucode = NULL; - ListCell *lc1, *lc2; - - Assert(pp_info->groupby_keys != NIL); - - foreach (lc1, pp_info->groupby_keys) - { - Expr *key = lfirst(lc1); - - foreach (lc2, context->tlist_dev) - { - TargetEntry *tle = lfirst(lc2); - int slot_id; - - if (tle->resjunk || !equal(key, tle->expr)) - continue; - slot_id = list_length(context->kvars_depth); - groupby_keys_final_slot = lappend_int(groupby_keys_final_slot, slot_id); - context->kvars_depth = lappend_int(context->kvars_depth, -2); - context->kvars_resno = lappend_int(context->kvars_resno, tle->resno); - context->kvars_types = lappend_oid(context->kvars_types, InvalidOid); - break; - } - if (!lc2) - elog(ERROR, "Bug? group-by key is missing on the tlist_dev"); - } - kexp = __codegen_build_loadvars_one(context, -2); - if (kexp) - { - xpucode = palloc(VARHDRSZ + kexp->len); - memcpy(xpucode + VARHDRSZ, kexp, kexp->len); - SET_VARSIZE(xpucode, VARHDRSZ + kexp->len); - pfree(kexp); - } - pp_info->kexp_groupby_keyload = (bytea *)xpucode; - - return groupby_keys_final_slot; -} - -/* - * codegen_build_groupby_keycomp - */ -static void -codegen_build_groupby_keycomp(codegen_context *context, - pgstromPlanInfo *pp_info, - List *groupby_keys_input_slot, - List *groupby_keys_final_slot) -{ - StringInfoData buf; - kern_expression kexp; - size_t sz; - char *xpucode; - ListCell *lc1, *lc2, *lc3; - - Assert(pp_info->groupby_keys != NIL); - - initStringInfo(&buf); - forthree (lc1, pp_info->groupby_keys, - lc2, groupby_keys_input_slot, - lc3, groupby_keys_final_slot) - { - Expr *key = lfirst(lc1); - int i_slot_id = lfirst_int(lc2); - int f_slot_id = lfirst_int(lc3); - Oid type_oid = exprType((Node *)key); - Oid coll_oid = exprCollation((Node *)key); - int pos, __pos; - devtype_info *dtype; - devfunc_info *dfunc; - - dtype = pgstrom_devtype_lookup(type_oid); - if (!dtype) - elog(ERROR, "type %s is not device supported", - format_type_be(type_oid)); - dfunc = devtype_lookup_equal_func(dtype, coll_oid); - if (!dfunc) - elog(ERROR, "type %s has no device executable equal function", - format_type_be(type_oid)); - Assert(dfunc->func_rettype->type_code == TypeOpCode__bool && - dfunc->func_nargs == 2 && - dfunc->func_argtypes[0]->type_oid == type_oid && - dfunc->func_argtypes[1]->type_oid == type_oid); - memset(&kexp, 0, sizeof(kern_expression)); - kexp.exptype = dfunc->func_rettype->type_code; - kexp.expflags = context->kexp_flags; - kexp.opcode = dfunc->func_code; - kexp.nr_args = 2; - kexp.args_offset = SizeOfKernExpr(0); - pos = 
__appendBinaryStringInfo(&buf, &kexp, kexp.args_offset); - - /* input variable */ - memset(&kexp, 0, sizeof(kern_expression)); - kexp.exptype = dtype->type_code; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__VarExpr; - kexp.u.v.var_typlen = dtype->type_length; - kexp.u.v.var_typbyval = dtype->type_byval; - kexp.u.v.var_typalign = dtype->type_align; - kexp.u.v.var_slot_id = i_slot_id; - __pos = __appendBinaryStringInfo(&buf, &kexp, SizeOfKernExprVar); - __appendKernExpMagicAndLength(&buf, __pos); /* end of VarExpr */ - - /* final variable */ - memset(&kexp, 0, sizeof(kern_expression)); - kexp.exptype = dtype->type_code; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__VarExpr; - kexp.u.v.var_typlen = dtype->type_length; - kexp.u.v.var_typbyval = dtype->type_byval; - kexp.u.v.var_typalign = dtype->type_align; - kexp.u.v.var_slot_id = f_slot_id; - __pos = __appendBinaryStringInfo(&buf, &kexp, SizeOfKernExprVar); - __appendKernExpMagicAndLength(&buf, __pos); /* end of VarExpr */ - - __appendKernExpMagicAndLength(&buf, pos); /* end of FuncExpr */ - } - - if (list_length(pp_info->groupby_keys) > 1) - { - kern_expression *payload = (kern_expression *)buf.data; - int payload_sz = buf.len; - - initStringInfo(&buf); - memset(&kexp, 0, sizeof(kexp)); - kexp.exptype = TypeOpCode__bool; - kexp.expflags = context->kexp_flags; - kexp.opcode = FuncOpCode__BoolExpr_And; - kexp.nr_args = list_length(pp_info->groupby_keys); - kexp.args_offset = SizeOfKernExpr(0); - __appendBinaryStringInfo(&buf, &kexp, SizeOfKernExpr(0)); - __appendBinaryStringInfo(&buf, payload, payload_sz); - __appendKernExpMagicAndLength(&buf, 0); - pfree(payload); - } - sz = ((kern_expression *)buf.data)->len; - - xpucode = palloc(VARHDRSZ + sz); - memcpy(xpucode + VARHDRSZ, buf.data, sz); - SET_VARSIZE(xpucode, VARHDRSZ + sz); - pfree(buf.data); - - pp_info->kexp_groupby_keycomp = (bytea *)xpucode; -} - -/* - * __codegen_build_groupby_actions - */ -static void -__codegen_build_groupby_actions(codegen_context *context, - pgstromPlanInfo *pp_info) -{ - StringInfoData buf; - int nattrs = list_length(pp_info->groupby_actions); - int nexprs = 0; - int index = 0; - size_t head_sz = MAXALIGN(offsetof(kern_expression, u.pagg.desc[nattrs])); - char *xpucode; - ListCell *lc1, *lc2; - kern_expression *kexp; - - kexp = alloca(head_sz); - memset(kexp, 0, head_sz); - kexp->exptype = TypeOpCode__int4; - kexp->expflags = context->kexp_flags; - kexp->opcode = FuncOpCode__AggFuncs; - kexp->nr_args = 0; - kexp->args_offset = head_sz; - kexp->u.pagg.nattrs = nattrs; - - initStringInfo(&buf); - foreach (lc1, pp_info->groupby_actions) - { - /* MEMO: context->tlist_dev may be updated in the loop, so we cannot use - * forboth() macro here. - */ - TargetEntry *tle = list_nth(context->tlist_dev, index); - int action = lfirst_int(lc1); - int slot_id; - bool inject_new; - kern_aggregate_desc *desc; - - Assert(!tle->resjunk); - desc = &kexp->u.pagg.desc[index++]; - desc->action = action; - if (action == KAGG_ACTION__VREF) - { - slot_id = __try_inject_projection_expression(context, - &buf, - tle->expr, - false, - &inject_new); - if (slot_id < 0) - elog(ERROR, "Bug? 
grouping-key is not on the kvars-slot"); - if (inject_new) - nexprs++; - desc->arg0_slot_id = slot_id; - } - else - { - FuncExpr *func = (FuncExpr *)tle->expr; - int count = 0; - - Assert(IsA(func, FuncExpr) && list_length(func->args) <= 2); - foreach (lc2, func->args) - { - Expr *fn_arg = lfirst(lc2); - - slot_id = __try_inject_projection_expression(context, - &buf, - fn_arg, - false, - &inject_new); - if (slot_id < 0) - elog(ERROR, "Bug? partial-aggregate-function argument is missing"); - if (inject_new) - nexprs++; - if (count == 0) - desc->arg0_slot_id = slot_id; - else if (count == 1) - desc->arg1_slot_id = slot_id; - else - elog(ERROR, "Bug? too much partial function arguments"); - count++; - } - } - } - Assert(index == nattrs); - - if (nexprs == 0) - { - Assert(buf.len == 0); - __appendBinaryStringInfo(&buf, kexp, head_sz); - __appendKernExpMagicAndLength(&buf, 0); - } - else - { - char *payload = buf.data; - size_t payload_sz = buf.len; - - kexp->nr_args = nexprs; - initStringInfo(&buf); - __appendBinaryStringInfo(&buf, kexp, head_sz); - __appendBinaryStringInfo(&buf, payload, payload_sz); - __appendKernExpMagicAndLength(&buf, 0); - pfree(payload); - } - xpucode = palloc(VARHDRSZ + buf.len); - memcpy(xpucode + VARHDRSZ, buf.data, buf.len); - SET_VARSIZE(xpucode, VARHDRSZ + buf.len); - pfree(buf.data); - - pp_info->kexp_groupby_actions = (bytea *)xpucode; -} - -/* - * codegen_build_groupby_actions - */ -void -codegen_build_groupby_actions(codegen_context *context, - pgstromPlanInfo *pp_info) -{ - List *groupby_keys_input_slot = NIL; - List *groupby_keys_final_slot = NIL; - - if (pp_info->groupby_keys != NIL) - { - groupby_keys_input_slot = codegen_build_groupby_keyhash(context, pp_info); - groupby_keys_final_slot = codegen_build_groupby_keyload(context, pp_info); - codegen_build_groupby_keycomp(context, pp_info, - groupby_keys_input_slot, - groupby_keys_final_slot); - } - __codegen_build_groupby_actions(context, pp_info); -} - -/* - * pgstrom_xpu_expression - */ -bool -pgstrom_xpu_expression(Expr *expr, - uint32_t task_kind, - List *input_rels_tlist, - int *p_devcost) -{ - codegen_context context; - - Assert((task_kind & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU || - (task_kind & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU); - memset(&context, 0, sizeof(context)); - context.elevel = DEBUG2; - context.top_expr = expr; - context.required_flags = (task_kind & DEVKIND__ANY); - context.input_rels_tlist = input_rels_tlist; - - if (!expr) - return false; - if (IsA(expr, List)) - { - List *l = (List *)expr; - - if (list_length(l) == 1) - expr = linitial(l); - else - expr = make_andclause(l); - } - if (codegen_expression_walker(&context, NULL, expr) < 0) - return false; - if (p_devcost) - *p_devcost = context.device_cost; - return true; -} - -/* - * pgstrom_gpu_expression - * - * checks whether the expression is executable on GPU devices. - */ -bool -pgstrom_gpu_expression(Expr *expr, - List *input_rels_tlist, - int *p_devcost) -{ - return pgstrom_xpu_expression(expr, - DEVKIND__NVIDIA_GPU, - input_rels_tlist, - p_devcost); -} - -/* - * pgstrom_dpu_expression - * - * checks whether the expression is executable on DPU devices. - */ -bool -pgstrom_dpu_expression(Expr *expr, - List *input_rels_tlist, - int *p_devcost) -{ - return pgstrom_xpu_expression(expr, - DEVKIND__NVIDIA_DPU, - input_rels_tlist, - p_devcost); -} - -/* - * pgstrom_xpucode_to_string - * - * transforms xPU code to human readable form. 
- */ -static void -__xpucode_to_cstring(StringInfo buf, - const kern_expression *kexp, - const CustomScanState *css, /* optional */ - ExplainState *es, /* optional */ - List *dcontext); /* optional */ - -static void -__xpucode_const_cstring(StringInfo buf, const kern_expression *kexp) -{ - devtype_info *dtype = devtype_lookup_by_opcode(kexp->exptype); - - if (kexp->u.c.const_isnull) - { - appendStringInfo(buf, "{Const(%s): value=NULL}", dtype->type_name); - } - else - { - int16 type_len; - bool type_byval; - char type_align; - char type_delim; - Oid type_ioparam; - Oid type_outfunc; - Datum datum = 0; - Datum label; - - get_type_io_data(kexp->u.c.const_type, - IOFunc_output, - &type_len, - &type_byval, - &type_align, - &type_delim, - &type_ioparam, - &type_outfunc); - if (type_byval) - memcpy(&datum, kexp->u.c.const_value, type_len); - else - datum = PointerGetDatum(kexp->u.c.const_value); - label = OidFunctionCall1(type_outfunc, datum); - appendStringInfo(buf, "{Const(%s): value='%s'}", - dtype->type_name, - DatumGetCString(label)); - } -} - -static void -__xpucode_param_cstring(StringInfo buf, const kern_expression *kexp) -{ - devtype_info *dtype = devtype_lookup_by_opcode(kexp->exptype); - - appendStringInfo(buf, "{Param(%s): param_id=%u}", - dtype->type_name, - kexp->u.p.param_id); -} - -static void -__xpucode_var_cstring(StringInfo buf, const kern_expression *kexp) -{ - devtype_info *dtype = devtype_lookup_by_opcode(kexp->exptype); - - appendStringInfo(buf, "{Var(%s): slot_id=%d}", - dtype->type_name, - kexp->u.v.var_slot_id); -} - -static void -__xpucode_loadvars_cstring(StringInfo buf, - const kern_expression *kexp, - const CustomScanState *css, - ExplainState *es, - List *dcontext) -{ - bool verbose = false; - int depth = kexp->u.load.depth; - int i; - - Assert(kexp->nr_args == 0); - appendStringInfo(buf, "{LoadVars: depth=%d", depth); - if (kexp->u.load.nloads > 0) - appendStringInfo(buf, " kvars=["); - - if (css) - { - CustomScan *cscan = (CustomScan *)css->ss.ps.plan; - verbose = (cscan->custom_plans != NIL); - } - - for (i=0; i < kexp->u.load.nloads; i++) - { - const kern_vars_defitem *vitem = &kexp->u.load.kvars[i]; - - if (i > 0) - appendStringInfo(buf, ", "); - if (!css) - { - appendStringInfo(buf, "(slot_id=%u, resno=%d)", - vitem->var_slot_id, - vitem->var_resno); - } - else if (depth == 0) - { - TupleDesc tupdesc = RelationGetDescr(css->ss.ss_currentRelation); - Form_pg_attribute attr = TupleDescAttr(tupdesc, vitem->var_resno - 1); - CustomScan *cscan = (CustomScan *)css->ss.ps.plan; - Var *kvar; - - kvar = makeVar(cscan->scan.scanrelid, - attr->attnum, - attr->atttypid, - attr->atttypmod, - attr->attcollation, 0); - appendStringInfo(buf, "%u:%s", - vitem->var_slot_id, - deparse_expression((Node *)kvar, - dcontext, - verbose, false)); - pfree(kvar); - } - else if (depth < 0) - { - CustomScan *cscan = (CustomScan *)css->ss.ps.plan; - - if (vitem->var_resno >= 1 && - vitem->var_resno <= list_length(cscan->custom_scan_tlist)) - { - TargetEntry *tle = list_nth(cscan->custom_scan_tlist, - vitem->var_resno-1); - appendStringInfo(buf, "%u:%s", - vitem->var_slot_id, - deparse_expression((Node *)tle->expr, - dcontext, - verbose, false)); - } - else - { - appendStringInfo(buf, "var(slot_id=%u)", vitem->var_slot_id); - } - } - else - { - CustomScan *cscan = (CustomScan *)css->ss.ps.plan; - Plan *plan; - TargetEntry *tle; - - plan = list_nth(cscan->custom_plans, depth - 1); - tle = list_nth(plan->targetlist, vitem->var_resno - 1); - appendStringInfo(buf, "%u:%s", - 
vitem->var_slot_id, - deparse_expression((Node *)tle->expr, - dcontext, - verbose, false)); - } - } - if (kexp->u.load.nloads > 0) - appendStringInfo(buf, "]"); -} - -#if 0 -static void -__xpucode_projection_cstring(StringInfo buf, - const kern_expression *kexp, - const CustomScanState *css, /* optional */ - ExplainState *es, /* optional */ - List *dcontext) -{ - int i, nexprs = kexp->u.proj.nexprs; - - - if (kexp->nr_args > 0) - { - const kern_expression *karg; - - if (kexp->nr_args == 1) - appendStringInfo(buf, " arg="); - else - appendStringInfo(buf, " args=["); - for (i=0, karg = KEXP_FIRST_ARG(kexp); - i < kexp->nr_args; - i++, karg = KEXP_NEXT_ARG(karg)) - { - const kern_projection_desc *desc = &kexp->u.proj.desc[i]; - - if (!__KEXP_IS_VALID(kexp, karg)) - elog(ERROR, "XpuCode looks corrupted"); - if (i > 0) - appendStringInfo(buf, ", "); - appendStringInfo(buf, "%d:", desc->slot_id); - __xpucode_to_cstring(buf, karg, css, es, dcontext); - } - if (kexp->nr_args > 1) - appendStringInfoChar(buf, ']'); - } - appendStringInfoChar(buf, '}'); -} -#endif - -static void -__xpucode_aggfuncs_cstring(StringInfo buf, - const kern_expression *kexp, - const CustomScanState *css, /* optional */ - ExplainState *es, /* optional */ - List *dcontext) -{ - appendStringInfo(buf, "{AggFuncs <"); - for (int j=0; j < kexp->u.pagg.nattrs; j++) - { - const kern_aggregate_desc *desc = &kexp->u.pagg.desc[j]; - - if (j > 0) - appendStringInfo(buf, ", "); - switch (desc->action) - { - case KAGG_ACTION__VREF: - appendStringInfo(buf, "vref[%d]", desc->arg0_slot_id); - break; - case KAGG_ACTION__NROWS_ANY: - appendStringInfo(buf, "nrows[*]"); - break; - case KAGG_ACTION__NROWS_COND: - appendStringInfo(buf, "nrows[%d]", - desc->arg0_slot_id); - break; - case KAGG_ACTION__PMIN_INT: - case KAGG_ACTION__PMIN_FP: - appendStringInfo(buf, "pmin[%d]", - desc->arg0_slot_id); - break; - case KAGG_ACTION__PMAX_INT: - case KAGG_ACTION__PMAX_FP: - appendStringInfo(buf, "pmax[%d]", - desc->arg0_slot_id); - break; - case KAGG_ACTION__PSUM_INT: - case KAGG_ACTION__PSUM_FP: - appendStringInfo(buf, "psum[%d]", - desc->arg0_slot_id); - break; - case KAGG_ACTION__PAVG_INT: - case KAGG_ACTION__PAVG_FP: - appendStringInfo(buf, "pavg[%d]", - desc->arg0_slot_id); - break; - case KAGG_ACTION__STDDEV: - appendStringInfo(buf, "stddev[%d]", - desc->arg0_slot_id); - break; - case KAGG_ACTION__COVAR: - appendStringInfo(buf, "covar[%d,%d]", - desc->arg0_slot_id, - desc->arg1_slot_id); - break; - default: - appendStringInfo(buf, "unknown[%d,%d]", - desc->arg0_slot_id, - desc->arg1_slot_id); - break; - } - } - appendStringInfo(buf, ">"); -} - -static void -__xpucode_to_cstring(StringInfo buf, - const kern_expression *kexp, - const CustomScanState *css, /* optional */ - ExplainState *es, /* optional */ - List *dcontext) /* optional */ -{ - const kern_expression *karg; - int i, pos; - - switch (kexp->opcode) - { - case FuncOpCode__ConstExpr: - __xpucode_const_cstring(buf, kexp); - return; - case FuncOpCode__ParamExpr: - __xpucode_param_cstring(buf, kexp); - return; - case FuncOpCode__VarExpr: - __xpucode_var_cstring(buf, kexp); - return; - case FuncOpCode__Projection: - appendStringInfo(buf, "{Projection <"); - for (int j=0; j < kexp->u.proj.nattrs; j++) - { - const kern_projection_desc *desc = &kexp->u.proj.desc[j]; - if (j > 0) - appendStringInfoChar(buf, ','); - appendStringInfo(buf, "%d", desc->slot_id); - } - appendStringInfo(buf, ">"); - break; - case FuncOpCode__LoadVars: - __xpucode_loadvars_cstring(buf, kexp, css, es, dcontext); - break; - 
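/* e.g. "{LoadVars: depth=0 kvars=[0:t.x, 1:t.y]}" -- each entry shows the - * kvars slot-id and the deparsed source expression (column names here are - * illustrative only) */ - 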
case FuncOpCode__HashValue: - appendStringInfo(buf, "{HashValue"); - break; - case FuncOpCode__JoinQuals: - appendStringInfo(buf, "{JoinQuals: "); - for (i=0, karg=KEXP_FIRST_ARG(kexp); - i < kexp->nr_args; - i++, karg=KEXP_NEXT_ARG(karg)) - { - if (!__KEXP_IS_VALID(kexp,karg)) - elog(ERROR, "XpuCode looks corrupted"); - appendStringInfo(buf, "%s ", i > 0 ? "," : ""); - if ((karg->expflags & KEXP_FLAG__IS_PUSHED_DOWN) != 0) - appendStringInfoChar(buf, '<'); - __xpucode_to_cstring(buf, karg, css, es, dcontext); - if ((karg->expflags & KEXP_FLAG__IS_PUSHED_DOWN) != 0) - appendStringInfoChar(buf, '>'); - } - appendStringInfo(buf, "}"); - return; - case FuncOpCode__SaveExpr: - appendStringInfo(buf, "{SaveExpr slot=%d:", - kexp->u.save.slot_id); - break; - case FuncOpCode__AggFuncs: - __xpucode_aggfuncs_cstring(buf, kexp, css, es, dcontext); - break; - case FuncOpCode__Packed: - appendStringInfo(buf, "{Packed"); - pos = buf->len; - for (i=0; i < kexp->u.pack.npacked; i++) - { - karg = __PICKUP_PACKED_KEXP(kexp, i); - if (!karg) - continue; - if (!__KEXP_IS_VALID(kexp,karg)) - elog(ERROR, "XpuCode looks corrupted"); - if (buf->len > pos) - appendStringInfoChar(buf,','); - appendStringInfo(buf, " items[%u]=", i); - __xpucode_to_cstring(buf, karg, css, es, dcontext); - } - appendStringInfo(buf, "}"); - return; - case FuncOpCode__BoolExpr_And: - appendStringInfo(buf, "{Bool::AND"); - break; - case FuncOpCode__BoolExpr_Or: - appendStringInfo(buf, "{Bool::OR"); - break; - case FuncOpCode__BoolExpr_Not: - appendStringInfo(buf, "{Bool::NOT"); - break; - case FuncOpCode__NullTestExpr_IsNull: - appendStringInfo(buf, "{IsNull"); - break; - case FuncOpCode__NullTestExpr_IsNotNull: - appendStringInfo(buf, "{IsNotNull"); - break; - case FuncOpCode__BoolTestExpr_IsTrue: - appendStringInfo(buf, "{BoolTest::IsTrue"); - break; - case FuncOpCode__BoolTestExpr_IsNotTrue: - appendStringInfo(buf, "{BoolTest::IsNotTrue"); - break; - case FuncOpCode__BoolTestExpr_IsFalse: - appendStringInfo(buf, "{BoolTest::IsFalse"); - break; - case FuncOpCode__BoolTestExpr_IsNotFalse: - appendStringInfo(buf, "{BoolTest::IsNotFalse"); - break; - case FuncOpCode__BoolTestExpr_IsUnknown: - appendStringInfo(buf, "{BoolTest::IsUnknown"); - break; - case FuncOpCode__BoolTestExpr_IsNotUnknown: - appendStringInfo(buf, "{BoolTest::IsNotUnknown"); - break; - default: - { - devtype_info *dtype = devtype_lookup_by_opcode(kexp->exptype); - devfunc_info *dfunc = devfunc_lookup_by_opcode(kexp->opcode); - - appendStringInfo(buf, "{Func::%s(%s)", - dfunc->func_name, - dtype->type_name); - } - break; - } - if (kexp->nr_args > 0) - { - if (kexp->nr_args == 1) - appendStringInfo(buf, " arg="); - else - appendStringInfo(buf, " args=["); - - for (i=0, karg=KEXP_FIRST_ARG(kexp); - i < kexp->nr_args; - i++, karg=KEXP_NEXT_ARG(karg)) - { - if (!__KEXP_IS_VALID(kexp,karg)) - elog(ERROR, "XpuCode looks corrupted"); - if (i > 0) - appendStringInfo(buf, ", "); - __xpucode_to_cstring(buf, karg, css, es, dcontext); - } - if (kexp->nr_args > 1) - appendStringInfoChar(buf, ']'); - } - appendStringInfoChar(buf, '}'); -} - -void -pgstrom_explain_xpucode(const CustomScanState *css, - ExplainState *es, - List *dcontext, - const char *label, - bytea *xpucode) -{ - StringInfoData buf; - - if (xpucode) - { - const kern_expression *kexp = (const kern_expression *)VARDATA(xpucode); - - initStringInfo(&buf); - __xpucode_to_cstring(&buf, kexp, css, es, dcontext); - ExplainPropertyText(label, buf.data, es); - pfree(buf.data); - } -} - -char * -pgstrom_xpucode_to_string(bytea 
*xpu_code) -{ - StringInfoData buf; - - initStringInfo(&buf); - __xpucode_to_cstring(&buf, (const kern_expression *)VARDATA(xpu_code), - NULL, NULL, NIL); - - return buf.data; -} - -static void -pgstrom_devcache_invalidator(Datum arg, int cacheid, uint32 hashvalue) -{ - MemoryContextReset(devinfo_memcxt); - memset(devtype_info_slot, 0, sizeof(List *) * DEVTYPE_INFO_NSLOTS); - memset(devtype_code_slot, 0, sizeof(List *) * DEVTYPE_INFO_NSLOTS); - memset(devfunc_info_slot, 0, sizeof(List *) * DEVFUNC_INFO_NSLOTS); - memset(devfunc_code_slot, 0, sizeof(List *) * DEVFUNC_INFO_NSLOTS); -} - -void -pgstrom_init_codegen(void) -{ - devinfo_memcxt = AllocSetContextCreate(CacheMemoryContext, - "device type/func info cache", - ALLOCSET_DEFAULT_SIZES); - pgstrom_devcache_invalidator(0, 0, 0); - CacheRegisterSyscacheCallback(TYPEOID, pgstrom_devcache_invalidator, 0); - CacheRegisterSyscacheCallback(PROCOID, pgstrom_devcache_invalidator, 0); -} diff --git a/next/cuda_common.h b/next/cuda_common.h deleted file mode 100644 index aa7e25881..000000000 --- a/next/cuda_common.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - * cuda_common.h - * - * Common header for CUDA device code, in addition to xPU common definitions. - * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. - */ -#ifndef CUDA_COMMON_H -#define CUDA_COMMON_H -#include "xpu_common.h" - -#define WARPSIZE 32 -#define MAXTHREADS_PER_BLOCK 1024 -#define MAXWARPS_PER_BLOCK (MAXTHREADS_PER_BLOCK / WARPSIZE) - -#if defined(__CUDACC__) -/* - * Thread index at CUDA C - */ -#define get_group_id() (blockIdx.x) -#define get_num_groups() (gridDim.x) -#define get_local_id() (threadIdx.x) -#define get_local_size() (blockDim.x) -#define get_global_id() (threadIdx.x + blockIdx.x * blockDim.x) -#define get_global_size() (blockDim.x * gridDim.x) - -/* Dynamic shared memory entrypoint */ -extern __shared__ char __pgstrom_dynamic_shared_workmem[] __MAXALIGNED__; -#define SHARED_WORKMEM(UNITSZ,INDEX) \ - (__pgstrom_dynamic_shared_workmem + (UNITSZ)*(INDEX)) - -INLINE_FUNCTION(uint32_t) LaneId(void) -{ - uint32_t rv; - - asm volatile("mov.u32 %0, %%laneid;" : "=r"(rv) ); - - return rv; -} - -INLINE_FUNCTION(uint32_t) DynamicShmemSize(void) -{ - uint32_t rv; - - asm volatile("mov.u32 %0, %%dynamic_smem_size;" : "=r"(rv) ); - - return rv; -} - -INLINE_FUNCTION(uint32_t) TotalShmemSize(void) -{ - uint32_t rv; - - asm volatile("mov.u32 %0, %%total_smem_size;" : "=r"(rv) ); - - return rv; -} - -template <typename T> -INLINE_FUNCTION(T) -__reduce_stair_add_sync(T value, T *p_total_sum = NULL) -{ - uint32_t lane_id = LaneId(); - uint32_t mask; - T temp; - - assert(__activemask() == 0xffffffffU); - for (mask = 1; mask < warpSize; mask <<= 1) - { - temp = __shfl_sync(__activemask(), value, (lane_id & ~mask) | (mask - 1)); - if (lane_id & mask) - value += temp; - } - temp = __shfl_sync(__activemask(), value, warpSize - 1); - if (p_total_sum) - *p_total_sum = temp; - return value; -} - -INLINE_FUNCTION(void) -STROM_WRITEBACK_ERROR_STATUS(kern_errorbuf *ebuf, kern_context *kcxt) -{ - if (kcxt->errcode != ERRCODE_STROM_SUCCESS && - atomicCAS(&ebuf->errcode, - ERRCODE_STROM_SUCCESS, - kcxt->errcode) == ERRCODE_STROM_SUCCESS) - { - ebuf->errcode = kcxt->errcode; - ebuf->lineno = kcxt->error_lineno; - __strncpy(ebuf->filename, - kcxt->error_filename, - KERN_ERRORBUF_FILENAME_LEN); - __strncpy(ebuf->funcname, - kcxt->error_funcname, - 
KERN_ERRORBUF_FUNCNAME_LEN); - __strncpy(ebuf->message, - kcxt->error_message, - KERN_ERRORBUF_MESSAGE_LEN); - } -} -#endif /* __CUDACC__ */ - -/* ---------------------------------------------------------------- - * - * Definitions related to per-warp context - * - * ---------------------------------------------------------------- - */ -#define UNIT_TUPLES_PER_DEPTH (2 * WARPSIZE) -typedef struct -{ - uint32_t smx_row_count; /* just for suspend/resume */ - uint32_t __nrels__deprecated; /* number of inner relations, if JOIN */ - int depth; /* 'depth' when suspended */ - int scan_done; /* smallest depth that may produce more tuples */ - /* only KDS_FORMAT_BLOCK */ - uint32_t block_id; /* BLOCK format needs to keep htuples on the */ - uint32_t lp_count; /* lp_items array once, to pull maximum GPU */ - uint32_t lp_wr_pos; /* utilization by simultaneous execution of */ - uint32_t lp_rd_pos; /* the kern_scan_quals. */ - uint32_t lp_items[UNIT_TUPLES_PER_DEPTH]; - /* read/write_pos of the combination buffer for each depth */ - struct { - uint32_t read; /* read_pos of depth=X */ - uint32_t write; /* write_pos of depth=X */ - } pos[1]; /* variable length */ - /* - * <----- __KERN_WARP_CONTEXT_BASESZ -----> - * Above fields are always kept in the device shared memory. - * - * +-------------------------------------------------------------+------ - * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-0) | - * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-1) | - * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-2) | depth=0 - * | : : : | - * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-63)| - * +-------------------------------------------------------------+------ - * : : : - * +-------------------------------------------------------------+------ - * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-0) | - * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-1) | - * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-2) | depth=nrels - * | : : : | - * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-63)| - * +-------------------------------------------------------------+------ - */ -} kern_warp_context; - -#define __KERN_WARP_CONTEXT_BASESZ(n_rels) \ - MAXALIGN(offsetof(kern_warp_context, pos[(n_rels)+1])) -#define KERN_WARP_CONTEXT_UNITSZ(n_rels,nbytes) \ - (__KERN_WARP_CONTEXT_BASESZ(n_rels) + \ - (nbytes) * UNIT_TUPLES_PER_DEPTH * ((n_rels)+1)) -#define WARP_READ_POS(warp,depth) ((warp)->pos[(depth)].read) -#define WARP_WRITE_POS(warp,depth) ((warp)->pos[(depth)].write) - -/* - * definitions related to generic device executor routines - */ -EXTERN_FUNCTION(int) -execGpuScanLoadSource(kern_context *kcxt, - kern_warp_context *wp, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_expression *kexp_load_vars, - kern_expression *kexp_scan_quals, - char *kvars_addr_wp, - uint32_t *p_smx_row_count); -EXTERN_FUNCTION(int) -execGpuJoinProjection(kern_context *kcxt, - kern_warp_context *wp, - int n_rels, - kern_data_store *kds_dst, - kern_expression *kexp_projection, - char *kvars_addr_wp, - bool *p_try_suspend); -EXTERN_FUNCTION(int) -execGpuPreAggGroupBy(kern_context *kcxt, - kern_warp_context *wp, - int n_rels, - kern_data_store *kds_final, - char *kvars_addr_wp, - bool *p_try_suspend); -/* - * Definitions related to GpuScan/GpuJoin/GpuPreAgg - */ -typedef struct { - kern_errorbuf kerror; - uint32_t grid_sz; - uint32_t block_sz; - uint32_t extra_sz; - uint32_t kvars_nslots; /* width of the kvars slot */ - uint32_t 
kvars_nbytes; /* extra buffer size of kvars-slot */ - uint32_t n_rels; /* >0, if JOIN is involved */ - /* suspend/resume support */ - bool resume_context; - uint32_t suspend_count; - /* kernel statistics */ - uint32_t nitems_raw; /* nitems in the raw data chunk */ - uint32_t nitems_in; /* nitems after the scan_quals */ - uint32_t nitems_out; /* nitems of final results */ - struct { - uint32_t nitems_gist; /* nitems picked up by GiST index */ - uint32_t nitems_out; /* nitems after this depth */ - } stats[1]; - /* - * variable length fields - * +-----------------------------------+ - * | kern_warp_context[0] for warp-0 | - * | kern_warp_context[1] for warp-1 | - * | : : : | - * | kern_warp_context[nwarps-1] | - * +-----------------------------------+ ----- - * | l_state[num_rels] for each thread | only if JOIN is involved - * +-----------------------------------+ (n_rels > 0) - * | matched[num_rels] for each thread | - * +-----------------------------------+ ----- - */ -} kern_gputask; - -#define __KERN_GPUTASK_WARP_OFFSET(n_rels,nbytes,gid) \ - (MAXALIGN(offsetof(kern_gputask,stats[(n_rels)])) + \ - KERN_WARP_CONTEXT_UNITSZ(n_rels,nbytes) * ((gid)/WARPSIZE)) - -#define KERN_GPUTASK_WARP_CONTEXT(kgtask) \ - ((kern_warp_context *) \ - ((char *)(kgtask) + \ - __KERN_GPUTASK_WARP_OFFSET((kgtask)->n_rels, \ - (kgtask)->kvars_nbytes, \ - get_global_id()))) -#define KERN_GPUTASK_LSTATE_ARRAY(kgtask) \ - ((kgtask)->n_rels == 0 ? NULL : (uint32_t *) \ - ((char *)(kgtask) + \ - __KERN_GPUTASK_WARP_OFFSET((kgtask)->n_rels, \ - (kgtask)->kvars_nbytes, \ - get_global_size()) + \ - sizeof(uint32_t) * (kgtask)->n_rels * get_global_id())) -#define KERN_GPUTASK_MATCHED_ARRAY(kgtask) \ - ((kgtask)->n_rels == 0 ? NULL : (bool *) \ - ((char *)(kgtask) + \ - __KERN_GPUTASK_WARP_OFFSET((kgtask)->n_rels, \ - (kgtask)->kvars_nbytes, \ - get_global_size()) + \ - sizeof(uint32_t) * (kgtask)->n_rels * get_global_size() + \ - sizeof(bool) * (kgtask)->n_rels * get_global_id())) - -#define KERN_GPUTASK_LENGTH(n_rels,nbytes,n_threads) \ - (__KERN_GPUTASK_WARP_OFFSET((n_rels),(nbytes),(n_threads)) + \ - sizeof(uint32_t) * (n_rels) * (n_threads) + \ - sizeof(bool) * (n_rels) * (n_threads)) - -/* - * GPU Kernel Entrypoint - */ -KERNEL_FUNCTION(void) -kern_gpuscan_main(kern_session_info *session, - kern_gputask *kgtask, - kern_multirels *__kmrels, /* always null */ - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_data_store *kds_dst); -KERNEL_FUNCTION(void) -kern_gpujoin_main(kern_session_info *session, - kern_gputask *kgtask, - kern_multirels *kmrels, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_data_store *kds_dst); - -#endif /* CUDA_COMMON_H */ diff --git a/next/cuda_gpujoin.cu b/next/cuda_gpujoin.cu deleted file mode 100644 index 4e1a5f08f..000000000 --- a/next/cuda_gpujoin.cu +++ /dev/null @@ -1,602 +0,0 @@ -/* - * cuda_gpujoin.cu - * - * GPU accelerated parallel relations join based on hash-join or - * nested-loop logic. - * -- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. 
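 - * - * Execution is a per-warp state machine over 'depth': depth=0 loads - * source tuples, depth=1..num_rels probes one inner relation each - * (nested-loop or hash-join), and depth=num_rels+1 runs the final - * projection or partial aggregation. Each helper returns the next - * depth to execute, or a negative value on error or completion.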
- */ -#include "cuda_common.h" - -/* - * GPU Nested-Loop - */ -STATIC_FUNCTION(int) -execGpuJoinNestLoop(kern_context *kcxt, - kern_warp_context *wp, - kern_multirels *kmrels, - int depth, - char *src_kvars_addr_wp, - char *dst_kvars_addr_wp, - uint32_t &l_state, - bool &matched) -{ - kern_data_store *kds_heap = KERN_MULTIRELS_INNER_KDS(kmrels, depth-1); - bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth-1); - kern_expression *kexp; - uint32_t read_pos; - uint32_t write_pos; - uint32_t mask; - bool tuple_is_valid = false; - - if (WARP_WRITE_POS(wp,depth) >= WARP_READ_POS(wp,depth) + warpSize) - { - /* - * The destination depth already keeps warpSize or more pending - * tuples. So, flush out these tuples first. - */ - return depth+1; - } - - if (__all_sync(__activemask(), l_state >= kds_heap->nitems)) - { - /* - * OK, all the threads in this warp reached the end of the inner - * relation. Due to the above checks, the next depth has enough space - * to store the results of this depth. - */ - if (LaneId() == 0) - WARP_READ_POS(wp,depth-1) = Min(WARP_READ_POS(wp,depth-1) + warpSize, - WARP_WRITE_POS(wp,depth-1)); - __syncwarp(); - l_state = 0; - matched = false; - if (wp->scan_done >= depth) - { - assert(wp->scan_done == depth); - if (WARP_READ_POS(wp,depth-1) >= WARP_WRITE_POS(wp,depth-1)) - { - if (LaneId() == 0) - wp->scan_done = Max(wp->scan_done, depth+1); - return depth+1; - } - /* - * Otherwise, the remaining tuples in the combination buffer - * shall be flushed out first; then we update 'scan_done' - * to mark that this depth will never generate results any more. - */ - } - else - { - /* back to the previous depth to generate the source tuples. */ - if (WARP_READ_POS(wp,depth-1) + warpSize > WARP_WRITE_POS(wp,depth-1)) - return depth-1; - } - } - read_pos = WARP_READ_POS(wp,depth-1) + LaneId(); - if (read_pos < WARP_WRITE_POS(wp,depth-1)) - { - uint32_t index = l_state++; - - read_pos = (read_pos % UNIT_TUPLES_PER_DEPTH); - kcxt->kvars_slot = (kern_variable *) - (src_kvars_addr_wp + read_pos * kcxt->kvars_nbytes); - kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); - if (index < kds_heap->nitems) - { - kern_tupitem *tupitem; - uint32_t offset = KDS_GET_ROWINDEX(kds_heap)[index]; - xpu_int4_t status; - - tupitem = (kern_tupitem *)((char *)kds_heap + - kds_heap->length - - __kds_unpack(offset)); - kexp = SESSION_KEXP_JOIN_LOAD_VARS(kcxt->session, depth-1); - ExecLoadVarsHeapTuple(kcxt, kexp, depth, kds_heap, &tupitem->htup); - kexp = SESSION_KEXP_JOIN_QUALS(kcxt->session, depth-1); - if (EXEC_KERN_EXPRESSION(kcxt, kexp, &status)) - { - assert(!XPU_DATUM_ISNULL(&status)); - if (status.value > 0) - tuple_is_valid = true; - if (status.value != 0) - matched = true; - } - if (oj_map && matched) - { - assert(tupitem->rowid < kds_heap->nitems); - oj_map[tupitem->rowid] = true; - } - } - else if (kmrels->chunks[depth-1].left_outer && - index >= kds_heap->nitems && !matched) - { - /* fill up NULL fields, if FULL/LEFT OUTER JOIN */ - kexp = SESSION_KEXP_JOIN_LOAD_VARS(kcxt->session, depth-1); - ExecLoadVarsHeapTuple(kcxt, kexp, depth, kds_heap, NULL); - tuple_is_valid = true; - } - } - /* error checks */ - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return -1; - /* save the result */ - mask = __ballot_sync(__activemask(), tuple_is_valid); - if (LaneId() == 0) - { - write_pos = WARP_WRITE_POS(wp,depth); - WARP_WRITE_POS(wp,depth) += __popc(mask); - } - write_pos = __shfl_sync(__activemask(), write_pos, 0); - mask &= ((1U << LaneId()) - 1); - write_pos 
+= __popc(mask); - - if (tuple_is_valid) - { - write_pos = (write_pos % UNIT_TUPLES_PER_DEPTH); - memcpy(dst_kvars_addr_wp + write_pos * kcxt->kvars_nbytes, - kcxt->kvars_slot, - kcxt->kvars_nbytes); - } - __syncwarp(); - if (WARP_WRITE_POS(wp,depth) >= WARP_READ_POS(wp,depth) + warpSize) - return depth+1; - return depth; -} - -/* - * GPU Hash-Join - */ -STATIC_FUNCTION(int) -execGpuJoinHashJoin(kern_context *kcxt, - kern_warp_context *wp, - kern_multirels *kmrels, - int depth, - char *src_kvars_addr_wp, - char *dst_kvars_addr_wp, - uint32_t &l_state, - bool &matched) -{ - kern_data_store *kds_hash = KERN_MULTIRELS_INNER_KDS(kmrels, depth-1); - bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth-1); - kern_expression *kexp = NULL; - kern_hashitem *khitem = NULL; - uint32_t read_pos; - uint32_t write_pos; - uint32_t index; - uint32_t mask; - bool tuple_is_valid = false; - - if (WARP_WRITE_POS(wp,depth) >= WARP_READ_POS(wp,depth) + warpSize) - { - /* - * The next depth already keeps warpSize or more pending tuples, - * so flush out these tuples first. - */ - return depth+1; - } - - if (__all_sync(__activemask(), l_state == UINT_MAX)) - { - /* - * OK, all the threads in this warp reached the end of the hash-slot - * chain. Due to the above checks, the next depth has enough space - * to store the results of this depth. - * So, we either process this depth again (if we have enough pending tuples), - * go back to the previous depth (if we don't have enough pending tuples - * in this depth), or move to the next depth if the previous depth already - * reached the end of the chunk. - */ - if (LaneId() == 0) - WARP_READ_POS(wp,depth-1) = Min(WARP_READ_POS(wp,depth-1) + warpSize, - WARP_WRITE_POS(wp,depth-1)); - __syncwarp(); - l_state = 0; - matched = false; - if (wp->scan_done < depth) - { - /* - * The previous depth may still generate source tuples. - */ - if (WARP_WRITE_POS(wp,depth-1) < WARP_READ_POS(wp,depth-1) + warpSize) - return depth-1; - } - else - { - assert(wp->scan_done == depth); - if (WARP_READ_POS(wp,depth-1) >= WARP_WRITE_POS(wp,depth-1)) - { - if (LaneId() == 0) - wp->scan_done = depth+1; - return depth+1; - } - /* - * Otherwise, the remaining tuples in the combination buffer - * shall be flushed out first; then we update 'scan_done' - * to mark that this depth will never generate results any more. 
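 - * (Read/write positions only ever advance; they are mapped onto the - * 64-entry ring buffer as (pos % UNIT_TUPLES_PER_DEPTH).)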
- */ - } - } - write_pos = WARP_WRITE_POS(wp,depth-1); - read_pos = WARP_READ_POS(wp,depth-1) + LaneId(); - index = (read_pos % UNIT_TUPLES_PER_DEPTH); - kcxt->kvars_slot = (kern_variable *) - (src_kvars_addr_wp + index * kcxt->kvars_nbytes); - kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); - - if (l_state == 0) - { - /* pick up the first item from the hash-slot */ - if (read_pos < write_pos) - { - xpu_int4_t hash; - uint32_t *hslot; - - kexp = SESSION_KEXP_HASH_VALUE(kcxt->session, depth-1); - if (EXEC_KERN_EXPRESSION(kcxt, kexp, &hash)) - { - assert(!XPU_DATUM_ISNULL(&hash)); - hslot = KDS_GET_HASHSLOT(kds_hash, hash.value); - for (khitem = KDS_HASH_FIRST_ITEM(kds_hash, hslot, NULL); - khitem != NULL && khitem->hash != hash.value; - khitem = KDS_HASH_NEXT_ITEM(kds_hash, khitem)); - } - } - else - { - l_state = UINT_MAX; - } - } - else if (l_state != UINT_MAX) - { - /* pick up the next one if any */ - uint32_t hash_value; - - khitem = (kern_hashitem *)((char *)kds_hash + __kds_unpack(l_state)); - hash_value = khitem->hash; - for (khitem = KDS_HASH_NEXT_ITEM(kds_hash, khitem); - khitem != NULL && khitem->hash != hash_value; - khitem = KDS_HASH_NEXT_ITEM(kds_hash, khitem)); - } - /* error checks */ - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return -1; - - if (khitem) - { - xpu_int4_t status; - - kexp = SESSION_KEXP_JOIN_LOAD_VARS(kcxt->session, depth-1); - ExecLoadVarsHeapTuple(kcxt, kexp, depth, kds_hash, &khitem->t.htup); - kexp = SESSION_KEXP_JOIN_QUALS(kcxt->session, depth-1); - if (EXEC_KERN_EXPRESSION(kcxt, kexp, &status)) - { - assert(!XPU_DATUM_ISNULL(&status)); - if (status.value > 0) - tuple_is_valid = true; - if (status.value != 0) - matched = true; - } - if (oj_map && matched) - { - assert(khitem->t.rowid < kds_hash->nitems); - oj_map[khitem->t.rowid] = true; - } - l_state = __kds_packed((char *)khitem - (char *)kds_hash); - } - else - { - if (kmrels->chunks[depth-1].left_outer && - l_state != UINT_MAX && !matched) - { - /* load NULL values on the inner portion */ - kexp = SESSION_KEXP_JOIN_LOAD_VARS(kcxt->session, depth-1); - ExecLoadVarsHeapTuple(kcxt, kexp, depth, kds_hash, NULL); - tuple_is_valid = true; - } - l_state = UINT_MAX; - } - /* error checks */ - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return -1; - /* save the result on the destination buffer */ - mask = __ballot_sync(__activemask(), tuple_is_valid); - if (LaneId() == 0) - { - write_pos = WARP_WRITE_POS(wp,depth); - WARP_WRITE_POS(wp,depth) += __popc(mask); - } - write_pos = __shfl_sync(__activemask(), write_pos, 0); - mask &= ((1U << LaneId()) - 1); - write_pos += __popc(mask); - if (tuple_is_valid) - { - index = write_pos % UNIT_TUPLES_PER_DEPTH; - memcpy(dst_kvars_addr_wp + index * kcxt->kvars_nbytes, - kcxt->kvars_slot, - kcxt->kvars_nbytes); - } - __syncwarp(); - if (WARP_WRITE_POS(wp,depth) >= WARP_READ_POS(wp,depth) + warpSize) - return depth+1; - return depth; -} - -/* - * GPU Projection - */ -PUBLIC_FUNCTION(int) -execGpuJoinProjection(kern_context *kcxt, - kern_warp_context *wp, - int n_rels, /* index of read/write-pos */ - kern_data_store *kds_dst, - kern_expression *kexp_projection, - char *kvars_addr_wp, - bool *p_try_suspend) -{ - uint32_t write_pos = WARP_WRITE_POS(wp,n_rels); - uint32_t read_pos = WARP_READ_POS(wp,n_rels); - uint32_t count; - uint32_t mask; - uint32_t row_id; - uint32_t offset; - int tupsz = 0; - int total_sz = 0; - bool try_suspend = false; - union { - struct { - uint32_t nitems; - uint32_t 
usage; - } i; - uint64_t v64; - } oldval, curval, newval; - - /* - * The previous depth still may produce new tuples, and number of - * the current result tuples is not sufficient to run projection. - */ - if (wp->scan_done <= n_rels && read_pos + warpSize > write_pos) - return n_rels; - - read_pos += LaneId(); - if (read_pos < write_pos) - { - int index = (read_pos % UNIT_TUPLES_PER_DEPTH); - - kcxt->kvars_slot = (kern_variable *) - (kvars_addr_wp + index * kcxt->kvars_nbytes); - kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); - tupsz = kern_estimate_heaptuple(kcxt, - kexp_projection, - kds_dst); - if (tupsz < 0) - STROM_ELOG(kcxt, "unable to compute tuple size"); - } - /* error checks */ - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return -1; - /* allocation of the destination buffer */ - assert(kds_dst->format == KDS_FORMAT_ROW); - mask = __ballot_sync(__activemask(), tupsz > 0); - count = __popc(mask); - mask &= ((1U << LaneId()) - 1); - row_id = __popc(mask); - assert(tupsz == 0 || row_id < count); - - offset = __reduce_stair_add_sync(tupsz, &total_sz); - if (LaneId() == 0) - { - curval.i.nitems = kds_dst->nitems; - curval.i.usage = kds_dst->usage; - do { - newval = oldval = curval; - newval.i.nitems += count; - newval.i.usage += __kds_packed(total_sz); - - if (KDS_HEAD_LENGTH(kds_dst) + - MAXALIGN(sizeof(uint32_t) * newval.i.nitems) + - __kds_unpack(newval.i.usage) > kds_dst->length) - { - try_suspend = true; - break; - } - } while ((curval.v64 = atomicCAS((unsigned long long *)&kds_dst->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - } - oldval.v64 = __shfl_sync(__activemask(), oldval.v64, 0); - row_id += oldval.i.nitems; - /* data store has no space? */ - if (__any_sync(__activemask(), try_suspend)) - { - *p_try_suspend = true; - return -1; - } - /* write out the tuple */ - if (tupsz > 0) - { - kern_tupitem *tupitem; - - offset += __kds_unpack(oldval.i.usage); - KDS_GET_ROWINDEX(kds_dst)[row_id] = __kds_packed(offset); - tupitem = (kern_tupitem *) - ((char *)kds_dst + kds_dst->length - offset); - tupitem->rowid = row_id; - tupitem->t_len = kern_form_heaptuple(kcxt, - kexp_projection, - kds_dst, - &tupitem->htup); - } - /* update the read position */ - if (LaneId() == 0) - { - WARP_READ_POS(wp,n_rels) += count; - assert(WARP_WRITE_POS(wp,n_rels) >= WARP_READ_POS(wp,n_rels)); - } - __syncwarp(); - if (wp->scan_done <= n_rels) - { - if (WARP_WRITE_POS(wp,n_rels) < WARP_READ_POS(wp,n_rels) + warpSize) - return n_rels; /* back to the previous depth */ - } - else - { - if (WARP_READ_POS(wp,n_rels) >= WARP_WRITE_POS(wp,n_rels)) - return -1; /* ok, end of GpuJoin */ - } - return n_rels + 1; /* elsewhere, try again? */ -} - -/* - * kern_gpujoin_main - */ -KERNEL_FUNCTION(void) -kern_gpujoin_main(kern_session_info *session, - kern_gputask *kgtask, - kern_multirels *kmrels, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_data_store *kds_dst) -{ - kern_context *kcxt; - kern_warp_context *wp, *wp_saved; - char *kvars_addr_wp; - uint32_t kvars_chunksz; - uint32_t *l_state; - bool *matched; - uint32_t wp_base_sz; - uint32_t n_rels = (kmrels ? 
kmrels->num_rels : 0); - int depth; - __shared__ uint32_t smx_row_count; - - assert(kgtask->kvars_nslots == session->kcxt_kvars_nslots && - kgtask->kvars_nbytes == session->kcxt_kvars_nbytes && - kgtask->n_rels == n_rels); - /* setup execution context */ - INIT_KERNEL_CONTEXT(kcxt, session); - wp_base_sz = __KERN_WARP_CONTEXT_BASESZ(n_rels); - wp = (kern_warp_context *)SHARED_WORKMEM(wp_base_sz, get_local_id() / warpSize); - wp_saved = KERN_GPUTASK_WARP_CONTEXT(kgtask); - l_state = KERN_GPUTASK_LSTATE_ARRAY(kgtask); - matched = KERN_GPUTASK_MATCHED_ARRAY(kgtask); - kvars_chunksz = kcxt->kvars_nbytes * UNIT_TUPLES_PER_DEPTH; - kvars_addr_wp = (char *)wp_saved + wp_base_sz; - - if (kgtask->resume_context) - { - /* resume the warp-context from the previous execution */ - if (LaneId() == 0) - memcpy(wp, wp_saved, wp_base_sz); - if (get_local_id() == 0) - smx_row_count = wp->smx_row_count; - depth = __shfl_sync(__activemask(), wp->depth, 0); - } - else - { - /* zero clear the wp */ - if (LaneId() == 0) - memset(wp, 0, wp_base_sz); - if (get_local_id() == 0) - smx_row_count = 0; - depth = 0; - if (l_state) - memset(l_state, 0, sizeof(void *) * kcxt->kvars_nslots); - if (matched) - memset(matched, 0, sizeof(bool) * kcxt->kvars_nslots); - } - __syncthreads(); - - /* main logic of GpuJoin */ - while (depth >= 0) - { - kcxt_reset(kcxt); - if (depth == 0) - { - /* LOAD FROM THE SOURCE */ - depth = execGpuScanLoadSource(kcxt, wp, - kds_src, - kds_extra, - SESSION_KEXP_SCAN_LOAD_VARS(session), - SESSION_KEXP_SCAN_QUALS(session), - kvars_addr_wp, /* depth=0 */ - &smx_row_count); - } - else if (depth > n_rels) - { - bool try_suspend = false; - - assert(depth == n_rels+1); - if (session->xpucode_projection) - { - /* PROJECTION */ - depth = execGpuJoinProjection(kcxt, wp, - n_rels, - kds_dst, - SESSION_KEXP_PROJECTION(session), - kvars_addr_wp + kvars_chunksz * n_rels, - &try_suspend); - } - else - { - /* PRE-AGG */ - depth = execGpuPreAggGroupBy(kcxt, wp, - n_rels, - kds_dst, - kvars_addr_wp + kvars_chunksz * n_rels, - &try_suspend); - } - if (__any_sync(__activemask(), try_suspend)) - { - if (LaneId() == 0) - atomicAdd(&kgtask->suspend_count, 1); - assert(depth < 0); - } - } - else if (kmrels->chunks[depth-1].is_nestloop) - { - /* NEST-LOOP */ - depth = execGpuJoinNestLoop(kcxt, wp, - kmrels, - depth, - kvars_addr_wp + kvars_chunksz * (depth-1), - kvars_addr_wp + kvars_chunksz * depth, - l_state[depth-1], /* call by reference */ - matched[depth-1]); /* call by reference */ - } -#if 0 - else if (kmrels->chunks[depth-1].gist_offset != 0) - { - /* GiST-INDEX-JOIN */ - depth = execGpuJoinGiSTJoin(kcxt, wp, ...); - } -#endif - else - { - /* HASH-JOIN */ - depth = execGpuJoinHashJoin(kcxt, wp, - kmrels, - depth, - kvars_addr_wp + kvars_chunksz * (depth-1), - kvars_addr_wp + kvars_chunksz * depth, - l_state[depth-1], /* call by reference */ - matched[depth-1]); /* call by reference */ - } - assert(__shfl_sync(__activemask(), depth, 0) == depth); - /* bailout if any error status */ - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - break; - } - __syncthreads(); - /* suspend the execution context */ - if (LaneId() == 0) - { - wp->depth = depth; - wp->smx_row_count = smx_row_count; - memcpy(wp_saved, wp, wp_base_sz); - } - STROM_WRITEBACK_ERROR_STATUS(&kgtask->kerror, kcxt); -} diff --git a/next/cuda_gpupreagg.cu b/next/cuda_gpupreagg.cu deleted file mode 100644 index d0a33814c..000000000 --- a/next/cuda_gpupreagg.cu +++ /dev/null @@ -1,1750 +0,0 @@ -/* - * cuda_gpupreagg.cu - * - * Device 
implementation of GpuPreAgg - ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. - */ -#include "cuda_common.h" -#include "float2.h" - -/* - * Atomic operations - */ -INLINE_FUNCTION(uint32_t) -__atomic_write_uint32(uint32_t *ptr, uint32_t ival) -{ - return atomicExch((unsigned int *)ptr, ival); -} - -INLINE_FUNCTION(uint64_t) -__atomic_write_uint64(uint64_t *ptr, uint64_t ival) -{ - return atomicExch((unsigned long long int *)ptr, ival); -} - -INLINE_FUNCTION(uint32_t) -__atomic_add_uint32(uint32_t *ptr, uint32_t ival) -{ - return atomicAdd((unsigned int *)ptr, (unsigned int)ival); -} - -INLINE_FUNCTION(uint64_t) -__atomic_add_uint64(uint64_t *ptr, uint64_t ival) -{ - return atomicAdd((unsigned long long *)ptr, (unsigned long long)ival); -} - -INLINE_FUNCTION(int64_t) -__atomic_add_int64(int64_t *ptr, int64_t ival) -{ - return atomicAdd((unsigned long long int *)ptr, (unsigned long long int)ival); -} - -INLINE_FUNCTION(float8_t) -__atomic_add_fp64(float8_t *ptr, float8_t fval) -{ - return atomicAdd((double *)ptr, (double)fval); -} - -INLINE_FUNCTION(int64_t) -__atomic_min_int64(int64_t *ptr, int64_t ival) -{ - return atomicMin((long long int *)ptr, (long long int)ival); -} - -INLINE_FUNCTION(int64_t) -__atomic_max_int64(int64_t *ptr, int64_t ival) -{ - return atomicMax((long long int *)ptr, (long long int)ival); -} - -INLINE_FUNCTION(float8_t) -__atomic_min_fp64(float8_t *ptr, float8_t fval) -{ - union { - unsigned long long ival; - float8_t fval; - } oldval, curval, newval; - - newval.fval = fval; - curval.fval = __volatileRead(ptr); - while (newval.fval < curval.fval) - { - oldval = curval; - curval.ival = atomicCAS((unsigned long long *)ptr, - oldval.ival, - newval.ival); - if (curval.ival == oldval.ival) - break; - } - return curval.fval; -} - -INLINE_FUNCTION(float8_t) -__atomic_max_fp64(float8_t *ptr, float8_t fval) -{ - union { - unsigned long long ival; - float8_t fval; - } oldval, curval, newval; - - newval.fval = fval; - curval.fval = __volatileRead(ptr); - while (newval.fval > curval.fval) - { - oldval = curval; - curval.ival = atomicCAS((unsigned long long *)ptr, - oldval.ival, - newval.ival); - if (curval.ival == oldval.ival) - break; - } - return curval.fval; -} - -INLINE_FUNCTION(uint32_t) -__atomic_cas_uint32(uint32_t *ptr, uint32_t comp, uint32_t newval) -{ - return atomicCAS((unsigned int *)ptr, - (unsigned int)comp, - (unsigned int)newval); -} - -INLINE_FUNCTION(uint64_t) -__atomic_cas_uint64(uint64_t *ptr, uint64_t comp, uint64_t newval) -{ - return atomicCAS((unsigned long long int *)ptr, - (unsigned long long int)comp, - (unsigned long long int)newval); -} - -/* - * __writeOutOneTupleGroupKey - */ -STATIC_FUNCTION(int32_t) -__writeOutOneTupleGroupKey(kern_context *kcxt, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - char *buffer) -{ - kern_variable *kvar; - int vclass; - int32_t nbytes; - - assert(desc->action == KAGG_ACTION__VREF && - desc->arg0_slot_id >= 0 && - desc->arg0_slot_id < kcxt->kvars_nslots); - vclass = kcxt->kvars_class[desc->arg0_slot_id]; - kvar = &kcxt->kvars_slot[desc->arg0_slot_id]; - switch (vclass) - { - case KVAR_CLASS__NULL: - return 0; - - case KVAR_CLASS__INLINE: - assert(cmeta->attlen >= 0 && - cmeta->attlen <= sizeof(kern_variable)); - if (buffer) - memcpy(buffer, kvar, cmeta->attlen); - return cmeta->attlen; - - case KVAR_CLASS__VARLENA: - assert(cmeta->attlen == 
-1); - nbytes = VARSIZE_ANY(kvar->ptr); - if (buffer) - memcpy(buffer, kvar->ptr, nbytes); - return nbytes; - - case KVAR_CLASS__XPU_DATUM: - { - xpu_datum_t *xdatum = (xpu_datum_t *) - ((char *)kcxt->kvars_slot + kvar->xpu.offset); - const xpu_datum_operators *expr_ops = xdatum->expr_ops; - - if (XPU_DATUM_ISNULL(xdatum)) - return 0; - assert(expr_ops->xpu_type_code == kvar->xpu.type_code); - return expr_ops->xpu_datum_write(kcxt, buffer, xdatum); - } - - default: - if (vclass < 0) - return -1; - if (cmeta->attlen >= 0) - { - if (buffer) - { - nbytes = Min(vclass, cmeta->attlen); - memcpy(buffer, kvar->ptr, nbytes); - if (nbytes < cmeta->attlen) - memset(buffer + nbytes, 0, cmeta->attlen - nbytes); - } - return cmeta->attlen; - } - else if (cmeta->attlen == -1) - { - nbytes = VARHDRSZ + vclass; - if (buffer) - { - memcpy(buffer+VARHDRSZ, kvar->ptr, vclass); - SET_VARSIZE(buffer, nbytes); - } - return nbytes; - } - } - return -1; -} - -STATIC_FUNCTION(int32_t) -__writeOutOneTuplePreAgg(kern_context *kcxt, - kern_data_store *kds_final, - HeapTupleHeaderData *htup, - kern_expression *kexp_actions) -{ - int nattrs = Min(kds_final->ncols, kexp_actions->u.pagg.nattrs); - uint32_t t_hoff, t_next; - uint16_t t_infomask = HEAP_HASNULL; - char *buffer = NULL; - - t_hoff = MAXALIGN(offsetof(HeapTupleHeaderData, - t_bits) + BITMAPLEN(nattrs)); - if (htup) - { - memset(htup, 0, t_hoff); - htup->t_choice.t_datum.datum_typmod = kds_final->tdtypmod; - htup->t_choice.t_datum.datum_typeid = kds_final->tdtypeid; - htup->t_ctid.ip_blkid.bi_hi = 0xffff; /* InvalidBlockNumber */ - htup->t_ctid.ip_blkid.bi_lo = 0xffff; - htup->t_ctid.ip_posid = 0; /* InvalidOffsetNumber */ - htup->t_infomask2 = (nattrs & HEAP_NATTS_MASK); - htup->t_hoff = t_hoff; - } - /* walk on the columns */ - for (int j=0; j < nattrs; j++) - { - kern_aggregate_desc *desc = &kexp_actions->u.pagg.desc[j]; - kern_colmeta *cmeta = &kds_final->colmeta[j]; - int nbytes; - - assert((char *)cmeta > (char *)kds_final && - (char *)cmeta < (char *)kds_final + kds_final->length); - assert(cmeta->attalign > 0 && cmeta->attalign <= 8); - t_next = TYPEALIGN(cmeta->attalign, t_hoff); - if (htup) - { - if (t_next > t_hoff) - memset((char *)htup + t_hoff, 0, t_next - t_hoff); - buffer = (char *)htup + t_next; - } - - switch (desc->action) - { - case KAGG_ACTION__VREF: - nbytes = __writeOutOneTupleGroupKey(kcxt, cmeta, desc, buffer); - if (nbytes < 0) - return -1; - break; - - case KAGG_ACTION__NROWS_ANY: - case KAGG_ACTION__NROWS_COND: - case KAGG_ACTION__PSUM_INT: - nbytes = sizeof(int64_t); - if (buffer) - *((int64_t *)buffer) = 0; - break; - - case KAGG_ACTION__PSUM_FP: - nbytes = sizeof(float8_t); - if (buffer) - *((float8_t *)buffer) = 0.0; - break; - - case KAGG_ACTION__PMIN_INT: - nbytes = sizeof(kagg_state__pminmax_int64_packed); - if (buffer) - { - kagg_state__pminmax_int64_packed *r = - (kagg_state__pminmax_int64_packed *)buffer; - r->nitems = 0; - r->value = LONG_MAX; - SET_VARSIZE(r, sizeof(kagg_state__pminmax_int64_packed)); - } - t_infomask |= HEAP_HASVARWIDTH; - break; - - case KAGG_ACTION__PMAX_INT: - nbytes = sizeof(kagg_state__pminmax_int64_packed); - if (buffer) - { - kagg_state__pminmax_int64_packed *r = - (kagg_state__pminmax_int64_packed *)buffer; - r->nitems = 0; - r->value = LONG_MIN; - SET_VARSIZE(r, sizeof(kagg_state__pminmax_int64_packed)); - } - t_infomask |= HEAP_HASVARWIDTH; - break; - - case KAGG_ACTION__PMIN_FP: - nbytes = sizeof(kagg_state__pminmax_fp64_packed); - if (buffer) - { - kagg_state__pminmax_fp64_packed *r = - 
(kagg_state__pminmax_fp64_packed *)buffer; - r->nitems = 0; - r->value = DBL_MAX; - SET_VARSIZE(r, sizeof(kagg_state__pminmax_fp64_packed)); - } - t_infomask |= HEAP_HASVARWIDTH; - break; - - case KAGG_ACTION__PMAX_FP: - nbytes = sizeof(kagg_state__pminmax_fp64_packed); - if (buffer) - { - kagg_state__pminmax_fp64_packed *r = - (kagg_state__pminmax_fp64_packed *)buffer; - r->nitems = 0; - r->value = -DBL_MAX; - SET_VARSIZE(r, sizeof(kagg_state__pminmax_fp64_packed)); - } - t_infomask |= HEAP_HASVARWIDTH; - break; - - case KAGG_ACTION__PAVG_INT: - nbytes = sizeof(kagg_state__pavg_int_packed); - if (buffer) - { - memset(buffer, 0, sizeof(kagg_state__pavg_int_packed)); - SET_VARSIZE(buffer, sizeof(kagg_state__pavg_int_packed)); - } - t_infomask |= HEAP_HASVARWIDTH; - break; - - case KAGG_ACTION__PAVG_FP: - nbytes = sizeof(kagg_state__pavg_fp_packed); - if (buffer) - { - memset(buffer, 0, sizeof(kagg_state__pavg_fp_packed)); - SET_VARSIZE(buffer, sizeof(kagg_state__pavg_fp_packed)); - } - t_infomask |= HEAP_HASVARWIDTH; - break; - - case KAGG_ACTION__STDDEV: - nbytes = sizeof(kagg_state__stddev_packed); - if (buffer) - { - memset(buffer, 0, sizeof(kagg_state__stddev_packed)); - SET_VARSIZE(buffer, sizeof(kagg_state__stddev_packed)); - } - t_infomask |= HEAP_HASVARWIDTH; - break; - - case KAGG_ACTION__COVAR: - nbytes = sizeof(kagg_state__covar_packed); - if (buffer) - { - memset(buffer, 0, sizeof(kagg_state__covar_packed)); - SET_VARSIZE(buffer, sizeof(kagg_state__covar_packed)); - } - t_infomask |= HEAP_HASVARWIDTH; - break; - - default: - STROM_ELOG(kcxt, "unknown xpuPreAgg action"); - return -1; - } - if (htup && nbytes > 0) - htup->t_bits[j>>3] |= (1<<(j&7)); - t_hoff = t_next + nbytes; - } - - if (htup) - htup->t_infomask = t_infomask; - return t_hoff; -} - -/* - * __update_nogroups__nrows_any - */ -INLINE_FUNCTION(void) -__update_nogroups__nrows_any(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - uint32_t mask; - - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (LaneId() == 0) - __atomic_add_uint64((uint64_t *)buffer, __popc(mask)); -} - -/* - * __update_nogroups__nrows_cond - */ -INLINE_FUNCTION(void) -__update_nogroups__nrows_cond(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - uint32_t mask; - - if (kvars_is_valid) - { - if (kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__NULL) - kvars_is_valid = false; - } - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (LaneId() == 0) - __atomic_add_uint64((uint64_t *)buffer, __popc(mask)); -} - -/* - * __update_nogroups__XXXX - */ -INLINE_FUNCTION(void) -__update_nogroups__pmin_int(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - int64_t ival = LONG_MAX; - uint32_t mask; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - ival = kcxt->kvars_slot[slot_id].i64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (mask != 0) - { - kagg_state__pminmax_int64_packed *r = - (kagg_state__pminmax_int64_packed *)buffer; - - ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0001)); - ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0002)); - ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0004)); - ival = Min(ival, 
__shfl_xor_sync(__activemask(), ival, 0x0008)); - ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0010)); - - if (LaneId() == 0) - { - __atomic_add_uint32(&r->nitems, __popc(mask)); - __atomic_min_int64(&r->value, ival); - } - } -} - -/* - * __update_nogroups__pmax_int - */ -INLINE_FUNCTION(void) -__update_nogroups__pmax_int(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - int64_t ival = LONG_MIN; - uint32_t mask; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - ival = kcxt->kvars_slot[slot_id].i64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (mask != 0) - { - kagg_state__pminmax_int64_packed *r = - (kagg_state__pminmax_int64_packed *)buffer; - - ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0001)); - ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0002)); - ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0004)); - ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0008)); - ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0010)); - - if (LaneId() == 0) - { - __atomic_add_uint32(&r->nitems, __popc(mask)); - __atomic_max_int64(&r->value, ival); - } - } -} - -/* - * __update_nogroups__pmin_fp - */ -INLINE_FUNCTION(void) -__update_nogroups__pmin_fp(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - float8_t fval = DBL_MAX; - uint32_t mask; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - fval = kcxt->kvars_slot[slot_id].fp64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (mask != 0) - { - kagg_state__pminmax_fp64_packed *r = - (kagg_state__pminmax_fp64_packed *)buffer; - - fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0001)); - fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0002)); - fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0004)); - fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0008)); - fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0010)); - - if (LaneId() == 0) - { - __atomic_add_uint32(&r->nitems, __popc(mask)); - __atomic_min_fp64(&r->value, fval); - } - } -} - -/* - * __update_nogroups__pmax_fp - */ -INLINE_FUNCTION(void) -__update_nogroups__pmax_fp(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - float8_t fval = -DBL_MAX; - uint32_t mask; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - fval = kcxt->kvars_slot[slot_id].fp64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (mask != 0) - { - kagg_state__pminmax_fp64_packed *r = - (kagg_state__pminmax_fp64_packed *)buffer; - - fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0001)); - fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0002)); - fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0004)); - fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0008)); - fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0010)); - - 
if (LaneId() == 0) - { - __atomic_add_uint32(&r->nitems, __popc(mask)); - __atomic_max_fp64(&r->value, fval); - } - } -} - -/* - * __update_nogroups__psum_int - */ -INLINE_FUNCTION(void) -__update_nogroups__psum_int(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - int64_t ival = 0; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - ival = kcxt->kvars_slot[slot_id].i64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - if (__any_sync(__activemask(), kvars_is_valid)) - { - ival += __shfl_xor_sync(__activemask(), ival, 0x0001); - ival += __shfl_xor_sync(__activemask(), ival, 0x0002); - ival += __shfl_xor_sync(__activemask(), ival, 0x0004); - ival += __shfl_xor_sync(__activemask(), ival, 0x0008); - ival += __shfl_xor_sync(__activemask(), ival, 0x0010); - if (LaneId() == 0) - __atomic_add_int64((int64_t *)buffer, ival); - } -} -/* - * __update_nogroups__psum_fp - */ -INLINE_FUNCTION(void) -__update_nogroups__psum_fp(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - float8_t fval = 0.0; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - fval = kcxt->kvars_slot[slot_id].fp64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - if (__any_sync(__activemask(), kvars_is_valid)) - { - fval += __shfl_xor_sync(__activemask(), fval, 0x0001); - fval += __shfl_xor_sync(__activemask(), fval, 0x0002); - fval += __shfl_xor_sync(__activemask(), fval, 0x0004); - fval += __shfl_xor_sync(__activemask(), fval, 0x0008); - fval += __shfl_xor_sync(__activemask(), fval, 0x0010); - if (LaneId() == 0) - __atomic_add_fp64((float8_t *)buffer, fval); - } -} - -/* - * __update_nogroups__pavg_int - */ -INLINE_FUNCTION(void) -__update_nogroups__pavg_int(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - int64_t ival = 0; - uint32_t mask; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - ival = kcxt->kvars_slot[slot_id].i64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (mask != 0) - { - ival += __shfl_xor_sync(__activemask(), ival, 0x0001); - ival += __shfl_xor_sync(__activemask(), ival, 0x0002); - ival += __shfl_xor_sync(__activemask(), ival, 0x0004); - ival += __shfl_xor_sync(__activemask(), ival, 0x0008); - ival += __shfl_xor_sync(__activemask(), ival, 0x0010); - if (LaneId() == 0) - { - kagg_state__pavg_int_packed *r = - (kagg_state__pavg_int_packed *)buffer; - __atomic_add_uint32(&r->nitems, __popc(mask)); - __atomic_add_int64(&r->sum, ival); - } - } -} - -/* - * __update_nogroups__pavg_fp - */ -INLINE_FUNCTION(void) -__update_nogroups__pavg_fp(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - float8_t fval = 0; - uint32_t mask; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - fval = kcxt->kvars_slot[slot_id].fp64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - mask = 
__ballot_sync(__activemask(), kvars_is_valid); - if (mask != 0) - { - fval += __shfl_xor_sync(__activemask(), fval, 0x0001); - fval += __shfl_xor_sync(__activemask(), fval, 0x0002); - fval += __shfl_xor_sync(__activemask(), fval, 0x0004); - fval += __shfl_xor_sync(__activemask(), fval, 0x0008); - fval += __shfl_xor_sync(__activemask(), fval, 0x0010); - if (LaneId() == 0) - { - kagg_state__pavg_fp_packed *r = - (kagg_state__pavg_fp_packed *)buffer; - __atomic_add_uint32(&r->nitems, __popc(mask)); - __atomic_add_fp64(&r->sum, fval); - } - } -} -/* - * __update_nogroups__pstddev - */ -INLINE_FUNCTION(void) -__update_nogroups__pstddev(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - float8_t sum_x = 0.0; - uint32_t mask; - - if (kvars_is_valid) - { - int slot_id = desc->arg0_slot_id; - int vclass = kcxt->kvars_class[slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - sum_x = kcxt->kvars_slot[slot_id].fp64; - else - { - assert(vclass == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (mask != 0) - { - float8_t sum_x2 = sum_x * sum_x; - - /* sum_x */ - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0001); - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0002); - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0004); - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0008); - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0010); - /* sum_x2 */ - sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0001); - sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0002); - sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0004); - sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0008); - sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0010); - - if (LaneId() == 0) - { - kagg_state__stddev_packed *r = - (kagg_state__stddev_packed *)buffer; - __atomic_add_uint32(&r->nitems, __popc(mask)); - __atomic_add_fp64(&r->sum_x, sum_x); - __atomic_add_fp64(&r->sum_x2, sum_x2); - } - } -} - -/* - * __update_nogroups__pcovar - */ -INLINE_FUNCTION(void) -__update_nogroups__pcovar(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc, - bool kvars_is_valid) -{ - float8_t sum_x = 0.0; - float8_t sum_y = 0.0; - uint32_t mask; - - if (kvars_is_valid) - { - if (kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__INLINE && - kcxt->kvars_class[desc->arg1_slot_id] == KVAR_CLASS__INLINE) - { - sum_x = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - sum_y = kcxt->kvars_slot[desc->arg1_slot_id].fp64; - } - else - { - assert(kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__NULL || - kcxt->kvars_class[desc->arg1_slot_id] == KVAR_CLASS__NULL); - kvars_is_valid = false; - } - } - mask = __ballot_sync(__activemask(), kvars_is_valid); - if (mask != 0) - { - float8_t sum_xx = sum_x * sum_x; - float8_t sum_xy = sum_x * sum_y; - float8_t sum_yy = sum_y * sum_y; - - /* sum_x */ - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0001); - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0002); - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0004); - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0008); - sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0010); - - /* sum_y */ - sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0001); - sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0002); - sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0004); - sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0008); - sum_y += 
__shfl_xor_sync(__activemask(), sum_y, 0x0010); - - /* sum_xx */ - sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0001); - sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0002); - sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0004); - sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0008); - sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0010); - - /* sum_xy */ - sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0001); - sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0002); - sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0004); - sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0008); - sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0010); - - /* sum_yy */ - sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0001); - sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0002); - sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0004); - sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0008); - sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0010); - - if (LaneId() == 0) - { - kagg_state__covar_packed *r = - (kagg_state__covar_packed *)buffer; - __atomic_add_uint32(&r->nitems, __popc(mask)); - __atomic_add_fp64(&r->sum_x, sum_x); - __atomic_add_fp64(&r->sum_xx, sum_xx); - __atomic_add_fp64(&r->sum_y, sum_y); - __atomic_add_fp64(&r->sum_yy, sum_yy); - __atomic_add_fp64(&r->sum_xy, sum_xy); - } - } -} - -/* - * __updateOneTupleNoGroups - */ -STATIC_FUNCTION(void) -__updateOneTupleNoGroups(kern_context *kcxt, - kern_data_store *kds_final, - bool kvars_is_valid, - HeapTupleHeaderData *htup, - kern_expression *kexp_groupby_actions) -{ - int nattrs = (htup->t_infomask2 & HEAP_NATTS_MASK); - bool heap_hasnull = ((htup->t_infomask & HEAP_HASNULL) != 0); - uint32_t t_hoff; - char *buffer = NULL; - - t_hoff = offsetof(HeapTupleHeaderData, t_bits); - if (heap_hasnull) - t_hoff += BITMAPLEN(nattrs); - t_hoff = MAXALIGN(t_hoff); - - for (int j=0; j < nattrs; j++) - { - kern_aggregate_desc *desc = &kexp_groupby_actions->u.pagg.desc[j]; - kern_colmeta *cmeta = &kds_final->colmeta[j]; - - if (heap_hasnull && att_isnull(j, htup->t_bits)) - { - /* only grouping-key may have NULL */ - assert(desc->action == KAGG_ACTION__VREF); - continue; - } - - if (cmeta->attlen > 0) - t_hoff = TYPEALIGN(cmeta->attalign, t_hoff); - else if (!VARATT_NOT_PAD_BYTE((char *)htup + t_hoff)) - t_hoff = TYPEALIGN(cmeta->attalign, t_hoff); - buffer = ((char *)htup + t_hoff); - if (cmeta->attlen > 0) - t_hoff += cmeta->attlen; - else - t_hoff += VARSIZE_ANY(buffer); - - switch (desc->action) - { - case KAGG_ACTION__NROWS_ANY: - __update_nogroups__nrows_any(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__NROWS_COND: - __update_nogroups__nrows_cond(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__PMIN_INT: - __update_nogroups__pmin_int(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__PMAX_INT: - __update_nogroups__pmax_int(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__PMIN_FP: - __update_nogroups__pmin_fp(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__PMAX_FP: - __update_nogroups__pmax_fp(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__PSUM_INT: - __update_nogroups__psum_int(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__PSUM_FP: - __update_nogroups__psum_fp(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__PAVG_INT: - __update_nogroups__pavg_int(kcxt, 
buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__PAVG_FP: - __update_nogroups__pavg_fp(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__STDDEV: - __update_nogroups__pstddev(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - case KAGG_ACTION__COVAR: - __update_nogroups__pcovar(kcxt, buffer, - cmeta, desc, - kvars_is_valid); - break; - default: - /* - * No more partial aggregation actions exist after the grouping-keys - */ - return; - } - } -} - -/* - * __insertOneTupleNoGroups - */ -STATIC_FUNCTION(kern_tupitem *) -__insertOneTupleNoGroups(kern_context *kcxt, - kern_data_store *kds_final, - kern_expression *kexp_groupby_actions) -{ - kern_tupitem *tupitem; - int32_t tupsz; - uint32_t required; - uint32_t usage; - size_t total_sz; - - assert(kds_final->format == KDS_FORMAT_ROW && - kds_final->hash_nslots == 0); - /* estimate length */ - tupsz = __writeOutOneTuplePreAgg(kcxt, kds_final, NULL, - kexp_groupby_actions); - assert(tupsz > 0); - required = MAXALIGN(offsetof(kern_tupitem, htup) + tupsz); - assert(required < 1000); - total_sz = (KDS_HEAD_LENGTH(kds_final) + - MAXALIGN(sizeof(uint32_t)) + - required + __kds_unpack(kds_final->usage)); - if (total_sz > kds_final->length) - return NULL; /* out of memory */ - usage = __atomic_add_uint32(&kds_final->usage, __kds_packed(required)); - tupitem = (kern_tupitem *)((char *)kds_final - + kds_final->length - - __kds_unpack(usage) - - required); - - __writeOutOneTuplePreAgg(kcxt, kds_final, - &tupitem->htup, - kexp_groupby_actions); - tupitem->t_len = tupsz; - tupitem->rowid = 0; - __atomic_write_uint32(KDS_GET_ROWINDEX(kds_final), - __kds_packed((char *)kds_final - + kds_final->length - - (char *)tupitem)); - return tupitem; -} - -STATIC_FUNCTION(bool) -__execGpuPreAggNoGroups(kern_context *kcxt, - kern_data_store *kds_final, - bool kvars_is_valid, - kern_expression *kexp_groupby_actions, - bool *p_try_suspend) -{ - kern_tupitem *tupitem; - bool try_suspend = false; - - assert(kds_final->format == KDS_FORMAT_ROW); - assert(kexp_groupby_actions->opcode == FuncOpCode__AggFuncs); - for (;;) - { - if (LaneId() == 0) - { - uint32_t nitems = __volatileRead(&kds_final->nitems); - uint32_t oldval; - - if (nitems == 1) - { - /* normal case; destination tuple already exists */ - tupitem = KDS_GET_TUPITEM(kds_final, 0); - assert(tupitem != NULL); - } - else if (nitems == 0) - { - oldval = __atomic_cas_uint32(&kds_final->nitems, 0, UINT_MAX); - if (oldval == 0) - { - /* LOCKED */ - tupitem = __insertOneTupleNoGroups(kcxt, kds_final, - kexp_groupby_actions); - if (!tupitem) - { - try_suspend = true; - /* UNLOCK */ - oldval = __atomic_write_uint32(&kds_final->nitems, 0); - assert(oldval == UINT_MAX); - } - else - { - /* UNLOCK */ - oldval = __atomic_write_uint32(&kds_final->nitems, 1); - assert(oldval == UINT_MAX); - } - } - else - { - assert(oldval == 0 || oldval == UINT_MAX); - tupitem = NULL; - } - } - else - { - assert(nitems == UINT_MAX); - /* work in progress - another thread is setting up the destination tuple */ - tupitem = NULL; - } - } - /* out of memory? */ - try_suspend = __shfl_sync(__activemask(), try_suspend, 0); - if (try_suspend) - { - *p_try_suspend = true; - return false; - }
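/*
 * Note: kds_final->nitems doubles as the lock word here. 0 means the
 * destination tuple is not built yet, UINT_MAX means another thread
 * took the lock (CAS 0 -> UINT_MAX) and is building it, and 1 means
 * it is ready.  Lane 0 performs all of the state transitions; the
 * other lanes simply spin in this loop until the shuffle below
 * broadcasts a non-NULL tupitem (or the suspend flag).
 */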
- /* is the destination tuple ready? */ - tupitem = (kern_tupitem *)__shfl_sync(__activemask(), (uintptr_t)tupitem, 0); - if (tupitem != NULL) - break; - } - /* update partial aggregation */ - __updateOneTupleNoGroups(kcxt, kds_final, - kvars_is_valid, - &tupitem->htup, - kexp_groupby_actions); - return true; -} - - - - -/* - * __insertOneTupleGroupBy - */ -STATIC_FUNCTION(kern_hashitem *) -__insertOneTupleGroupBy(kern_context *kcxt, - kern_data_store *kds_final, - kern_expression *kexp_groupby_actions) -{ - kern_hashitem *hitem; - int32_t tupsz; - uint32_t required; - union { - uint64_t u64; - struct { - uint32_t nitems; - uint32_t usage; - } kds; - } oldval, curval, newval; - - assert(kds_final->format == KDS_FORMAT_HASH && - kds_final->hash_nslots > 0); - /* estimate length */ - tupsz = __writeOutOneTuplePreAgg(kcxt, kds_final, NULL, - kexp_groupby_actions); - assert(tupsz > 0); - required = MAXALIGN(offsetof(kern_hashitem, t.htup) + tupsz); - - /* expand kds_final */ - curval.kds.nitems = __volatileRead(&kds_final->nitems); - curval.kds.usage = __volatileRead(&kds_final->usage); - for (;;) - { - size_t total_sz; - - newval.kds.nitems = curval.kds.nitems + 1; - newval.kds.usage = curval.kds.usage + __kds_packed(required); - total_sz = (KDS_HEAD_LENGTH(kds_final) + - MAXALIGN(sizeof(uint32_t) * (kds_final->hash_nslots + - newval.kds.nitems)) + - __kds_unpack(newval.kds.usage)); - if (total_sz > kds_final->length) - return NULL; /* out of memory */ - oldval.u64 = __atomic_cas_uint64((uint64_t *)&kds_final->nitems, - curval.u64, - newval.u64); - if (oldval.u64 == curval.u64) - break; - curval.u64 = oldval.u64; - } - hitem = (kern_hashitem *)((char *)kds_final - + kds_final->length - - __kds_unpack(newval.kds.usage)); - __writeOutOneTuplePreAgg(kcxt, kds_final, - &hitem->t.htup, - kexp_groupby_actions); - hitem->t.t_len = tupsz; - hitem->t.rowid = newval.kds.nitems - 1; - KDS_GET_ROWINDEX(kds_final)[hitem->t.rowid] - = __kds_packed((char *)kds_final - + kds_final->length - - (char *)&hitem->t); - return hitem; -} - -/* - * __update_groupby__nrows_any - */ -INLINE_FUNCTION(void) -__update_groupby__nrows_any(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - __atomic_add_uint64((uint64_t *)buffer, 1); -} - -INLINE_FUNCTION(void) -__update_groupby__nrows_cond(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - if (kcxt->kvars_class[desc->arg0_slot_id] != KVAR_CLASS__NULL) - __atomic_add_uint64((uint64_t *)buffer, 1); -} - -INLINE_FUNCTION(void) -__update_groupby__pmin_int(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - kagg_state__pminmax_int64_packed *r = - (kagg_state__pminmax_int64_packed *)buffer; - int64_t ival = kcxt->kvars_slot[desc->arg0_slot_id].i64; - - __atomic_add_uint32(&r->nitems, 1); - __atomic_min_int64(&r->value, ival); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -} - -INLINE_FUNCTION(void) -__update_groupby__pmax_int(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - kagg_state__pminmax_int64_packed *r = - (kagg_state__pminmax_int64_packed *)buffer; - int64_t ival = kcxt->kvars_slot[desc->arg0_slot_id].i64; - - __atomic_add_uint32(&r->nitems, 1); - __atomic_max_int64(&r->value, ival); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -}
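/*
 * Aside: __insertOneTupleGroupBy above updates the two adjacent uint32
 * counters (nitems, usage) with a single 64-bit compare-and-swap, so the
 * buffer can grow without taking a lock.  A minimal sketch of the same
 * reservation pattern, assuming plain CUDA atomicCAS() on an unsigned
 * long long; the function and parameter names are illustrative only:
 */
INLINE_FUNCTION(bool)
__reserve_nitems_and_usage(unsigned long long *pair,	/* nitems/usage side by side */
						   uint32_t required,
						   uint32_t limit)
{
	union {
		unsigned long long u64;
		struct {
			uint32_t	nitems;
			uint32_t	usage;
		} kds;
	} oldval, curval, newval;

	curval.u64 = *((volatile unsigned long long *)pair);
	for (;;)
	{
		newval.kds.nitems = curval.kds.nitems + 1;
		newval.kds.usage  = curval.kds.usage + required;
		if (newval.kds.usage > limit)
			return false;	/* out of memory */
		oldval.u64 = atomicCAS(pair, curval.u64, newval.u64);
		if (oldval.u64 == curval.u64)
			return true;	/* won the race; both counters updated at once */
		curval.u64 = oldval.u64;	/* lost the race; retry with fresh values */
	}
}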
- -INLINE_FUNCTION(void) -__update_groupby__pmin_fp(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - kagg_state__pminmax_fp64_packed *r = - (kagg_state__pminmax_fp64_packed *)buffer; - float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - - __atomic_add_uint32(&r->nitems, 1); - __atomic_min_fp64(&r->value, fval); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -} - -INLINE_FUNCTION(void) -__update_groupby__pmax_fp(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - kagg_state__pminmax_fp64_packed *r = - (kagg_state__pminmax_fp64_packed *)buffer; - float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - - __atomic_add_uint32(&r->nitems, 1); - __atomic_max_fp64(&r->value, fval); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -} - -INLINE_FUNCTION(void) -__update_groupby__psum_int(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - int64_t ival = kcxt->kvars_slot[desc->arg0_slot_id].i64; - - __atomic_add_int64((int64_t *)buffer, ival); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -} - -INLINE_FUNCTION(void) -__update_groupby__psum_fp(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - - __atomic_add_fp64((float8_t *)buffer, fval); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -} - -INLINE_FUNCTION(void) -__update_groupby__pavg_int(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - kagg_state__pavg_int_packed *r = - (kagg_state__pavg_int_packed *)buffer; - int64_t ival = kcxt->kvars_slot[desc->arg0_slot_id].i64; - - __atomic_add_uint32(&r->nitems, 1); - __atomic_add_int64(&r->sum, ival); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -} - -INLINE_FUNCTION(void) -__update_groupby__pavg_fp(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - kagg_state__pavg_fp_packed *r = - (kagg_state__pavg_fp_packed *)buffer; - float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - - __atomic_add_uint32(&r->nitems, 1); - __atomic_add_fp64(&r->sum, fval); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -} - -INLINE_FUNCTION(void) -__update_groupby__pstddev(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - kern_aggregate_desc *desc) -{ - int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - - if (vclass == KVAR_CLASS__INLINE) - { - kagg_state__stddev_packed *r = - (kagg_state__stddev_packed *)buffer; - float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - - __atomic_add_uint32(&r->nitems, 1); - __atomic_add_fp64(&r->sum_x, fval); - __atomic_add_fp64(&r->sum_x2, fval * fval); - } - else - { - assert(vclass == KVAR_CLASS__NULL); - } -} - -INLINE_FUNCTION(void) -__update_groupby__pcovar(kern_context *kcxt, - char *buffer, - kern_colmeta *cmeta, - 
kern_aggregate_desc *desc) -{ - if (kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__INLINE && - kcxt->kvars_class[desc->arg1_slot_id] == KVAR_CLASS__INLINE) - { - kagg_state__covar_packed *r = - (kagg_state__covar_packed *)buffer; - float8_t xval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - float8_t yval = kcxt->kvars_slot[desc->arg1_slot_id].fp64; - - __atomic_add_uint32(&r->nitems, 1); - __atomic_add_fp64(&r->sum_x, xval); - __atomic_add_fp64(&r->sum_xx, xval * xval); - __atomic_add_fp64(&r->sum_y, yval); - __atomic_add_fp64(&r->sum_yy, yval * yval); - __atomic_add_fp64(&r->sum_xy, xval * yval); - } - else - { - assert(kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__NULL || - kcxt->kvars_class[desc->arg1_slot_id] == KVAR_CLASS__NULL); - } -} - -/* - * __updateOneTupleGroupBy - */ -STATIC_FUNCTION(void) -__updateOneTupleGroupBy(kern_context *kcxt, - kern_data_store *kds_final, - HeapTupleHeaderData *htup, - kern_expression *kexp_groupby_actions) -{ - int nattrs = (htup->t_infomask2 & HEAP_NATTS_MASK); - bool heap_hasnull = ((htup->t_infomask & HEAP_HASNULL) != 0); - uint32_t t_hoff; - char *buffer; - - t_hoff = offsetof(HeapTupleHeaderData, t_bits); - if (heap_hasnull) - t_hoff += BITMAPLEN(nattrs); - t_hoff = MAXALIGN(t_hoff); - - for (int j=0; j < nattrs; j++) - { - kern_aggregate_desc *desc = &kexp_groupby_actions->u.pagg.desc[j]; - kern_colmeta *cmeta = &kds_final->colmeta[j]; - - if (heap_hasnull && att_isnull(j, htup->t_bits)) - { - /* only grouping-key may have NULL */ - assert(desc->action == KAGG_ACTION__VREF); - continue; - } - - if (cmeta->attlen > 0) - t_hoff = TYPEALIGN(cmeta->attalign, t_hoff); - else if (!VARATT_NOT_PAD_BYTE((char *)htup + t_hoff)) - t_hoff = TYPEALIGN(cmeta->attalign, t_hoff); - buffer = ((char *)htup + t_hoff); - if (cmeta->attlen > 0) - t_hoff += cmeta->attlen; - else - t_hoff += VARSIZE_ANY(buffer); - - switch (desc->action) - { - case KAGG_ACTION__NROWS_ANY: - __update_groupby__nrows_any(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__NROWS_COND: - __update_groupby__nrows_cond(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__PMIN_INT: - __update_groupby__pmin_int(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__PMAX_INT: - __update_groupby__pmax_int(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__PMIN_FP: - __update_groupby__pmin_fp(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__PMAX_FP: - __update_groupby__pmax_fp(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__PSUM_INT: - __update_groupby__psum_int(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__PSUM_FP: - __update_groupby__psum_fp(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__PAVG_INT: - __update_groupby__pavg_int(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__PAVG_FP: - __update_groupby__pavg_fp(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__STDDEV: - __update_groupby__pstddev(kcxt, buffer, cmeta, desc); - break; - case KAGG_ACTION__COVAR: - __update_groupby__pcovar(kcxt, buffer, cmeta, desc); - break; - default: - /* - * No more partial aggregation actions exist after the grouping-keys - */ - return; - } - } -} - -STATIC_FUNCTION(bool) -__execGpuPreAggGroupBy(kern_context *kcxt, - kern_data_store *kds_final, - bool kvars_is_valid, - kern_expression *kexp_groupby_keyhash, - kern_expression *kexp_groupby_keyload, - kern_expression *kexp_groupby_keycomp, - kern_expression *kexp_groupby_actions, - bool *p_try_suspend) -{ - kern_hashitem *hitem = NULL; - xpu_int4_t hash; - - assert(kds_final->format == KDS_FORMAT_HASH);
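/*
 * Overview: each thread hashes its grouping keys, walks the hash chain
 * of kds_final to find a tuple with equal keys, and, if none exists,
 * locks the hash slot by swapping UINT_MAX into it, inserts a fresh
 * tuple, then unlocks the slot by storing the new item's offset.
 * The warp loops until every active thread has resolved its own
 * destination tuple, then applies the partial aggregates on it.
 */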
- /* - * compute hash value of the grouping keys - */ - memset(&hash, 0, sizeof(hash)); - if (kvars_is_valid) - { - if (EXEC_KERN_EXPRESSION(kcxt, kexp_groupby_keyhash, &hash)) - assert(!XPU_DATUM_ISNULL(&hash)); - } - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return false; - - /* - * Lookup the destination grouping tuple; if not found, create a new one. - */ - do { - if (!XPU_DATUM_ISNULL(&hash) && !hitem) - { - uint32_t *hslot = KDS_GET_HASHSLOT(kds_final, hash.value); - uint32_t saved; - xpu_bool_t status; - - for (hitem = KDS_HASH_FIRST_ITEM(kds_final, hslot, &saved); - hitem != NULL; - hitem = KDS_HASH_NEXT_ITEM(kds_final, hitem)) - { - if (hitem->hash != hash.value) - continue; - ExecLoadVarsHeapTuple(kcxt, kexp_groupby_keyload, - -2, - kds_final, - &hitem->t.htup); - if (EXEC_KERN_EXPRESSION(kcxt, kexp_groupby_keycomp, &status)) - { - assert(!XPU_DATUM_ISNULL(&status)); - if (status.value) - break; - } - } - - if (!hitem && saved != UINT_MAX) - { - /* try lock */ - if (__atomic_cas_uint32(hslot, saved, UINT_MAX) == saved) - { - hitem = __insertOneTupleGroupBy(kcxt, kds_final, - kexp_groupby_actions); - if (hitem) - { - uint32_t offset; - - hitem->hash = hash.value; - hitem->next = saved; - offset = (char *)hitem - (char *)kds_final; - /* insert and unlock */ - __atomic_write_uint32(hslot, __kds_packed(offset)); - } - else - { - /* out of memory */ - __atomic_write_uint32(hslot, saved); - *p_try_suspend = true; - } - } - } - } - /* suspend the kernel? */ - if (__any_sync(__activemask(), *p_try_suspend)) - return false; - /* error checks */ - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return false; - /* retry, if any threads are not ready yet */ - } while (__any_sync(__activemask(), !XPU_DATUM_ISNULL(&hash) && !hitem)); - - /* - * update the partial aggregation - */ - if (hitem) - __updateOneTupleGroupBy(kcxt, kds_final, - &hitem->t.htup, - kexp_groupby_actions); - return true; -} - -PUBLIC_FUNCTION(int) -execGpuPreAggGroupBy(kern_context *kcxt, - kern_warp_context *wp, - int n_rels, - kern_data_store *kds_final, - char *kvars_addr_wp, - bool *p_try_suspend) -{ - kern_session_info *session = kcxt->session; - kern_expression *kexp_groupby_keyhash = SESSION_KEXP_GROUPBY_KEYHASH(session); - kern_expression *kexp_groupby_keyload = SESSION_KEXP_GROUPBY_KEYLOAD(session); - kern_expression *kexp_groupby_keycomp = SESSION_KEXP_GROUPBY_KEYCOMP(session); - kern_expression *kexp_groupby_actions = SESSION_KEXP_GROUPBY_ACTIONS(session); - kern_expression *karg; - uint32_t write_pos = WARP_WRITE_POS(wp,n_rels); - uint32_t read_pos = WARP_READ_POS(wp,n_rels); - uint32_t i, mask; - bool status; - - /* - * The previous depth may still produce new tuples, and the number of - * current result tuples is not sufficient to run projection. - */
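/*
 * For example, with warpSize=32: if WARP_WRITE_POS is 40 and
 * WARP_READ_POS is 16, then 16 + 32 = 48 > 40, so we return to the
 * previous depth first; the reduction runs only once at least 32
 * tuples are pending, or once the scan is done and the remainder
 * must be flushed.
 */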
- if (wp->scan_done <= n_rels && read_pos + warpSize > write_pos) - return n_rels; - - read_pos += LaneId(); - if (read_pos < write_pos) - { - int index = (read_pos % UNIT_TUPLES_PER_DEPTH); - - kcxt->kvars_slot = (kern_variable *) - (kvars_addr_wp + index * kcxt->kvars_nbytes); - kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); - } - else - { - kcxt->kvars_slot = NULL; - kcxt->kvars_class = NULL; - } - mask = __ballot_sync(__activemask(), kcxt->kvars_class != NULL); - if (mask == 0) - goto skip_reduction; - - /* - * fill up the kvars_slot if it involves expressions - */ - if (kcxt->kvars_slot != NULL) - { - for (i=0, karg = KEXP_FIRST_ARG(kexp_groupby_actions); - i < kexp_groupby_actions->nr_args; - i++, karg = KEXP_NEXT_ARG(karg)) - { - assert(karg->opcode == FuncOpCode__SaveExpr); - if (!EXEC_KERN_EXPRESSION(kcxt, karg, NULL)) - { - assert(kcxt->errcode != ERRCODE_STROM_SUCCESS); - break; - } - } - } - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return -1; - /* - * main logic of GpuPreAgg - */ - assert(kexp_groupby_actions != NULL); - if (kexp_groupby_keyhash && - kexp_groupby_keyload && - kexp_groupby_keycomp) - { - status = __execGpuPreAggGroupBy(kcxt, kds_final, - kcxt->kvars_slot != NULL, - kexp_groupby_keyhash, - kexp_groupby_keyload, - kexp_groupby_keycomp, - kexp_groupby_actions, - p_try_suspend); - } - else - { - status = __execGpuPreAggNoGroups(kcxt, kds_final, - kcxt->kvars_slot != NULL, - kexp_groupby_actions, - p_try_suspend); - } - if (__any_sync(__activemask(), !status)) - return -1; - - /* - * Update the read position - */ -skip_reduction: - if (LaneId() == 0) - { - WARP_READ_POS(wp,n_rels) += __popc(mask); - assert(WARP_WRITE_POS(wp,n_rels) >= WARP_READ_POS(wp,n_rels)); - } - __syncwarp(); - if (wp->scan_done <= n_rels) - { - if (WARP_WRITE_POS(wp,n_rels) < WARP_READ_POS(wp,n_rels) + warpSize) - return n_rels; /* back to the previous depth */ - } - else - { - if (WARP_READ_POS(wp,n_rels) >= WARP_WRITE_POS(wp,n_rels)) - return -1; /* ok, end of GpuPreAgg */ - } - return n_rels + 1; /* otherwise, run this depth again */ -} diff --git a/next/cuda_gpuscan.cu b/next/cuda_gpuscan.cu deleted file mode 100644 index 7e949fb99..000000000 --- a/next/cuda_gpuscan.cu +++ /dev/null @@ -1,494 +0,0 @@ -/* - * cuda_gpuscan.cu - * - * Device implementation of GpuScan - * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. 
- */ -#include "cuda_common.h" - -/* ---------------------------------------------------------------- - * - * execGpuScanLoadSource and related - * - * ---------------------------------------------------------------- - */ -STATIC_FUNCTION(int) -__gpuscan_load_source_row(kern_context *kcxt, - kern_warp_context *wp, - kern_data_store *kds_src, - kern_expression *kexp_load_vars, - kern_expression *kexp_scan_quals, - char *kvars_addr_wp, - uint32_t *p_smx_row_count) -{ - uint32_t count; - uint32_t index; - uint32_t mask; - uint32_t wr_pos; - kern_tupitem *tupitem = NULL; - - /* fetch next warpSize tuples */ - if (LaneId() == 0) - count = atomicAdd(p_smx_row_count, 1); - count = __shfl_sync(__activemask(), count, 0); - index = (get_num_groups() * count + get_group_id()) * warpSize; - if (index >= kds_src->nitems) - { - if (LaneId() == 0) - wp->scan_done = 1; - __syncwarp(); - return 1; - } - index += LaneId(); - - if (index < kds_src->nitems) - { - uint32_t offset = KDS_GET_ROWINDEX(kds_src)[index]; - - assert(offset <= kds_src->usage); - tupitem = (kern_tupitem *)((char *)kds_src + - kds_src->length - - __kds_unpack(offset)); - assert((char *)tupitem >= (char *)kds_src && - (char *)tupitem < (char *)kds_src + kds_src->length); - kcxt->kvars_slot = (kern_variable *)alloca(kcxt->kvars_nbytes); - kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); - if (!ExecLoadVarsOuterRow(kcxt, - kexp_load_vars, - kexp_scan_quals, - kds_src, - &tupitem->htup)) - tupitem = NULL; - } - /* error checks */ - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return -1; - /* - * save the private kvars slot on the combination buffer (depth=0) - */ - mask = __ballot_sync(__activemask(), tupitem != NULL); - if (LaneId() == 0) - { - wr_pos = WARP_WRITE_POS(wp,0); - WARP_WRITE_POS(wp,0) += __popc(mask); - } - wr_pos = __shfl_sync(__activemask(), wr_pos, 0); - mask &= ((1U << LaneId()) - 1); - wr_pos += __popc(mask); - if (tupitem != NULL) - { - index = (wr_pos % UNIT_TUPLES_PER_DEPTH); - memcpy((char *)kvars_addr_wp + index * kcxt->kvars_nbytes, - kcxt->kvars_slot, - kcxt->kvars_nbytes); - } - kcxt->kvars_slot = NULL; - kcxt->kvars_class = NULL; - __syncwarp(); - /* move to the next depth if more than 32 htuples were fetched */ - return (WARP_WRITE_POS(wp,0) >= WARP_READ_POS(wp,0) + warpSize ? 
1 : 0); -} - -/* - * __gpuscan_load_source_block - */ -STATIC_FUNCTION(int) -__gpuscan_load_source_block(kern_context *kcxt, - kern_warp_context *wp, - kern_data_store *kds_src, - kern_expression *kexp_load_vars, - kern_expression *kexp_scan_quals, - char *kvars_addr_wp, - uint32_t *p_smx_row_count) -{ - uint32_t block_id = __shfl_sync(__activemask(), wp->block_id, 0); - uint32_t wr_pos = __shfl_sync(__activemask(), wp->lp_wr_pos, 0); - uint32_t rd_pos = __shfl_sync(__activemask(), wp->lp_rd_pos, 0); - uint32_t count; - uint32_t mask; - - assert(wr_pos >= rd_pos); - if (block_id > kds_src->nitems || wr_pos >= rd_pos + warpSize) - { - HeapTupleHeaderData *htup = NULL; - uint32_t off; - int index; - - rd_pos += LaneId(); - if (rd_pos < wr_pos) - { - off = wp->lp_items[rd_pos % UNIT_TUPLES_PER_DEPTH]; - htup = (HeapTupleHeaderData *)((char *)kds_src + __kds_unpack(off)); - kcxt->kvars_slot = (kern_variable *)alloca(kcxt->kvars_nbytes); - kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); - if (!ExecLoadVarsOuterRow(kcxt, - kexp_load_vars, - kexp_scan_quals, - kds_src, htup)) - htup = NULL; - } - /* error checks */ - if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) - return -1; - if (LaneId() == 0) - wp->lp_rd_pos = Min(wp->lp_wr_pos, - wp->lp_rd_pos + warpSize); - /* - * save the private kvars on the warp-buffer - */ - mask = __ballot_sync(__activemask(), htup != NULL); - if (LaneId() == 0) - { - wr_pos = WARP_WRITE_POS(wp,0); - WARP_WRITE_POS(wp,0) += __popc(mask); - } - wr_pos = __shfl_sync(__activemask(), wr_pos, 0); - mask &= ((1U << LaneId()) - 1); - wr_pos += __popc(mask); - if (htup != NULL) - { - index = (wr_pos % UNIT_TUPLES_PER_DEPTH); - memcpy(kvars_addr_wp + index * kcxt->kvars_nbytes, - kcxt->kvars_slot, - kcxt->kvars_nbytes); - } - kcxt->kvars_slot = NULL; - kcxt->kvars_class = NULL; - __syncwarp(); - /* end-of-scan checks */ - if (block_id > kds_src->nitems && /* no more blocks to fetch */ - wp->lp_rd_pos >= wp->lp_wr_pos) /* no more pending tuples */ - { - if (LaneId() == 0) - wp->scan_done = 1; - return 1; - } - /* move to the next depth if more than 32 htuples were fetched */ - return (WARP_WRITE_POS(wp,0) >= WARP_READ_POS(wp,0) + warpSize ? 1 : 0); - } - - /* - * Here, the number of pending tuples (saved in the lp_items[] array) is - * not enough to run ScanQuals checks. So, we move to the next bunch of - * line-items or the next block. - * The pending tuples have just passed the MVCC visibility checks, but - * the ScanQuals check is not applied yet. We try to run ScanQuals checks - * with 32 threads simultaneously. - */ - if (block_id == 0) - { - /* - * block_id == 0 means this warp is not associated with a particular - * block-page, so we try to fetch the next page. 
- */ - if (LaneId() == 0) - count = atomicAdd(p_smx_row_count, 1); - count = __shfl_sync(__activemask(), count, 0); - block_id = (get_num_groups() * count + get_group_id()) + 1; - if (LaneId() == 0) - wp->block_id = block_id; - } - if (block_id <= kds_src->nitems) - { - PageHeaderData *pg_page = KDS_BLOCK_PGPAGE(kds_src, block_id-1); - HeapTupleHeaderData *htup = NULL; - - count = __shfl_sync(__activemask(), wp->lp_count, 0); - if (count < PageGetMaxOffsetNumber(pg_page)) - { - count += LaneId(); - if (count < PageGetMaxOffsetNumber(pg_page)) - { - ItemIdData *lpp = &pg_page->pd_linp[count]; - - assert((char *)lpp < (char *)pg_page + BLCKSZ); - if (ItemIdIsNormal(lpp)) - htup = (HeapTupleHeaderData *)PageGetItem(pg_page, lpp); - else - htup = NULL; - } - /* put visible tuples on the lp_items[] array */ - mask = __ballot_sync(__activemask(), htup != NULL); - if (LaneId() == 0) - { - wr_pos = wp->lp_wr_pos; - wp->lp_wr_pos += __popc(mask); - } - wr_pos = __shfl_sync(__activemask(), wr_pos, 0); - mask &= ((1U << LaneId()) - 1); - wr_pos += __popc(mask); - if (htup != NULL) - { - wp->lp_items[wr_pos % UNIT_TUPLES_PER_DEPTH] - = __kds_packed((char *)htup - (char *)kds_src); - } - if (LaneId() == 0) - wp->lp_count += warpSize; - } - else - { - /* no more tuples to fetch from the current page */ - if (LaneId() == 0) - { - wp->block_id = 0; - wp->lp_count = 0; - } - __syncwarp(); - } - } - return 0; /* stay depth-0 */ -} - -/* - * __gpuscan_load_source_arrow - */ -STATIC_FUNCTION(int) -__gpuscan_load_source_arrow(kern_context *kcxt, - kern_warp_context *wp, - kern_data_store *kds_src, - kern_expression *kexp_load_vars, - kern_expression *kexp_scan_quals, - char *kvars_addr_wp, - uint32_t *p_smx_row_count) -{ - uint32_t kds_index; - uint32_t count; - uint32_t mask; - uint32_t wr_pos; - bool is_valid = false; - - /* fetch next warpSize tuples */ - if (LaneId() == 0) - count = atomicAdd(p_smx_row_count, 1); - count = __shfl_sync(__activemask(), count, 0); - kds_index = (get_num_groups() * count + get_group_id()) * warpSize; - if (kds_index >= kds_src->nitems) - { - wp->scan_done = 1; - __syncwarp(__activemask()); - return 1; - } - kds_index += LaneId(); - - if (kds_index < kds_src->nitems) - { - kcxt->kvars_slot = (kern_variable *)alloca(kcxt->kvars_nbytes); - kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); - if (ExecLoadVarsOuterArrow(kcxt, - kexp_load_vars, - kexp_scan_quals, - kds_src, - kds_index)) - is_valid = true; - } - /* error checks */ - if (__any_sync(__activemask(), kcxt->errcode != 0)) - return -1; - /* - * save the htuple on the local combination buffer (depth=0) - */ - mask = __ballot_sync(__activemask(), is_valid); - if (LaneId() == 0) - { - wr_pos = WARP_WRITE_POS(wp,0); - WARP_WRITE_POS(wp,0) += __popc(mask); - } - wr_pos = __shfl_sync(__activemask(), wr_pos, 0); - mask &= ((1U << LaneId()) - 1); - wr_pos += __popc(mask); - if (is_valid) - { - int index = (wr_pos % UNIT_TUPLES_PER_DEPTH); - - memcpy(kvars_addr_wp + index * kcxt->kvars_nbytes, - kcxt->kvars_slot, - kcxt->kvars_nbytes); - } - kcxt->kvars_slot = NULL; - kcxt->kvars_class = NULL; - /* move to the next depth if more than 32 htuples were fetched */ - return (WARP_WRITE_POS(wp,0) >= WARP_READ_POS(wp,0) + warpSize ? 
1 : 0); -} - -/* - * __gpuscan_load_source_column - */ -INLINE_FUNCTION(int) -__gpuscan_load_source_column(kern_context *kcxt, - kern_warp_context *wp, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_expression *kexp_load_vars, - kern_expression *kern_scan_quals, - char *kvars_addr_wp, - uint32_t *p_smx_row_count) -{ - STROM_ELOG(kcxt, "KDS_FORMAT_COLUMN not implemented"); - return -1; -} - -PUBLIC_FUNCTION(int) -execGpuScanLoadSource(kern_context *kcxt, - kern_warp_context *wp, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_expression *kexp_load_vars, - kern_expression *kexp_scan_quals, - char *kvars_addr_wp, - uint32_t *p_smx_row_count) -{ - /* - * Move to the next depth (or projection), if combination buffer (depth=0) - * may overflow on the next action, or we already reached the KDS tail. - */ - if (wp->scan_done || WARP_WRITE_POS(wp,0) >= WARP_READ_POS(wp,0) + warpSize) - return 1; - - switch (kds_src->format) - { - case KDS_FORMAT_ROW: - return __gpuscan_load_source_row(kcxt, wp, - kds_src, - kexp_load_vars, - kexp_scan_quals, - kvars_addr_wp, - p_smx_row_count); - case KDS_FORMAT_BLOCK: - return __gpuscan_load_source_block(kcxt, wp, - kds_src, - kexp_load_vars, - kexp_scan_quals, - kvars_addr_wp, - p_smx_row_count); - case KDS_FORMAT_ARROW: - return __gpuscan_load_source_arrow(kcxt, wp, - kds_src, - kexp_load_vars, - kexp_scan_quals, - kvars_addr_wp, - p_smx_row_count); - case KDS_FORMAT_COLUMN: - return __gpuscan_load_source_column(kcxt, wp, - kds_src, - kds_extra, - kexp_load_vars, - kexp_scan_quals, - kvars_addr_wp, - p_smx_row_count); - default: - STROM_ELOG(kcxt, "Bug? Unknown KDS format"); - break; - } - return -1; -} - -/* - * kern_gpuscan_main - */ -KERNEL_FUNCTION(void) -kern_gpuscan_main(kern_session_info *session, - kern_gputask *kgtask, - kern_multirels *__kmrels, /* should be NULL */ - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_data_store *kds_dst) -{ - kern_context *kcxt; - kern_warp_context *wp, *wp_saved; - uint32_t wp_base_sz; - char *kvars_addr_wp; /* only depth-0 */ - int depth; - __shared__ uint32_t smx_row_count; - - assert(kgtask->kvars_nslots == session->kcxt_kvars_nslots && - kgtask->kvars_nbytes == session->kcxt_kvars_nbytes && - kgtask->n_rels == 0 && - __kmrels == NULL); - /* setup execution context */ - INIT_KERNEL_CONTEXT(kcxt, session); - wp_base_sz = __KERN_WARP_CONTEXT_BASESZ(0); - wp = (kern_warp_context *)SHARED_WORKMEM(wp_base_sz, get_local_id() / warpSize); - wp_saved = KERN_GPUTASK_WARP_CONTEXT(kgtask); - if (kgtask->resume_context) - { - /* resume warp-context from the previous execution */ - if (LaneId() == 0) - memcpy(wp, wp_saved, wp_base_sz); - if (get_local_id() == 0) - smx_row_count = wp->smx_row_count; - depth = __shfl_sync(__activemask(), wp->depth, 0); - } - else - { - /* zero clear the wp */ - if (LaneId() == 0) - memset(wp, 0, wp_base_sz); - if (get_local_id() == 0) - smx_row_count = 0; - depth = 0; - } - kvars_addr_wp = ((char *)wp_saved + wp_base_sz); - __syncthreads(); - - while (depth >= 0) - { - kcxt_reset(kcxt); - if (depth == 0) - { - /* LOAD FROM THE SOURCE */ - depth = execGpuScanLoadSource(kcxt, wp, - kds_src, - kds_extra, - SESSION_KEXP_SCAN_LOAD_VARS(session), - SESSION_KEXP_SCAN_QUALS(session), - kvars_addr_wp, - &smx_row_count); - } - else - { - bool try_suspend = false; - - assert(depth == 1); - if (session->xpucode_projection) - { - /* PROJECTION */ - depth = execGpuJoinProjection(kcxt, wp, - 0, /* no inner relations */ - kds_dst, - 
SESSION_KEXP_PROJECTION(session), - kvars_addr_wp, - &try_suspend); - } - else - { - /* PRE-AGG */ - depth = execGpuPreAggGroupBy(kcxt, wp, - 0, /* no inner relations */ - kds_dst, - kvars_addr_wp, - &try_suspend); - } - if (__any_sync(__activemask(), try_suspend)) - { - if (LaneId() == 0) - atomicAdd(&kgtask->suspend_count, 1); - assert(depth < 0); - } - } - __syncwarp(); - } - __syncthreads(); - - if (LaneId() == 0) - { - wp->depth = depth; - wp->smx_row_count = smx_row_count; - memcpy(wp_saved, wp, wp_base_sz); - } - STROM_WRITEBACK_ERROR_STATUS(&kgtask->kerror, kcxt); -} diff --git a/next/extra.c b/next/extra.c deleted file mode 100644 index e208ce63b..000000000 --- a/next/extra.c +++ /dev/null @@ -1,424 +0,0 @@ -/* - * extra.c - * - * Stuff related to invoke HeteroDB Extra Module - * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. - */ -#include -#include "pg_strom.h" - -PG_FUNCTION_INFO_V1(pgstrom_license_query); - -/* - * heterodbExtraModuleInfo - */ -static char *(*p_heterodb_extra_module_init)(unsigned int pg_version_num) = NULL; - -static char * -heterodbExtraModuleInit(void) -{ - char *res; - - if (!p_heterodb_extra_module_init) - elog(ERROR, "HeteroDB Extra module is not loaded yet"); - res = p_heterodb_extra_module_init(PG_VERSION_NUM); - if (!res) - elog(ERROR, "out of memory"); - return res; -} - -/* - * heterodbExtraEreport - */ -static heterodb_extra_error_info *p_heterodb_extra_error_data = NULL; - -static inline void -heterodbExtraEreport(int elevel) -{ - elog(elevel, "(%s; %s:%d) %s", - p_heterodb_extra_error_data->funcname, - p_heterodb_extra_error_data->filename, - p_heterodb_extra_error_data->lineno, - p_heterodb_extra_error_data->message); -} - -/* - * heterodbLicenseReload - */ -static int (*p_heterodb_license_reload)(void) = NULL; -static int -heterodbLicenseReload(void) -{ - if (!p_heterodb_license_reload) - return -1; - return p_heterodb_license_reload(); -} - -/* - * heterodbLicenseQuery - */ -static ssize_t (*p_heterodb_license_query)( - char *buf, - size_t bufsz) = NULL; - -static ssize_t -heterodbLicenseQuery(char *buf, size_t bufsz) -{ - if (!p_heterodb_license_query) - return -1; - return p_heterodb_license_query(buf, bufsz); -} - -/* - * heterodbValidateDevice - */ -static int (*p_heterodb_validate_device)(int gpu_device_id, - const char *gpu_device_name, - const char *gpu_device_uuid) = NULL; -bool -heterodbValidateDevice(int gpu_device_id, - const char *gpu_device_name, - const char *gpu_device_uuid) -{ - if (!p_heterodb_validate_device) - return false; - return (p_heterodb_validate_device(gpu_device_id, - gpu_device_name, - gpu_device_uuid) > 0); -} - -/* - * pgstrom_license_query - */ -static char * -__heterodb_license_query(void) -{ - char *buf; - size_t bufsz; - ssize_t nbytes; - - if (heterodbLicenseReload() <= 0) - return NULL; - - bufsz = 2048; -retry: - buf = alloca(bufsz); - nbytes = heterodbLicenseQuery(buf, bufsz); - if (nbytes < 0) - return NULL; - if (nbytes < bufsz) - return pstrdup(buf); - bufsz += bufsz; - goto retry; -} - -Datum -pgstrom_license_query(PG_FUNCTION_ARGS) -{ - char *license; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("only superuser can query commercial license")))); - license = __heterodb_license_query(); - if (!license) - PG_RETURN_NULL(); - - PG_RETURN_POINTER(DirectFunctionCall1(json_in, 
PointerGetDatum(license))); -} - -/* - * gpuDirectInitDriver - */ -static void (*p_cufile__driver_init_v2)() = NULL; - -static void -gpuDirectInitDriver(void) -{ - if (!p_cufile__driver_init_v2) - elog(ERROR, "heterodb_extra: cufile__driver_init_v2 is missing"); - p_cufile__driver_init_v2(); -} - -/* - * gpuDirectOpenDriver - */ -static int (*p_cufile__driver_open_v2)() = NULL; - -bool -gpuDirectOpenDriver(void) -{ - if (!p_cufile__driver_open_v2) - { - elog(ERROR, "heterodb_extra: p_cufile__driver_open_v2 is missing"); - return false; - } - return (p_cufile__driver_open_v2() == 0); -} - -/* - * gpuDirectCloseDriver - */ -static int (*p_cufile__driver_close_v2)() = NULL; - -void -gpuDirectCloseDriver(void) -{ - if (p_cufile__driver_close_v2) - { - if (p_cufile__driver_close_v2() != 0) - heterodbExtraEreport(LOG); - } -} - -/* - * gpuDirectMapGpuMemory - */ -static int (*p_cufile__map_gpu_memory_v2)(CUdeviceptr m_segment, - size_t segment_sz) = NULL; -bool -gpuDirectMapGpuMemory(CUdeviceptr m_segment, - size_t segment_sz) -{ - if (!p_cufile__map_gpu_memory_v2) - return false; - return (p_cufile__map_gpu_memory_v2(m_segment, segment_sz) == 0); -} - -/* - * gpuDirectUnmapGpuMemory - */ -static int (*p_cufile__unmap_gpu_memory_v2)(CUdeviceptr m_segment) = NULL; - -bool -gpuDirectUnmapGpuMemory(CUdeviceptr m_segment) -{ - if (!p_cufile__unmap_gpu_memory_v2) - return false; - return (p_cufile__unmap_gpu_memory_v2(m_segment) == 0); -} - -/* - * gpuDirectFileReadIOV - */ -static int (*p_cufile__read_file_iov_v2)( - const char *pathname, - CUdeviceptr m_segment, - off_t m_offset, - const strom_io_vector *iovec) = NULL; - -bool -gpuDirectFileReadIOV(const char *pathname, - CUdeviceptr m_segment, - off_t m_offset, - const strom_io_vector *iovec) -{ - if (!p_cufile__read_file_iov_v2) - return false; - return (p_cufile__read_file_iov_v2(pathname, - m_segment, - m_offset, - iovec) == 0); -} - -/* - * gpuDirectGetProperty - */ -static int (*p_cufile__get_property_v2)(char *buffer, - size_t buffer_sz) = NULL; -char * -gpuDirectGetProperty(void) -{ - char buffer[2000]; - - if (!p_cufile__get_property_v2) - elog(ERROR, "heterodb_extra: cufile__get_property_v2 is missing"); - if (p_cufile__get_property_v2(buffer, sizeof(buffer)) < 0) - heterodbExtraEreport(ERROR); - return pstrdup(buffer); -} - -/* - * gpuDirectSetProperty - */ -static int (*p_cufile__set_property_v2)(const char *key, - const char *value) = NULL; -void -gpuDirectSetProperty(const char *key, const char *value) -{ - if (!p_cufile__set_property_v2) - elog(ERROR, "heterodb_extra: cufile__set_property_v2 is missing"); - if (p_cufile__set_property_v2(key, value) != 0) - heterodbExtraEreport(ERROR); -} - -/* - * gpuDirectIsSupported - */ -bool -gpuDirectIsAvailable(void) -{ - bool has_gpudirectsql_supported = false; - - if (p_cufile__driver_init_v2 && - p_cufile__driver_open_v2 && - p_cufile__driver_close_v2 && - p_cufile__map_gpu_memory_v2 && - p_cufile__unmap_gpu_memory_v2 && - p_cufile__read_file_iov_v2 && - p_cufile__get_property_v2 && - p_cufile__set_property_v2) - { - for (int i=0; i < numGpuDevAttrs; i++) - { - if (gpuDevAttrs[i].DEV_SUPPORT_GPUDIRECTSQL) - { - has_gpudirectsql_supported = true; - break; - } - } - } - return has_gpudirectsql_supported; -} - -/* lookup_heterodb_extra_function */ -static void * -lookup_heterodb_extra_function(void *handle, const char *symbol) -{ - void *fn_addr; - - fn_addr = dlsym(handle, symbol); - if (!fn_addr) - elog(ERROR, "could not find extra symbol \"%s\" - %s", - symbol, dlerror()); - return fn_addr; 
-} -#define LOOKUP_HETERODB_EXTRA_FUNCTION(symbol) \ - p_##symbol = lookup_heterodb_extra_function(handle, #symbol) - -/* - * parse_heterodb_extra_module_info - */ -static void -parse_heterodb_extra_module_info(const char *extra_module_info, - uint32 *p_api_version, - bool *p_has_cufile) -{ - char *buffer; - long api_version = 0; - bool has_cufile = false; - char *tok, *pos, *end; - - buffer = alloca(strlen(extra_module_info) + 1); - strcpy(buffer, extra_module_info); - for (tok = strtok_r(buffer, ",", &pos); - tok != NULL; - tok = strtok_r(NULL, ",", &pos)) - { - if (strncmp(tok, "api_version=", 12) == 0) - { - api_version = strtol(tok+12, &end, 10); - if (api_version < 0 || *end != '\0') - elog(ERROR, "invalid extra module token [%s]", tok); - } - else if (strncmp(tok, "cufile=", 7) == 0) - { - if (strcmp(tok+7, "on") == 0) - has_cufile = true; - else if (strcmp(tok+7, "off") == 0) - has_cufile = false; - else - elog(ERROR, "invalid extra module token [%s]", tok); - } - } - if (api_version < HETERODB_EXTRA_API_VERSION) - elog(ERROR, "HeteroDB Extra Module has Unsupported API version [%08lu]", - api_version); - *p_api_version = api_version; - *p_has_cufile = has_cufile; -} - -/* - * pgstrom_init_extra - */ -void -pgstrom_init_extra(void) -{ - void *handle; - char *license; - char *extra_module_info; - - /* load the extra module */ - handle = dlopen(HETERODB_EXTRA_FILENAME, - RTLD_NOW | RTLD_LOCAL); - if (!handle) - { - handle = dlopen(HETERODB_EXTRA_PATHNAME, RTLD_NOW | RTLD_LOCAL); - if (!handle) - { - elog(LOG, "HeteroDB Extra module is not available"); - return; - } - } - - PG_TRY(); - { - uint32 api_version = 0; - bool has_cufile = false; - - LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_extra_error_data); - LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_extra_module_init); - extra_module_info = heterodbExtraModuleInit(); - parse_heterodb_extra_module_info(extra_module_info, - &api_version, - &has_cufile); - if (has_cufile) - { - LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__driver_init_v2); - LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__driver_open_v2); - LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__driver_close_v2); - LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__map_gpu_memory_v2); - LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__unmap_gpu_memory_v2); - LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__read_file_iov_v2); - LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__get_property_v2); - LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__set_property_v2); - - gpuDirectInitDriver(); - } - LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_license_reload); - LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_license_query); - LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_validate_device); - } - PG_CATCH(); - { - p_heterodb_extra_error_data = NULL; - p_heterodb_extra_module_init = NULL; - p_cufile__driver_init_v2 = NULL; - p_cufile__driver_open_v2 = NULL; - p_cufile__driver_close_v2 = NULL; - p_cufile__map_gpu_memory_v2 = NULL; - p_cufile__unmap_gpu_memory_v2 = NULL; - p_cufile__read_file_iov_v2 = NULL; - p_cufile__get_property_v2 = NULL; - p_cufile__set_property_v2 = NULL; - p_heterodb_license_reload = NULL; - p_heterodb_license_query = NULL; - p_heterodb_validate_device = NULL; - PG_RE_THROW(); - } - PG_END_TRY(); - elog(LOG, "HeteroDB Extra module loaded [%s]", extra_module_info); - - license = __heterodb_license_query(); - if (license) - { - elog(LOG, "HeteroDB License: %s", license); - pfree(license); - } -} diff --git a/next/gpu_device.c b/next/gpu_device.c deleted file mode 100644 index f07334e0e..000000000 --- a/next/gpu_device.c +++ /dev/null @@ -1,698 +0,0 @@ -/* - * 
gpu_device.c - * - * Routines to collect GPU device information. - * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. - */ -#include "pg_strom.h" -#include "cuda_common.h" - -/* variable declarations */ -GpuDevAttributes *gpuDevAttrs = NULL; -int numGpuDevAttrs = 0; -double pgstrom_gpu_setup_cost; /* GUC */ -double pgstrom_gpu_tuple_cost; /* GUC */ -double pgstrom_gpu_operator_cost; /* GUC */ -double pgstrom_gpu_direct_seq_page_cost; /* GUC */ -/* catalog of device attributes */ -typedef enum { - DEVATTRKIND__INT, - DEVATTRKIND__BYTES, - DEVATTRKIND__KB, - DEVATTRKIND__KHZ, - DEVATTRKIND__COMPUTEMODE, - DEVATTRKIND__BOOL, - DEVATTRKIND__BITS, -} DevAttrKind; - -static struct { - CUdevice_attribute attr_id; - size_t attr_offset; - const char *attr_label; - const char *attr_desc; -} GpuDevAttrCatalog[] = { -#define DEV_ATTR(LABEL,DESC) \ - { CU_DEVICE_ATTRIBUTE_##LABEL, \ - offsetof(struct GpuDevAttributes, LABEL), \ - #LABEL, DESC }, -#include "gpu_devattrs.h" -#undef DEV_ATTR -}; - -/* declaration */ -Datum pgstrom_gpu_device_info(PG_FUNCTION_ARGS); - -/* - * collectGpuDevAttrs - */ -static void -__collectGpuDevAttrs(GpuDevAttributes *dattrs, CUdevice cuda_device) -{ - CUresult rc; - char path[1024]; - char linebuf[1024]; - FILE *filp; - struct stat stat_buf; - - rc = cuDeviceGetName(dattrs->DEV_NAME, sizeof(dattrs->DEV_NAME), cuda_device); - if (rc != CUDA_SUCCESS) - __FATAL("failed on cuDeviceGetName: %s", cuStrError(rc)); - rc = cuDeviceGetUuid((CUuuid *)dattrs->DEV_UUID, cuda_device); - if (rc != CUDA_SUCCESS) - __FATAL("failed on cuDeviceGetUuid: %s", cuStrError(rc)); - rc = cuDeviceTotalMem(&dattrs->DEV_TOTAL_MEMSZ, cuda_device); - if (rc != CUDA_SUCCESS) - __FATAL("failed on cuDeviceTotalMem: %s", cuStrError(rc)); -#define DEV_ATTR(LABEL,DESC) \ - rc = cuDeviceGetAttribute(&dattrs->LABEL, \ - CU_DEVICE_ATTRIBUTE_##LABEL, \ - cuda_device); \ - if (rc != CUDA_SUCCESS) \ - __FATAL("failed on cuDeviceGetAttribute(" #LABEL "): %s", \ - cuStrError(rc)); -#include "gpu_devattrs.h" -#undef DEV_ATTR - /* - * Some other fields to be fetched from Sysfs - */ - snprintf(path, sizeof(path), - "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node", - dattrs->PCI_DOMAIN_ID, - dattrs->PCI_BUS_ID, - dattrs->PCI_DEVICE_ID); - filp = fopen(path, "r"); - if (!filp) - dattrs->NUMA_NODE_ID = -1; /* unknown */ - else - { - if (!fgets(linebuf, sizeof(linebuf), filp)) - dattrs->NUMA_NODE_ID = -1; /* unknown */ - else - dattrs->NUMA_NODE_ID = atoi(linebuf); - fclose(filp); - } - - snprintf(path, sizeof(path), - "/sys/bus/pci/devices/%04x:%02x:%02x.0/resource1", - dattrs->PCI_DOMAIN_ID, - dattrs->PCI_BUS_ID, - dattrs->PCI_DEVICE_ID); - if (stat(path, &stat_buf) == 0) - dattrs->DEV_BAR1_MEMSZ = stat_buf.st_size; - else - dattrs->DEV_BAR1_MEMSZ = 0; /* unknown */ - - /* - * GPU-Direct SQL is supported? 
- */ - if (dattrs->GPU_DIRECT_RDMA_SUPPORTED) - { - if (dattrs->DEV_BAR1_MEMSZ == 0 /* unknown */ || - dattrs->DEV_BAR1_MEMSZ > (256UL << 20)) - dattrs->DEV_SUPPORT_GPUDIRECTSQL = true; - } -} - -static int -collectGpuDevAttrs(int fdesc) -{ - GpuDevAttributes dattrs; - CUdevice cuda_device; - CUresult rc; - int i, nr_gpus; - - rc = cuInit(0); - if (rc != CUDA_SUCCESS) - __FATAL("failed on cuInit: %s", cuStrError(rc)); - rc = cuDeviceGetCount(&nr_gpus); - if (rc != CUDA_SUCCESS) - __FATAL("failed on cuDeviceGetCount: %s", cuStrError(rc)); - - for (i=0; i < nr_gpus; i++) - { - ssize_t offset, nbytes; - - rc = cuDeviceGet(&cuda_device, i); - if (rc != CUDA_SUCCESS) - __FATAL("failed on cuDeviceGet: %s", cuStrError(rc)); - memset(&dattrs, 0, sizeof(GpuDevAttributes)); - dattrs.DEV_ID = i; - __collectGpuDevAttrs(&dattrs, cuda_device); - - for (offset=0; offset < sizeof(GpuDevAttributes); offset += nbytes) - { - nbytes = write(fdesc, ((char *)&dattrs) + offset, - sizeof(GpuDevAttributes) - offset); - if (nbytes == 0) - break; - if (nbytes < 0) - __FATAL("failed on write(pipefd): %m"); - } - } - return 0; -} - -/* - * receiveGpuDevAttrs - */ -static void -receiveGpuDevAttrs(int fdesc) -{ - GpuDevAttributes *__devAttrs = NULL; - GpuDevAttributes dattrs_saved; - int nitems = 0; - int nrooms = 0; - bool is_saved = false; - - for (;;) - { - GpuDevAttributes dtemp; - ssize_t nbytes; - - nbytes = __readFile(fdesc, &dtemp, sizeof(GpuDevAttributes)); - if (nbytes == 0) - break; /* end */ - if (nbytes != sizeof(GpuDevAttributes)) - elog(ERROR, "failed to collect GPU device attributes"); - if (dtemp.COMPUTE_CAPABILITY_MAJOR < 6) - { - elog(LOG, "PG-Strom: GPU%d %s - CC %d.%d is not supported", - dtemp.DEV_ID, - dtemp.DEV_NAME, - dtemp.COMPUTE_CAPABILITY_MAJOR, - dtemp.COMPUTE_CAPABILITY_MINOR); - continue; - } - if (heterodbValidateDevice(dtemp.DEV_ID, - dtemp.DEV_NAME, - dtemp.DEV_UUID)) - { - if (nitems >= nrooms) - { - nrooms += 10; - __devAttrs = realloc(__devAttrs, sizeof(GpuDevAttributes) * nrooms); - if (!__devAttrs) - elog(ERROR, "out of memory"); - } - memcpy(&__devAttrs[nitems++], &dtemp, sizeof(GpuDevAttributes)); - } - else if (!is_saved) - { - memcpy(&dattrs_saved, &dtemp, sizeof(GpuDevAttributes)); - is_saved = true; - } - } - - if (nitems == 0 && is_saved) - { - __devAttrs = malloc(sizeof(GpuDevAttributes)); - if (!__devAttrs) - elog(ERROR, "out of memory"); - memcpy(&__devAttrs[nitems++], &dattrs_saved, sizeof(GpuDevAttributes)); - } - numGpuDevAttrs = nitems; - gpuDevAttrs = __devAttrs; -} - -/* - * pgstrom_collect_gpu_devices - */ -static void -pgstrom_collect_gpu_devices(void) -{ - int i, pipefd[2]; - pid_t child; - StringInfoData buf; - - if (pipe(pipefd) != 0) - elog(ERROR, "failed on pipe(2): %m"); - child = fork(); - if (child == 0) - { - close(pipefd[0]); - _exit(collectGpuDevAttrs(pipefd[1])); - } - else if (child > 0) - { - int status; - - close(pipefd[1]); - PG_TRY(); - { - receiveGpuDevAttrs(pipefd[0]); - } - PG_CATCH(); - { - /* cleanup */ - kill(child, SIGKILL); - close(pipefd[0]); - PG_RE_THROW(); - } - PG_END_TRY(); - close(pipefd[0]); - - while (waitpid(child, &status, 0) < 0) - { - if (errno != EINTR) - { - kill(child, SIGKILL); - elog(ERROR, "failed on waitpid: %m"); - } - } - if (WEXITSTATUS(status) != 0) - elog(ERROR, "GPU device attribute collector exited with %d", - WEXITSTATUS(status)); - } - else - { - close(pipefd[0]); - close(pipefd[1]); - elog(ERROR, "failed on fork(2): %m"); - } - initStringInfo(&buf); - for (i=0; i < numGpuDevAttrs; i++) - { - GpuDevAttributes 
*dattrs = &gpuDevAttrs[i]; - - resetStringInfo(&buf); - appendStringInfo(&buf, "GPU%d %s (%d SMs; %dMHz, L2 %dkB)", - dattrs->DEV_ID, dattrs->DEV_NAME, - dattrs->MULTIPROCESSOR_COUNT, - dattrs->CLOCK_RATE / 1000, - dattrs->L2_CACHE_SIZE >> 10); - if (dattrs->DEV_TOTAL_MEMSZ > (4UL << 30)) - appendStringInfo(&buf, ", RAM %.2fGB", - ((double)dattrs->DEV_TOTAL_MEMSZ / - (double)(1UL << 30))); - else - appendStringInfo(&buf, ", RAM %zuMB", - dattrs->DEV_TOTAL_MEMSZ >> 20); - if (dattrs->MEMORY_CLOCK_RATE > (1UL << 20)) - appendStringInfo(&buf, " (%dbits, %.2fGHz)", - dattrs->GLOBAL_MEMORY_BUS_WIDTH, - ((double)dattrs->MEMORY_CLOCK_RATE / - (double)(1UL << 20))); - else - appendStringInfo(&buf, " (%dbits, %dMHz)", - dattrs->GLOBAL_MEMORY_BUS_WIDTH, - dattrs->MEMORY_CLOCK_RATE >> 10); - if (dattrs->DEV_BAR1_MEMSZ > (1UL << 30)) - appendStringInfo(&buf, ", PCI-E Bar1 %luGB", - dattrs->DEV_BAR1_MEMSZ >> 30); - else if (dattrs->DEV_BAR1_MEMSZ > (1UL << 20)) - appendStringInfo(&buf, ", PCI-E Bar1 %luMB", - dattrs->DEV_BAR1_MEMSZ >> 20); - appendStringInfo(&buf, ", CC %d.%d", - dattrs->COMPUTE_CAPABILITY_MAJOR, - dattrs->COMPUTE_CAPABILITY_MINOR); - elog(LOG, "PG-Strom: %s", buf.data); - } - pfree(buf.data); -} - -/* - * pgstrom_gpu_operator_ratio - */ -double -pgstrom_gpu_operator_ratio(void) -{ - if (cpu_operator_cost > 0.0) - { - return pgstrom_gpu_operator_cost / cpu_operator_cost; - } - return (pgstrom_gpu_operator_cost == 0.0 ? 1.0 : disable_cost); -} - -/* - * pgstrom_init_gpu_options - init GUC options related to GPUs - */ -static void -pgstrom_init_gpu_options(void) -{ - /* cost factor for GPU setup */ - DefineCustomRealVariable("pg_strom.gpu_setup_cost", - "Cost to setup GPU device to run", - NULL, - &pgstrom_gpu_setup_cost, - 100 * DEFAULT_SEQ_PAGE_COST, - 0, - DBL_MAX, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* cost factor for each Gpu task */ - DefineCustomRealVariable("pg_strom.gpu_tuple_cost", - "Default cost to transfer GPU<->Host per tuple", - NULL, - &pgstrom_gpu_tuple_cost, - DEFAULT_CPU_TUPLE_COST, - 0, - DBL_MAX, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* cost factor for GPU operator */ - DefineCustomRealVariable("pg_strom.gpu_operator_cost", - "Cost of processing each operator by GPU", - NULL, - &pgstrom_gpu_operator_cost, - DEFAULT_CPU_OPERATOR_COST / 16.0, - 0, - DBL_MAX, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* cost factor for GPU-Direct SQL */ - DefineCustomRealVariable("pg_strom.gpu_direct_seq_page_cost", - "Cost for sequential page read by GPU-Direct SQL", - NULL, - &pgstrom_gpu_direct_seq_page_cost, - DEFAULT_SEQ_PAGE_COST / 4.0, - 0, - DBL_MAX, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); -} - -/* - * pgstrom_init_gpu_device - */ -bool -pgstrom_init_gpu_device(void) -{ - static char *cuda_visible_devices = NULL; - - /* - * Set CUDA_VISIBLE_DEVICES environment variable prior to CUDA - * initialization - */ - DefineCustomStringVariable("pg_strom.cuda_visible_devices", - "CUDA_VISIBLE_DEVICES environment variable", - NULL, - &cuda_visible_devices, - NULL, - PGC_POSTMASTER, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - if (cuda_visible_devices) - { - if (setenv("CUDA_VISIBLE_DEVICES", cuda_visible_devices, 1) != 0) - elog(ERROR, "failed to set CUDA_VISIBLE_DEVICES"); - } - /* collect device attributes using child process */ - pgstrom_collect_gpu_devices(); - if (numGpuDevAttrs > 0) - { - pgstrom_init_gpu_options(); - return true; - } - return false; -} - -/* - * __gpuClientChooseDevice - */ -static int 
-__gpuClientChooseDevice(const Bitmapset *gpuset) -{ - static bool rr_initialized = false; - static uint32 rr_counter = 0; - - if (!rr_initialized) - { - rr_counter = (uint32)getpid(); - rr_initialized = true; - } - - if (!bms_is_empty(gpuset)) - { - int num = bms_num_members(gpuset); - int *dindex = alloca(sizeof(int) * num); - int i, k; - - for (i=0, k=bms_next_member(gpuset, -1); - k >= 0; - i++, k=bms_next_member(gpuset, k)) - { - dindex[i] = k; - } - Assert(i == num); - return dindex[rr_counter++ % num]; - } - /* simple round-robin when there is no GPU preference */ - return (rr_counter++ % numGpuDevAttrs); -} - -/* - * gpuClientOpenSession - */ -void -gpuClientOpenSession(pgstromTaskState *pts, - const XpuCommand *session) -{ - struct sockaddr_un addr; - pgsocket sockfd; - int cuda_dindex = __gpuClientChooseDevice(pts->optimal_gpus); - char namebuf[32]; - - sockfd = socket(AF_UNIX, SOCK_STREAM, 0); - if (sockfd < 0) - elog(ERROR, "failed on socket(2): %m"); - - memset(&addr, 0, sizeof(addr)); - addr.sun_family = AF_UNIX; - snprintf(addr.sun_path, sizeof(addr.sun_path), - ".pg_strom.%u.gpu%u.sock", - PostmasterPid, cuda_dindex); - if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) != 0) - { - close(sockfd); - elog(ERROR, "failed on connect('%s'): %m", addr.sun_path); - } - snprintf(namebuf, sizeof(namebuf), "GPU-%d", cuda_dindex); - - __xpuClientOpenSession(pts, session, sockfd, namebuf, cuda_dindex); -} - -/* - * gpuOptimalBlockSize - calculates the optimal block size - * according to the function and device attributes - */ -static __thread size_t __dynamic_shmem_per_block; -static __thread size_t __dynamic_shmem_per_warp; - -static size_t -blocksize_to_shmemsize_helper(int blocksize) -{ - int n_warps = (blocksize + WARPSIZE - 1) / WARPSIZE; - - return MAXALIGN(__dynamic_shmem_per_block + - __dynamic_shmem_per_warp * n_warps); -} - -CUresult -gpuOptimalBlockSize(int *p_grid_sz, - int *p_block_sz, - unsigned int *p_shmem_sz, - CUfunction kern_function, - size_t dynamic_shmem_per_block, - size_t dynamic_shmem_per_warp) -{ - CUresult rc; - - if (dynamic_shmem_per_warp == 0) - { - rc = cuOccupancyMaxPotentialBlockSize(p_grid_sz, - p_block_sz, - kern_function, - NULL, - dynamic_shmem_per_block, - 0); - if (rc == CUDA_SUCCESS) - *p_shmem_sz = dynamic_shmem_per_block; - } - else - { - __dynamic_shmem_per_block = dynamic_shmem_per_block; - __dynamic_shmem_per_warp = dynamic_shmem_per_warp; - rc = cuOccupancyMaxPotentialBlockSize(p_grid_sz, - p_block_sz, - kern_function, - blocksize_to_shmemsize_helper, - dynamic_shmem_per_block, - 0); - if (rc == CUDA_SUCCESS) - *p_shmem_sz = blocksize_to_shmemsize_helper(*p_block_sz); - } - return rc; -} - -/* - * pgstrom_gpu_device_info - SQL function to dump device info - */ -PG_FUNCTION_INFO_V1(pgstrom_gpu_device_info); -Datum -pgstrom_gpu_device_info(PG_FUNCTION_ARGS) -{ - FuncCallContext *fncxt; - GpuDevAttributes *dattrs; - int dindex; - int aindex; - int i, val; - const char *att_name; - const char *att_value; - const char *att_desc; - Datum values[4]; - bool isnull[4]; - HeapTuple tuple; - - if (SRF_IS_FIRSTCALL()) - { - TupleDesc tupdesc; - MemoryContext oldcxt; - - fncxt = SRF_FIRSTCALL_INIT(); - oldcxt = MemoryContextSwitchTo(fncxt->multi_call_memory_ctx); - - tupdesc = CreateTemplateTupleDesc(4); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gpu_id", - INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "att_name", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "att_value", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 4, 
"att_desc", - TEXTOID, -1, 0); - fncxt->tuple_desc = BlessTupleDesc(tupdesc); - - fncxt->user_fctx = 0; - - MemoryContextSwitchTo(oldcxt); - } - fncxt = SRF_PERCALL_SETUP(); - - dindex = fncxt->call_cntr / (lengthof(GpuDevAttrCatalog) + 5); - aindex = fncxt->call_cntr % (lengthof(GpuDevAttrCatalog) + 5); - if (dindex >= numGpuDevAttrs) - SRF_RETURN_DONE(fncxt); - dattrs = &gpuDevAttrs[dindex]; - switch (aindex) - { - case 0: - att_name = "DEV_NAME"; - att_desc = "GPU Device Name"; - att_value = dattrs->DEV_NAME; - break; - case 1: - att_name = "DEV_ID"; - att_desc = "GPU Device ID"; - att_value = psprintf("%d", dattrs->DEV_ID); - break; - case 2: - att_name = "DEV_UUID"; - att_desc = "GPU Device UUID"; - att_value = psprintf("GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-" - "%02x%02x-%02x%02x%02x%02x%02x%02x", - (uint8_t)dattrs->DEV_UUID[0], - (uint8_t)dattrs->DEV_UUID[1], - (uint8_t)dattrs->DEV_UUID[2], - (uint8_t)dattrs->DEV_UUID[3], - (uint8_t)dattrs->DEV_UUID[4], - (uint8_t)dattrs->DEV_UUID[5], - (uint8_t)dattrs->DEV_UUID[6], - (uint8_t)dattrs->DEV_UUID[7], - (uint8_t)dattrs->DEV_UUID[8], - (uint8_t)dattrs->DEV_UUID[9], - (uint8_t)dattrs->DEV_UUID[10], - (uint8_t)dattrs->DEV_UUID[11], - (uint8_t)dattrs->DEV_UUID[12], - (uint8_t)dattrs->DEV_UUID[13], - (uint8_t)dattrs->DEV_UUID[14], - (uint8_t)dattrs->DEV_UUID[15]); - break; - case 3: - att_name = "DEV_TOTAL_MEMSZ"; - att_desc = "GPU Total RAM Size"; - att_value = format_bytesz(dattrs->DEV_TOTAL_MEMSZ); - break; - case 4: - att_name = "DEV_BAR1_MEMSZ"; - att_desc = "GPU PCI Bar1 Size"; - att_value = format_bytesz(dattrs->DEV_BAR1_MEMSZ); - break; - case 5: - att_name = "NUMA_NODE_ID"; - att_desc = "GPU NUMA Node Id"; - att_value = psprintf("%d", dattrs->NUMA_NODE_ID); - break; - default: - i = aindex - 6; - val = *((int *)((char *)dattrs + - GpuDevAttrCatalog[i].attr_offset)); - att_name = GpuDevAttrCatalog[i].attr_label; - att_desc = GpuDevAttrCatalog[i].attr_desc; - switch (GpuDevAttrCatalog[i].attr_id) - { - case CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: - case CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: - case CU_DEVICE_ATTRIBUTE_MAX_PITCH: - case CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: - case CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: - /* bytes */ - att_value = format_bytesz((size_t)val); - break; - - case CU_DEVICE_ATTRIBUTE_CLOCK_RATE: - case CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: - /* clock */ - if (val > 4000000) - att_value = psprintf("%.2f GHz", (double)val/1000000.0); - else if (val > 4000) - att_value = psprintf("%d MHz", val / 1000); - else - att_value = psprintf("%d kHz", val); - break; - - case CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: - /* bits */ - att_value = psprintf("%s", val != 0 ? 
"True" : "False"); - break; - - case CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: - /* compute mode */ - switch (val) - { - case CU_COMPUTEMODE_DEFAULT: - att_value = "Default"; - break; - case CU_COMPUTEMODE_PROHIBITED: - att_value = "Prohibited"; - break; - case CU_COMPUTEMODE_EXCLUSIVE_PROCESS: - att_value = "Exclusive Process"; - break; - default: - att_value = "Unknown"; - break; - } - break; - - default: - att_value = psprintf("%d", val); - break; - } - break; - } - memset(isnull, 0, sizeof(isnull)); - values[0] = Int32GetDatum(dattrs->DEV_ID); - values[1] = CStringGetTextDatum(att_name); - values[2] = CStringGetTextDatum(att_value); - values[3] = CStringGetTextDatum(att_desc); - - tuple = heap_form_tuple(fncxt->tuple_desc, values, isnull); - - SRF_RETURN_NEXT(fncxt, HeapTupleGetDatum(tuple)); -} diff --git a/next/main.c b/next/main.c deleted file mode 100644 index f3d13c8a6..000000000 --- a/next/main.c +++ /dev/null @@ -1,533 +0,0 @@ -/* - * main.c - * - * Entrypoint of PG-Strom extension - * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. - */ -#include "pg_strom.h" - -PG_MODULE_MAGIC; - -/* misc variables */ -bool pgstrom_enabled; /* GUC */ -bool pgstrom_cpu_fallback_enabled; /* GUC */ -bool pgstrom_regression_test_mode; /* GUC */ -int pgstrom_max_async_tasks; /* GUC */ -long PAGE_SIZE; -long PAGE_MASK; -int PAGE_SHIFT; -long PHYS_PAGES; -long PAGES_PER_BLOCK; - -static planner_hook_type planner_hook_next = NULL; -static CustomPathMethods pgstrom_dummy_path_methods; -static CustomScanMethods pgstrom_dummy_plan_methods; - -/* pg_strom.githash() */ -PG_FUNCTION_INFO_V1(pgstrom_githash); -Datum -pgstrom_githash(PG_FUNCTION_ARGS) -{ -#ifdef PGSTROM_GITHASH - PG_RETURN_TEXT_P(cstring_to_text(PGSTROM_GITHASH)); -#else - PG_RETURN_NULL(); -#endif -} - -/* - * pg_kern_ereport - raise an ereport at host side - */ -void -pg_kern_ereport(kern_context *kcxt) -{ - ereport(ERROR, (errcode(kcxt->errcode), - errmsg("%s:%u %s", - kcxt->error_filename, - kcxt->error_lineno, - kcxt->error_message))); -} - -/* - * pg_hash_any - the standard hash function at device code - */ -uint32_t -pg_hash_any(const void *ptr, int sz) -{ - return (uint32_t)hash_any((const unsigned char *)ptr, sz); -} - -/* - * pgstrom_init_gucs - */ -static void -pgstrom_init_gucs(void) -{ - /* Disables PG-Strom features at all */ - DefineCustomBoolVariable("pg_strom.enabled", - "Enables the planner's use of PG-Strom", - NULL, - &pgstrom_enabled, - true, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* turn on/off CPU fallback if GPU could not execute the query */ - DefineCustomBoolVariable("pg_strom.cpu_fallback", - "Enables CPU fallback if GPU required re-run", - NULL, - &pgstrom_cpu_fallback_enabled, - false, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* disables some platform specific EXPLAIN output */ - DefineCustomBoolVariable("pg_strom.regression_test_mode", - "Disables some platform specific output in EXPLAIN; that can lead undesired test failed but harmless", - NULL, - &pgstrom_regression_test_mode, - false, - PGC_USERSET, - GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - DefineCustomIntVariable("pg_strom.max_async_tasks", - "Limit of conccurent execution at the xPU devices", - NULL, - &pgstrom_max_async_tasks, - 7, - 1, - 255, - PGC_SUSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); -} - -/* - * xPU-aware path tracker 
-/*
- * xPU-aware path tracker
- *
- * motivation: add_path() and add_partial_path() keep only the cheapest paths.
- * Once some other path dominates the GpuXXX paths, they are wiped out, even
- * if they potentially have a chance for further optimization (e.g., GpuJoin
- * outer pull-up, GpuPreAgg + GpuJoin combined mode).
- * So, we preserve the PG-Strom related Path-nodes for later reference.
- */
-typedef struct
-{
-    PlannerInfo *root;
-    Relids       relids;
-    bool         parallel_path;
-    uint32_t     devkind;       /* one of DEVKIND_* */
-    CustomPath  *cpath;
-} custom_path_entry;
-
-static HTAB    *custom_path_htable = NULL;
-
-static uint32
-custom_path_entry_hashvalue(const void *key, Size keysize)
-{
-    custom_path_entry *cent = (custom_path_entry *)key;
-    uint32      hash;
-
-    hash = hash_bytes((unsigned char *)&cent->root, sizeof(PlannerInfo *));
-    hash ^= bms_hash_value(cent->relids);
-    if (cent->parallel_path)
-        hash ^= 0x9e3779b9U;
-    hash ^= hash_uint32(cent->devkind);
-
-    return hash;
-}
-
-static int
-custom_path_entry_compare(const void *key1, const void *key2, Size keysize)
-{
-    custom_path_entry *cent1 = (custom_path_entry *)key1;
-    custom_path_entry *cent2 = (custom_path_entry *)key2;
-
-    if (cent1->root == cent2->root &&
-        bms_equal(cent1->relids, cent2->relids) &&
-        cent1->parallel_path == cent2->parallel_path &&
-        cent1->devkind == cent2->devkind)
-        return 0;
-    /* not equal */
-    return 1;
-}
-
-CustomPath *
-custom_path_find_cheapest(PlannerInfo *root,
-                          RelOptInfo *rel,
-                          bool parallel_path,
-                          uint32_t devkind)
-{
-    custom_path_entry hkey;
-    custom_path_entry *cent;
-
-    memset(&hkey, 0, sizeof(custom_path_entry));
-    hkey.root = root;
-    hkey.relids = rel->relids;
-    hkey.parallel_path = (parallel_path ? true : false);
-    hkey.devkind = (devkind & DEVKIND__ANY);
-
-    cent = hash_search(custom_path_htable, &hkey, HASH_FIND, NULL);
-    if (!cent)
-        return NULL;
-    return cent->cpath;
-}
-
-bool
-custom_path_remember(PlannerInfo *root,
-                     RelOptInfo *rel,
-                     bool parallel_path,
-                     uint32_t devkind,
-                     const CustomPath *cpath)
-{
-    custom_path_entry hkey;
-    custom_path_entry *cent;
-    bool        found;
-
-    Assert((devkind & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU ||
-           (devkind & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU);
-    memset(&hkey, 0, sizeof(custom_path_entry));
-    hkey.root = root;
-    hkey.relids = rel->relids;
-    hkey.parallel_path = (parallel_path ? true : false);
-    hkey.devkind = (devkind & DEVKIND__ANY);
-
-    cent = hash_search(custom_path_htable, &hkey, HASH_ENTER, &found);
-    if (found)
-    {
-        /* reject the new path if it is not cheaper than the existing one
*/
-        if (cent->cpath->path.total_cost <= cpath->path.total_cost)
-            return false;
-    }
-    cent->cpath = (CustomPath *)pgstrom_copy_pathnode(&cpath->path);
-
-    return true;
-}
-
-/* --------------------------------------------------------------------------------
- *
- * add/remove dummy plan node
- *
- * -------------------------------------------------------------------------------- */
-Path *
-pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath)
-{
-    CustomPath *cpath = makeNode(CustomPath);
-    RelOptInfo *upper_rel = subpath->parent;
-    PathTarget *upper_target = upper_rel->reltarget;
-    PathTarget *sub_target = subpath->pathtarget;
-    ListCell   *lc1, *lc2;
-
-    /* sanity checks */
-    if (list_length(upper_target->exprs) != list_length(sub_target->exprs))
-        elog(ERROR, "CustomScan(dummy): incompatible tlist is supplied");
-    forboth (lc1, upper_target->exprs,
-             lc2, sub_target->exprs)
-    {
-        Node   *node1 = lfirst(lc1);
-        Node   *node2 = lfirst(lc2);
-
-        if (exprType(node1) != exprType(node2))
-            elog(ERROR, "CustomScan(dummy): incompatible tlist entry: [%s] <-> [%s]",
-                 nodeToString(node1),
-                 nodeToString(node2));
-    }
-    Assert(subpath->parent == upper_rel);
-    cpath->path.pathtype = T_CustomScan;
-    cpath->path.parent = upper_rel;
-    cpath->path.pathtarget = upper_target;
-    cpath->path.param_info = NULL;
-    cpath->path.parallel_aware = subpath->parallel_aware;
-    cpath->path.parallel_safe = subpath->parallel_safe;
-    cpath->path.parallel_workers = subpath->parallel_workers;
-    cpath->path.pathkeys = subpath->pathkeys;
-    cpath->path.rows = subpath->rows;
-    cpath->path.startup_cost = subpath->startup_cost;
-    cpath->path.total_cost = subpath->total_cost;
-
-    cpath->custom_paths = list_make1(subpath);
-    cpath->methods = &pgstrom_dummy_path_methods;
-
-    return &cpath->path;
-}
-
-/*
- * pgstrom_dummy_create_plan - PlanCustomPath callback
- */
-static Plan *
-pgstrom_dummy_create_plan(PlannerInfo *root,
-                          RelOptInfo *rel,
-                          CustomPath *best_path,
-                          List *tlist,
-                          List *clauses,
-                          List *custom_plans)
-{
-    CustomScan *cscan = makeNode(CustomScan);
-
-    Assert(list_length(custom_plans) == 1);
-    cscan->scan.plan.parallel_aware = best_path->path.parallel_aware;
-    cscan->scan.plan.targetlist = tlist;
-    cscan->scan.plan.qual = NIL;
-    cscan->scan.plan.lefttree = linitial(custom_plans);
-    cscan->scan.scanrelid = 0;
-    cscan->custom_scan_tlist = tlist;
-    cscan->methods = &pgstrom_dummy_plan_methods;
-
-    return &cscan->scan.plan;
-}
-
-/*
- * pgstrom_dummy_create_scan_state - CreateCustomScanState callback
- */
-static Node *
-pgstrom_dummy_create_scan_state(CustomScan *cscan)
-{
-    elog(ERROR, "Bug? dummy custom scan should not remain at the executor stage");
-}
-
-/*
- * pgstrom_removal_dummy_plans
- *
- * Due to the interface design of the create_upper_paths_hook, other path
- * nodes can be stacked on the GpuPreAgg node with the original final
- * target-list. Even if the Agg + GpuPreAgg pair adopted a modified
- * target-list, the stacked path nodes (sorting, window functions, ...)
- * still assume it has the original target-list.
- * This becomes a problem at setrefs.c, when the PostgreSQL optimizer
- * tries to replace the expressions with Var-nodes using OUTER_VAR:
- * the Agg + GpuPreAgg pair does not have the original expression,
- * which leads to a "variable not found" error.
- */
-static void
-pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan)
-{
-    Plan       *plan = *p_plan;
-    ListCell   *lc;
-
-    Assert(plan != NULL);
-    switch (nodeTag(plan))
-    {
-        case T_Append:
-            {
-                Append     *splan = (Append *)plan;
-
-                foreach (lc, splan->appendplans)
-                    pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc));
-            }
-            break;
-
-        case T_MergeAppend:
-            {
-                MergeAppend *splan = (MergeAppend *)plan;
-
-                foreach (lc, splan->mergeplans)
-                    pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc));
-            }
-            break;
-
-        case T_BitmapAnd:
-            {
-                BitmapAnd  *splan = (BitmapAnd *)plan;
-
-                foreach (lc, splan->bitmapplans)
-                    pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc));
-            }
-            break;
-
-        case T_BitmapOr:
-            {
-                BitmapOr   *splan = (BitmapOr *)plan;
-
-                foreach (lc, splan->bitmapplans)
-                    pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc));
-            }
-            break;
-
-        case T_SubqueryScan:
-            {
-                SubqueryScan *sscan = (SubqueryScan *)plan;
-
-                pgstrom_removal_dummy_plans(pstmt, &sscan->subplan);
-            }
-            break;
-
-        case T_CustomScan:
-            {
-                CustomScan *cscan = (CustomScan *)plan;
-
-                if (cscan->methods == &pgstrom_dummy_plan_methods)
-                {
-                    Plan       *subplan = outerPlan(cscan);
-                    ListCell   *lc1, *lc2;
-
-                    /* sanity checks */
-                    Assert(innerPlan(cscan) == NULL);
-                    if (list_length(cscan->scan.plan.targetlist) !=
-                        list_length(subplan->targetlist))
-                        elog(ERROR, "Bug? dummy plan's targetlist length mismatch");
-                    forboth (lc1, cscan->scan.plan.targetlist,
-                             lc2, subplan->targetlist)
-                    {
-                        TargetEntry *tle1 = lfirst(lc1);
-                        TargetEntry *tle2 = lfirst(lc2);
-
-                        if (exprType((Node *)tle1->expr) != exprType((Node *)tle2->expr))
-                            elog(ERROR, "Bug? dummy TLE type mismatch [%s] [%s]",
-                                 nodeToString(tle1),
-                                 nodeToString(tle2));
-                        /* assign resource name */
-                        tle2->resname = tle1->resname;
-                    }
-                    *p_plan = subplan;
-                    pgstrom_removal_dummy_plans(pstmt, p_plan);
-                    return;
-                }
-                foreach (lc, cscan->custom_plans)
-                    pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc));
-            }
-            break;
-
-        default:
-            /* no sub-plans that need special handling */
-            break;
-    }
-    if (plan->lefttree)
-        pgstrom_removal_dummy_plans(pstmt, &plan->lefttree);
-    if (plan->righttree)
-        pgstrom_removal_dummy_plans(pstmt, &plan->righttree);
-}
-
-/*
- * pgstrom_post_planner
- */
-static PlannedStmt *
-pgstrom_post_planner(Query *parse,
-                     const char *query_string,
-                     int cursorOptions,
-                     ParamListInfo boundParams)
-{
-    HTAB       *custom_path_htable_saved = custom_path_htable;
-    HASHCTL     hctl;
-    PlannedStmt *pstmt;
-    ListCell   *lc;
-
-    PG_TRY();
-    {
-        memset(&hctl, 0, sizeof(HASHCTL));
-        hctl.hcxt = CurrentMemoryContext;
-        hctl.keysize = offsetof(custom_path_entry, cpath);
-        hctl.entrysize = sizeof(custom_path_entry);
-        hctl.hash = custom_path_entry_hashvalue;
-        hctl.match = custom_path_entry_compare;
-        custom_path_htable = hash_create("HTable to preserve Custom-Paths",
-                                         512,
-                                         &hctl,
-                                         HASH_CONTEXT |
-                                         HASH_ELEM |
-                                         HASH_FUNCTION |
-                                         HASH_COMPARE);
-        pstmt = planner_hook_next(parse,
-                                  query_string,
-                                  cursorOptions,
-                                  boundParams);
-    }
-    PG_CATCH();
-    {
-        hash_destroy(custom_path_htable);
-        custom_path_htable = custom_path_htable_saved;
-        PG_RE_THROW();
-    }
-    PG_END_TRY();
-    hash_destroy(custom_path_htable);
-    custom_path_htable = custom_path_htable_saved;
-
-    /* remove dummy plans */
-    pgstrom_removal_dummy_plans(pstmt, &pstmt->planTree);
-    foreach (lc, pstmt->subplans)
-        pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc));
-
-    return pstmt;
-}
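pgstrom_post_planner() above is installed through planner_hook, falling back to standard_planner() when no other extension hooked the planner first. A minimal sketch of that hook-chaining pattern against the PostgreSQL v15 headers; my_post_planner() and my_ext_install_hook() are hypothetical names, not taken from this patch:

/* Minimal sketch of the planner_hook chaining pattern. */
#include "postgres.h"
#include "nodes/params.h"
#include "optimizer/planner.h"

static planner_hook_type prev_planner_hook = NULL;

static PlannedStmt *
my_post_planner(Query *parse, const char *query_string,
                int cursorOptions, ParamListInfo boundParams)
{
    PlannedStmt *pstmt;

    /* delegate to the next hook in the chain, or the default planner */
    if (prev_planner_hook)
        pstmt = prev_planner_hook(parse, query_string,
                                  cursorOptions, boundParams);
    else
        pstmt = standard_planner(parse, query_string,
                                 cursorOptions, boundParams);
    /* ... post-process pstmt->planTree here ... */
    return pstmt;
}

static void
my_ext_install_hook(void)
{
    prev_planner_hook = planner_hook;
    planner_hook = my_post_planner;
}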
-/*
- * pgstrom_sigpoll_handler
- */
-static void
-pgstrom_sigpoll_handler(SIGNAL_ARGS)
-{
-    /* do nothing here; invocation of this handler may wake up epoll(2) / poll(2) */
-}
-
-/*
- * _PG_init
- *
- * Main entrypoint of PG-Strom. It shall be invoked only once while the
- * postmaster process is starting up; it then calls the initialization
- * routine of each sub-system.
- */
-void
-_PG_init(void)
-{
-    /*
-     * PG-Strom must be loaded using shared_preload_libraries
-     */
-    if (!process_shared_preload_libraries_in_progress)
-        ereport(ERROR,
-                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-                 errmsg("PG-Strom must be loaded via shared_preload_libraries")));
-    /* init misc variables */
-    PAGE_SIZE = sysconf(_SC_PAGESIZE);
-    PAGE_MASK = PAGE_SIZE - 1;
-    PAGE_SHIFT = get_next_log2(PAGE_SIZE);
-    PHYS_PAGES = sysconf(_SC_PHYS_PAGES);
-    PAGES_PER_BLOCK = BLCKSZ / PAGE_SIZE;
-
-    /* init pg-strom infrastructure */
-    pgstrom_init_gucs();
-    pgstrom_init_extra();
-    pgstrom_init_codegen();
-    pgstrom_init_relscan();
-    pgstrom_init_brin();
-    pgstrom_init_arrow_fdw();
-    pgstrom_init_executor();
-    /* dump version number */
-    elog(LOG, "PG-Strom version %s built for PostgreSQL %s (git: %s)",
-         PGSTROM_VERSION,
-         PG_MAJORVERSION,
-         PGSTROM_GITHASH);
-    /* init GPU related stuff */
-    if (pgstrom_init_gpu_device())
-    {
-        pgstrom_init_gpu_service();
-        pgstrom_init_gpu_scan();
-        pgstrom_init_gpu_join();
-        pgstrom_init_gpu_preagg();
-    }
-    /* init DPU related stuff */
-    if (pgstrom_init_dpu_device())
-    {
-        pgstrom_init_dpu_scan();
-        pgstrom_init_dpu_join();
-        pgstrom_init_dpu_preagg();
-    }
-    pgstrom_init_pcie();
-    /* dummy custom-scan node */
-    memset(&pgstrom_dummy_path_methods, 0, sizeof(CustomPathMethods));
-    pgstrom_dummy_path_methods.CustomName = "Dummy";
-    pgstrom_dummy_path_methods.PlanCustomPath = pgstrom_dummy_create_plan;
-
-    memset(&pgstrom_dummy_plan_methods, 0, sizeof(CustomScanMethods));
-    pgstrom_dummy_plan_methods.CustomName = "Dummy";
-    pgstrom_dummy_plan_methods.CreateCustomScanState = pgstrom_dummy_create_scan_state;
-
-    /* post planner hook */
-    planner_hook_next = (planner_hook ? planner_hook : standard_planner);
-    planner_hook = pgstrom_post_planner;
-    /* signal handler for wakeup */
-    pqsignal(SIGPOLL, pgstrom_sigpoll_handler);
-}
diff --git a/next/pg_strom.h b/next/pg_strom.h
deleted file mode 100644
index 7260b8545..000000000
--- a/next/pg_strom.h
+++ /dev/null
@@ -1,941 +0,0 @@
-/*
- * pg_strom.h
- *
- * Header file of pg_strom module
- * --
- * Copyright 2011-2023 (C) KaiGai Kohei
- * Copyright 2014-2023 (C) PG-Strom Developers Team
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the PostgreSQL License.
- */ -#ifndef PG_STROM_H -#define PG_STROM_H - -#include "postgres.h" -#if PG_VERSION_NUM < 150000 -#error Base PostgreSQL version must be v15 or later -#endif -#define PG_MAJOR_VERSION (PG_VERSION_NUM / 100) -#define PG_MINOR_VERSION (PG_VERSION_NUM % 100) - -#include "access/brin.h" -#include "access/heapam.h" -#include "access/genam.h" -#include "access/reloptions.h" -#include "access/relscan.h" -#include "access/syncscan.h" -#include "access/table.h" -#include "access/tableam.h" -#include "access/visibilitymap.h" -#include "access/xact.h" -#include "catalog/binary_upgrade.h" -#include "catalog/dependency.h" -#include "catalog/indexing.h" -#include "catalog/namespace.h" -#include "catalog/objectaccess.h" -#include "catalog/pg_aggregate.h" -#include "catalog/pg_am.h" -#include "catalog/pg_amop.h" -#include "catalog/pg_cast.h" -#include "catalog/pg_depend.h" -#include "catalog/pg_foreign_table.h" -#include "catalog/pg_foreign_data_wrapper.h" -#include "catalog/pg_foreign_server.h" -#include "catalog/pg_user_mapping.h" -#include "catalog/pg_extension.h" -#include "catalog/pg_namespace.h" -#include "catalog/pg_proc.h" -#include "catalog/pg_statistic.h" -#include "catalog/pg_tablespace_d.h" -#include "catalog/pg_type.h" -#include "commands/defrem.h" -#include "commands/event_trigger.h" -#include "commands/extension.h" -#include "commands/tablecmds.h" -#include "commands/tablespace.h" -#include "commands/typecmds.h" -#include "common/hashfn.h" -#include "common/int.h" -#include "executor/nodeSubplan.h" -#include "foreign/fdwapi.h" -#include "foreign/foreign.h" -#include "funcapi.h" -#include "libpq/pqformat.h" -#include "lib/stringinfo.h" -#include "miscadmin.h" -#include "nodes/extensible.h" -#include "nodes/makefuncs.h" -#include "nodes/nodeFuncs.h" -#include "nodes/pathnodes.h" -#include "optimizer/clauses.h" -#include "optimizer/cost.h" -#include "optimizer/optimizer.h" -#include "optimizer/pathnode.h" -#include "optimizer/paths.h" -#include "optimizer/plancat.h" -#include "optimizer/planner.h" -#include "optimizer/planmain.h" -#include "optimizer/restrictinfo.h" -#include "optimizer/tlist.h" -#include "parser/parse_func.h" -#include "postmaster/bgworker.h" -#include "postmaster/postmaster.h" -#include "storage/bufmgr.h" -#include "storage/buf_internals.h" -#include "storage/ipc.h" -#include "storage/fd.h" -#include "storage/latch.h" -#include "storage/pmsignal.h" -#include "storage/shmem.h" -#include "storage/smgr.h" -#include "utils/builtins.h" -#include "utils/cash.h" -#include "utils/catcache.h" -#include "utils/date.h" -#include "utils/datetime.h" -#include "utils/float.h" -#include "utils/fmgroids.h" -#include "utils/guc.h" -#include "utils/inet.h" -#include "utils/inval.h" -#include "utils/jsonb.h" -#include "utils/lsyscache.h" -#include "utils/pg_locale.h" -#include "utils/rangetypes.h" -#include "utils/regproc.h" -#include "utils/rel.h" -#include "utils/resowner.h" -#include "utils/ruleutils.h" -#include "utils/selfuncs.h" -#include "utils/spccache.h" -#include "utils/syscache.h" -#include "utils/timestamp.h" -#include "utils/tuplestore.h" -#include "utils/typcache.h" -#include "utils/uuid.h" -#include "utils/wait_event.h" -#include -#define CUDA_API_PER_THREAD_DEFAULT_STREAM 1 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "xpu_common.h" -#include "pg_utils.h" -#include "heterodb_extra.h" - -/* 
------------------------------------------------ - * - * Global Type Definitions - * - * ------------------------------------------------ - */ -typedef struct GpuDevAttributes -{ - int32 NUMA_NODE_ID; - int32 DEV_ID; - char DEV_NAME[256]; - char DEV_UUID[sizeof(CUuuid)]; - size_t DEV_TOTAL_MEMSZ; - size_t DEV_BAR1_MEMSZ; - bool DEV_SUPPORT_GPUDIRECTSQL; -#define DEV_ATTR(LABEL,DESC) \ - int32 LABEL; -#include "gpu_devattrs.h" -#undef DEV_ATTR -} GpuDevAttributes; - -extern GpuDevAttributes *gpuDevAttrs; -extern int numGpuDevAttrs; -#define GPUKERNEL_MAX_SM_MULTIPLICITY 4 - -/* - * devtype/devfunc/devcast definitions - */ -struct devtype_info; -struct devfunc_info; -struct devcast_info; - -typedef uint32_t (*devtype_hashfunc_f)(bool isnull, Datum value); - -typedef struct devtype_info -{ - uint32_t hash; - TypeOpCode type_code; - Oid type_oid; - uint64_t type_flags; - int16 type_length; - int16 type_align; - bool type_byval; - bool type_is_negative; - const char *type_name; - const char *type_extension; - int type_sizeof; - int type_alignof; - devtype_hashfunc_f type_hashfunc; - /* oid of type related functions */ - Oid type_eqfunc; - Oid type_cmpfunc; - /* alias type, if any */ - struct devtype_info *type_alias; - /* element type of array, if type is array */ - struct devtype_info *type_element; - /* attribute of sub-fields, if type is composite */ - int comp_nfields; - struct devtype_info *comp_subtypes[1]; -} devtype_info; - -typedef struct devfunc_info -{ - dlist_node chain; - uint32_t hash; - FuncOpCode func_code; - const char *func_extension; - const char *func_name; - Oid func_oid; - struct devtype_info *func_rettype; - uint64_t func_flags; - int func_cost; - bool func_is_negative; - int func_nargs; - struct devtype_info *func_argtypes[1]; -} devfunc_info; - -typedef struct XpuConnection XpuConnection; -typedef struct GpuCacheState GpuCacheState; -typedef struct DpuStorageEntry DpuStorageEntry; -typedef struct ArrowFdwState ArrowFdwState; -typedef struct BrinIndexState BrinIndexState; - -/* - * pgstromPlanInfo - */ -typedef struct -{ - JoinType join_type; /* one of JOIN_* */ - double join_nrows; /* estimated nrows in this depth */ - List *hash_outer_keys;/* hash-keys for outer-side */ - List *hash_outer_keys_fallback; - List *hash_inner_keys;/* hash-keys for inner-side */ - List *hash_inner_keys_fallback; - List *join_quals; /* join quals */ - List *join_quals_fallback; - List *other_quals; /* other quals */ - List *other_quals_fallback; - Oid gist_index_oid; /* GiST index oid */ - AttrNumber gist_index_col; /* GiST index column number */ - Node *gist_clause; /* GiST index clause */ - Selectivity gist_selectivity; /* GiST selectivity */ -} pgstromPlanInnerInfo; - -typedef struct -{ - uint32_t task_kind; /* one of TASK_KIND__* */ - const Bitmapset *gpu_cache_devs; /* device for GpuCache, if any */ - const Bitmapset *gpu_direct_devs; /* device for GPU-Direct SQL, if any */ - const DpuStorageEntry *ds_entry; /* target DPU if DpuJoin */ - /* Plan information */ - const Bitmapset *outer_refs; /* referenced columns */ - List *used_params; /* param list in use */ - List *host_quals; /* host qualifiers to scan the outer */ - Index scan_relid; /* relid of the outer relation to scan */ - List *scan_quals; /* device qualifiers to scan the outer */ - List *scan_quals_fallback;/* 'scan_quals' for CPU fallback */ - double scan_tuples; /* copy of baserel->tuples */ - double scan_rows; /* copy of baserel->rows */ - double parallel_divisor; /* parallel divisor */ - Cost final_cost; /* cost for 
sendback and host-side tasks */ - /* BRIN-index support */ - Oid brin_index_oid; /* OID of BRIN-index, if any */ - List *brin_index_conds; /* BRIN-index key conditions */ - List *brin_index_quals; /* Original BRIN-index qualifier */ - /* XPU code for JOIN */ - bytea *kexp_scan_kvars_load; /* VarLoads at depth=0 */ - bytea *kexp_scan_quals; - bytea *kexp_join_kvars_load_packed; /* VarLoads at depth>0 */ - bytea *kexp_join_quals_packed; - bytea *kexp_hash_keys_packed; - bytea *kexp_gist_quals_packed; - bytea *kexp_projection; - bytea *kexp_groupby_keyhash; - bytea *kexp_groupby_keyload; - bytea *kexp_groupby_keycomp; - bytea *kexp_groupby_actions; - List *kvars_depth; - List *kvars_resno; - List *kvars_types; /* type-oid, if it needs extra buffer on kvars-slot */ - List *kvars_exprs; - uint32_t extra_flags; - uint32_t extra_bufsz; - /* fallback projection */ - List *fallback_tlist; /* fallback_slot -> custom_scan_tlist if JOIN/PREAGG */ - /* group-by parameters */ - List *groupby_actions; /* list of KAGG_ACTION__* on the kds_final */ - List *groupby_keys; /* resno of grouping keys, if GROUP BY exists */ - /* inner relations */ - int num_rels; - pgstromPlanInnerInfo inners[FLEXIBLE_ARRAY_MEMBER]; -} pgstromPlanInfo; - -/* - * pgstromSharedState - */ -typedef struct -{ - pg_atomic_uint64 inner_nitems; - pg_atomic_uint64 inner_usage; -} pgstromSharedInnerState; - -typedef struct -{ - dsm_handle ss_handle; /* DSM handle of the SharedState */ - uint32_t ss_length; /* length of the SharedState */ - /* pg-strom's unique plan-id */ - uint64_t query_plan_id; - /* control variables to detect the last plan-node at parallel execution */ - pg_atomic_uint32 scan_task_control; - slock_t __rjoin_control_lock; - /* statistics */ - pg_atomic_uint64 source_ntuples; - pg_atomic_uint64 source_nvalids; - pg_atomic_uint32 source_nblocks; /* only KDS_FORMAT_BLOCK */ - /* for arrow_fdw */ - pg_atomic_uint32 arrow_rbatch_index; - pg_atomic_uint32 arrow_rbatch_nload; /* # of loaded record-batches */ - pg_atomic_uint32 arrow_rbatch_nskip; /* # of skipped record-batches */ - /* for gpu-cache */ - pg_atomic_uint32 gcache_fetch_count; - /* for gpu/dpu-direct */ - pg_atomic_uint32 heap_normal_nblocks; - pg_atomic_uint32 heap_direct_nblocks; - pg_atomic_uint32 heap_fallback_nblocks; - /* for brin-index */ - pg_atomic_uint32 brin_index_fetched; - pg_atomic_uint32 brin_index_skipped; - /* for join-inner-preload */ - ConditionVariable preload_cond; /* sync object */ - slock_t preload_mutex; /* mutex for inner-preloading */ - int preload_phase; /* one of INNER_PHASE__* in gpu_join.c */ - int preload_nr_scanning;/* # of scanning process */ - int preload_nr_setup; /* # of setup process */ - uint32_t preload_shmem_handle; /* host buffer handle */ - uint64_t preload_shmem_length; /* host buffer length */ - /* for join-inner relations */ - uint32_t num_rels; /* if xPU-JOIN involved */ - pgstromSharedInnerState inners[FLEXIBLE_ARRAY_MEMBER]; - /* - * MEMO: ...and ParallelBlockTableScanDescData should be allocated - * next to the inners[nmum_rels] array - */ -} pgstromSharedState; - -typedef struct -{ - PlanState *ps; - ExprContext *econtext; - /* - * inner preload buffer - */ - List *preload_tuples; - List *preload_hashes; /* if hash-join or gist-join */ - size_t preload_usage; - /* - * join properties (common) - */ - int depth; - JoinType join_type; - ExprState *join_quals; - ExprState *other_quals; - /* - * join properties (hash-join) - */ - List *hash_outer_keys; /* list of ExprState */ - List *hash_inner_keys; /* list of 
ExprState */ - List *hash_outer_dtypes; /* list of devtype_info */ - List *hash_inner_dtypes; /* list of devtype_info */ - /* - * join properties (gist-join) - */ - Relation gist_irel; - ExprState *gist_clause; -} pgstromTaskInnerState; - -struct pgstromTaskState -{ - CustomScanState css; - uint32_t task_kind; /* one of TASK_KIND__* */ - const Bitmapset *optimal_gpus; /* candidate GPUs to connect */ - const DpuStorageEntry *ds_entry; /* candidate DPUs to connect */ - XpuConnection *conn; - pgstromSharedState *ps_state; /* on the shared-memory segment */ - pgstromPlanInfo *pp_info; - GpuCacheState *gcache_state; - ArrowFdwState *arrow_state; - BrinIndexState *br_state; - kern_multirels *h_kmrels; /* host inner buffer (if JOIN) */ - const char *kds_pathname; /* pathname to be used for KDS setup */ - /* current chunk (already processed by the device) */ - XpuCommand *curr_resp; - HeapTupleData curr_htup; - kern_data_store *curr_kds; - int curr_chunk; - int64_t curr_index; - bool scan_done; - bool final_done; - /* control variables to handle right outer join */ - slock_t *rjoin_control_lock; - int *rjoin_control_array; /* per xPU device */ - /* base relation scan, if any */ - TupleTableSlot *base_slot; - ExprState *base_quals; /* equivalent to device quals */ - /* CPU fallback support */ - off_t *fallback_tuples; - size_t fallback_index; - size_t fallback_nitems; - size_t fallback_nrooms; - size_t fallback_usage; - size_t fallback_bufsz; - char *fallback_buffer; - TupleTableSlot *fallback_slot; /* host-side kvars-slot */ - ProjectionInfo *fallback_proj; /* base or fallback slot -> custom_tlist */ - /* request command buffer (+ status for table scan) */ - TBMIterateResult *curr_tbm; - Buffer curr_vm_buffer; /* for visibility-map */ - BlockNumber curr_block_num; /* for KDS_FORMAT_BLOCK */ - BlockNumber curr_block_tail; /* for KDS_FORMAT_BLOCK */ - StringInfoData xcmd_buf; - /* callbacks */ - TupleTableSlot *(*cb_next_tuple)(struct pgstromTaskState *pts); - XpuCommand *(*cb_next_chunk)(struct pgstromTaskState *pts, - struct iovec *xcmd_iov, int *xcmd_iovcnt); - XpuCommand *(*cb_final_chunk)(struct pgstromTaskState *pts, - kern_final_task *fin, - struct iovec *xcmd_iov, int *xcmd_iovcnt); - void (*cb_cpu_fallback)(struct pgstromTaskState *pts, - struct kern_data_store *kds, - HeapTuple htuple); - /* inner relations state (if JOIN) */ - int num_rels; - pgstromTaskInnerState inners[FLEXIBLE_ARRAY_MEMBER]; -}; -typedef struct pgstromTaskState pgstromTaskState; - -/* - * Global variables - */ -extern long PAGE_SIZE; -extern long PAGE_MASK; -extern int PAGE_SHIFT; -extern long PHYS_PAGES; -extern long PAGES_PER_BLOCK; /* (BLCKSZ / PAGE_SIZE) */ -#define PAGE_ALIGN(x) TYPEALIGN(PAGE_SIZE,(x)) -#define PGSTROM_CHUNK_SIZE ((size_t)(65534UL << 10)) - -/* - * extra.c - */ -extern void pgstrom_init_extra(void); -extern bool heterodbValidateDevice(int gpu_device_id, - const char *gpu_device_name, - const char *gpu_device_uuid); -extern bool gpuDirectOpenDriver(void); -extern void gpuDirectCloseDriver(void); -extern bool gpuDirectMapGpuMemory(CUdeviceptr m_segment, - size_t segment_sz); -extern bool gpuDirectUnmapGpuMemory(CUdeviceptr m_segment); -extern bool gpuDirectFileReadIOV(const char *pathname, - CUdeviceptr m_segment, - off_t m_offset, - const strom_io_vector *iovec); -extern char *gpuDirectGetProperty(void); -extern void gpuDirectSetProperty(const char *key, const char *value); -extern bool gpuDirectIsAvailable(void); - -/* - * codegen.c - */ -typedef struct -{ - int elevel; /* ERROR or DEBUG2 */ - 
Expr *top_expr; - List *used_params; - uint32_t required_flags; - uint32_t extra_flags; - uint32_t extra_bufsz; - uint32_t device_cost; - uint32_t kexp_flags; - List *kvars_depth; - List *kvars_resno; - List *kvars_types; - List *kvars_exprs; - List *tlist_dev; - uint32_t kvars_nslots; - List *input_rels_tlist; -} codegen_context; - -extern devtype_info *pgstrom_devtype_lookup(Oid type_oid); -extern devfunc_info *pgstrom_devfunc_lookup(Oid func_oid, - List *func_args, - Oid func_collid); - -extern devfunc_info *devtype_lookup_equal_func(devtype_info *dtype, Oid coll_id); -extern devfunc_info *devtype_lookup_compare_func(devtype_info *dtype, Oid coll_id); - -extern void codegen_context_init(codegen_context *context, - uint32_t task_kind); -extern bytea *codegen_build_qualifiers(codegen_context *context, - List *dev_quals); -extern bytea *codegen_build_scan_loadvars(codegen_context *context); -extern bytea *codegen_build_scan_quals(codegen_context *context, - List *dev_quals); -extern bytea *codegen_build_join_loadvars(codegen_context *context); -extern bytea *codegen_build_packed_joinquals(codegen_context *context, - List *stacked_join_quals, - List *stacked_other_quals); -extern bytea *codegen_build_packed_hashkeys(codegen_context *context, - List *stacked_hash_values); -extern bytea *codegen_build_projection(codegen_context *context); -extern void codegen_build_groupby_actions(codegen_context *context, - pgstromPlanInfo *pp_info); -extern void codegen_build_packed_xpucode(bytea **p_xpucode, - List *exprs_list, - bool inject_hash_value, - List *input_rels_tlist, - uint32_t *p_extra_flags, - uint32_t *p_extra_bufsz, - uint32_t *p_kvars_nslots, - List **p_used_params); -extern bool pgstrom_xpu_expression(Expr *expr, - uint32_t task_kind, - List *input_rels_tlist, - int *p_devcost); -extern bool pgstrom_gpu_expression(Expr *expr, - List *input_rels_tlist, - int *p_devcost); -extern bool pgstrom_dpu_expression(Expr *expr, - List *input_rels_tlist, - int *p_devcost); -extern void pgstrom_explain_xpucode(const CustomScanState *css, - ExplainState *es, - List *dcontext, - const char *label, - bytea *xpucode); -extern char *pgstrom_xpucode_to_string(bytea *xpu_code); -extern void pgstrom_init_codegen(void); - -/* - * brin.c - */ -extern IndexOptInfo *pgstromTryFindBrinIndex(PlannerInfo *root, - RelOptInfo *baserel, - List **p_indexConds, - List **p_indexQuals, - int64_t *p_indexNBlocks); -extern Cost cost_brin_bitmap_build(PlannerInfo *root, - RelOptInfo *baserel, - IndexOptInfo *indexOpt, - List *indexQuals); - -extern void pgstromBrinIndexExecBegin(pgstromTaskState *pts, - Oid index_oid, - List *index_conds, - List *index_quals); -extern bool pgstromBrinIndexNextChunk(pgstromTaskState *pts); -extern TBMIterateResult *pgstromBrinIndexNextBlock(pgstromTaskState *pts); -extern void pgstromBrinIndexExecEnd(pgstromTaskState *pts); -extern void pgstromBrinIndexExecReset(pgstromTaskState *pts); -extern Size pgstromBrinIndexEstimateDSM(pgstromTaskState *pts); -extern Size pgstromBrinIndexInitDSM(pgstromTaskState *pts, char *dsm_addr); -extern Size pgstromBrinIndexAttachDSM(pgstromTaskState *pts, char *dsm_addr); -extern void pgstromBrinIndexShutdownDSM(pgstromTaskState *pts); -extern void pgstromBrinIndexExplain(pgstromTaskState *pts, - List *dcontext, - ExplainState *es); -extern void pgstrom_init_brin(void); - -/* - * relscan.c - */ -extern Bitmapset *pickup_outer_referenced(PlannerInfo *root, - RelOptInfo *base_rel, - Bitmapset *referenced); -extern size_t estimate_kern_data_store(TupleDesc tupdesc); 
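estimate_kern_data_store() above and setup_kern_data_store() declared just below form a two-pass pattern: the caller first sizes the kern_data_store header (including nested colmeta entries for arrays and composites), then initializes it in place. A hedged usage sketch; tupdesc, data_sz and relation are assumed to be provided by the caller, and KDS_FORMAT_ROW is one of the format codes that setup_kern_data_store() handles:

/* two-pass pattern: estimate the colmeta array size, then set it up */
size_t  head_sz = estimate_kern_data_store(tupdesc);
kern_data_store *kds = palloc0(head_sz + data_sz);

setup_kern_data_store(kds, tupdesc, head_sz + data_sz, KDS_FORMAT_ROW);
/* table_oid and hash_nslots are "to be set by the caller", per the
 * comments in setup_kern_data_store() */
kds->table_oid = RelationGetRelid(relation);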
-extern size_t setup_kern_data_store(kern_data_store *kds, - TupleDesc tupdesc, - size_t length, - char format); -extern XpuCommand *pgstromRelScanChunkDirect(pgstromTaskState *pts, - struct iovec *xcmd_iov, - int *xcmd_iovcnt); -extern XpuCommand *pgstromRelScanChunkNormal(pgstromTaskState *pts, - struct iovec *xcmd_iov, - int *xcmd_iovcnt); -extern void pgstromStoreFallbackTuple(pgstromTaskState *pts, HeapTuple tuple); -extern TupleTableSlot *pgstromFetchFallbackTuple(pgstromTaskState *pts); -extern void pgstrom_init_relscan(void); - -/* - * optimizer.c - */ - - - -/* - * executor.c - */ -extern void __xpuClientOpenSession(pgstromTaskState *pts, - const XpuCommand *session, - pgsocket sockfd, - const char *devname, - int dev_index); -extern int -xpuConnectReceiveCommands(pgsocket sockfd, - void *(*alloc_f)(void *priv, size_t sz), - void (*attach_f)(void *priv, XpuCommand *xcmd), - void *priv, - const char *error_label); -extern void xpuClientCloseSession(XpuConnection *conn); -extern void xpuClientSendCommand(XpuConnection *conn, const XpuCommand *xcmd); -extern void xpuClientPutResponse(XpuCommand *xcmd); -extern const XpuCommand *pgstromBuildSessionInfo(pgstromTaskState *pts, - uint32_t join_inner_handle, - TupleDesc tdesc_final); -extern void pgstromExecInitTaskState(CustomScanState *node, - EState *estate, - int eflags); -extern TupleTableSlot *pgstromExecTaskState(CustomScanState *node); -extern void pgstromExecEndTaskState(CustomScanState *node); -extern void pgstromExecResetTaskState(CustomScanState *node); -extern Size pgstromSharedStateEstimateDSM(CustomScanState *node, - ParallelContext *pcxt); -extern void pgstromSharedStateInitDSM(CustomScanState *node, - ParallelContext *pcxt, - void *coordinate); -extern void pgstromSharedStateAttachDSM(CustomScanState *node, - shm_toc *toc, - void *coordinate); -extern void pgstromSharedStateShutdownDSM(CustomScanState *node); -extern void pgstromExplainTaskState(CustomScanState *node, - List *ancestors, - ExplainState *es); -extern void pgstrom_init_executor(void); - -/* - * pcie.c - */ -extern const Bitmapset *GetOptimalGpuForFile(const char *pathname); -extern const Bitmapset *GetOptimalGpuForRelation(Relation relation); -extern const Bitmapset *GetOptimalGpuForBaseRel(PlannerInfo *root, - RelOptInfo *baserel); -extern void pgstrom_init_pcie(void); - -/* - * gpu_device.c - */ -extern double pgstrom_gpu_setup_cost; /* GUC */ -extern double pgstrom_gpu_tuple_cost; /* GUC */ -extern double pgstrom_gpu_operator_cost; /* GUC */ -extern double pgstrom_gpu_direct_seq_page_cost; /* GUC */ -extern double pgstrom_gpu_operator_ratio(void); -extern void gpuClientOpenSession(pgstromTaskState *pts, - const XpuCommand *session); -extern CUresult gpuOptimalBlockSize(int *p_grid_sz, - int *p_block_sz, - unsigned int *p_shmem_sz, - CUfunction kern_function, - size_t dynamic_shmem_per_block, - size_t dynamic_shmem_per_warp); -extern bool pgstrom_init_gpu_device(void); - -/* - * gpu_service.c - */ -struct gpuClient -{ - struct gpuContext *gcontext;/* per-device status */ - dlist_node chain; /* gcontext->client_list */ - CUmodule cuda_module;/* preload cuda binary */ - kern_session_info *session; /* per session info (on cuda managed memory) */ - struct gpuQueryBuffer *gq_buf; /* per query join/preagg device buffer */ - pg_atomic_uint32 refcnt; /* odd number, if error status */ - pthread_mutex_t mutex; /* mutex to write the socket */ - int sockfd; /* connection to PG backend */ - pthread_t worker; /* receiver thread */ -}; -typedef struct gpuClient 
gpuClient; - -extern int pgstrom_max_async_gpu_tasks; /* GUC */ -extern bool pgstrom_load_gpu_debug_module; /* GUC */ -extern const char *cuStrError(CUresult rc); -extern void __gpuClientELogRaw(gpuClient *gclient, - kern_errorbuf *errorbuf); -extern void __gpuClientELog(gpuClient *gclient, - int errcode, - const char *filename, int lineno, - const char *funcname, - const char *fmt, ...); -#define gpuClientELog(gclient,fmt,...) \ - __gpuClientELog((gclient), ERRCODE_DEVICE_INTERNAL, \ - __FILE__, __LINE__, __FUNCTION__, \ - (fmt), ##__VA_ARGS__) -#define gpuClientFatal(gclient,fmt,...) \ - __gpuClientELog((gclient), ERRCODE_DEVICE_FATAL, \ - __FILE__, __LINE__, __FUNCTION__, \ - (fmt), ##__VA_ARGS__) - -extern __thread int CU_DINDEX_PER_THREAD; -extern __thread CUdevice CU_DEVICE_PER_THREAD; -extern __thread CUcontext CU_CONTEXT_PER_THREAD; -extern __thread CUevent CU_EVENT_PER_THREAD; - -typedef struct -{ - CUdeviceptr __base__; - size_t __offset__; - size_t __length__; - CUdeviceptr m_devptr; -} gpuMemChunk; - -extern const gpuMemChunk *gpuMemAlloc(size_t bytesize); -extern void gpuMemFree(const gpuMemChunk *chunk); -extern const gpuMemChunk *gpuservLoadKdsBlock(gpuClient *gclient, - kern_data_store *kds, - const char *pathname, - strom_io_vector *kds_iovec); -extern const gpuMemChunk *gpuservLoadKdsArrow(gpuClient *gclient, - kern_data_store *kds, - const char *pathname, - strom_io_vector *kds_iovec); -extern bool gpuServiceGoingTerminate(void); -extern void gpuClientWriteBack(gpuClient *gclient, - XpuCommand *resp, - size_t resp_sz, - int kds_nitems, - kern_data_store **kds_array); -extern void pgstrom_init_gpu_service(void); - -/* - * gpu_cache.c - */ - - - - - -/* - * gpu_scan.c - */ -extern void sort_device_qualifiers(List *dev_quals_list, - List *dev_costs_list); -extern CustomPath *buildXpuScanPath(PlannerInfo *root, - RelOptInfo *baserel, - bool parallel_path, - bool allow_host_quals, - bool allow_no_device_quals, - uint32_t task_kind); -extern CustomScan *PlanXpuScanPathCommon(PlannerInfo *root, - RelOptInfo *baserel, - CustomPath *best_path, - List *tlist, - List *clauses, - pgstromPlanInfo *pp_info, - const CustomScanMethods *methods); -extern void ExecFallbackCpuScan(pgstromTaskState *pts, - kern_data_store *kds, - HeapTuple tuple); -extern void gpuservHandleGpuScanExec(gpuClient *gclient, XpuCommand *xcmd); -extern void pgstrom_init_gpu_scan(void); - -/* - * gpu_join.c - */ -extern void form_pgstrom_plan_info(CustomScan *cscan, - pgstromPlanInfo *pp_info); -extern pgstromPlanInfo *deform_pgstrom_plan_info(CustomScan *cscan); -extern void extract_input_path_params(const Path *input_path, - const Path *inner_path, /* optional */ - pgstromPlanInfo **p_pp_info, - List **p_input_paths_tlist, - List **p_inner_paths_list); -extern void xpujoin_add_custompath(PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType join_type, - JoinPathExtraData *extra, - uint32_t task_kind, - const CustomPathMethods *methods); -extern List *build_fallback_exprs_scan(Index scan_relid, List *scan_exprs); -extern List *build_fallback_exprs_join(codegen_context *context, - List *join_exprs); -extern CustomScan *PlanXpuJoinPathCommon(PlannerInfo *root, - RelOptInfo *joinrel, - CustomPath *cpath, - List *tlist, - List *custom_plans, - pgstromPlanInfo *pp_info, - const CustomScanMethods *methods); -extern uint32_t GpuJoinInnerPreload(pgstromTaskState *pts); -extern void ExecFallbackCpuJoin(pgstromTaskState *pts, - kern_data_store *kds, - HeapTuple tuple); 
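The gpuClient structure above tracks its reference count as "pg_atomic_uint32 refcnt" with the note "odd number, if error status". The convention presumably advances the counter in steps of two, leaving the low bit free as an error flag; that reading is an assumption, not confirmed by this patch. A self-contained C11 sketch of such a parity-flag counter, using plain stdatomic.h instead of PostgreSQL's pg_atomic_* API (all names illustrative):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned int refcnt = 2;     /* even value: healthy, one reference */

static void
client_get(void)
{
    atomic_fetch_add(&refcnt, 2);           /* step by 2 keeps the parity intact */
}

static bool
client_mark_error(void)
{
    /* set the low bit once; true if this call was the one that flagged it */
    unsigned int old = atomic_fetch_or(&refcnt, 1);

    return (old & 1) == 0;
}

static bool
client_has_error(void)
{
    return (atomic_load(&refcnt) & 1) != 0;
}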
-extern void ExecFallbackCpuJoinRightOuter(pgstromTaskState *pts); -extern void pgstrom_init_gpu_join(void); - -/* - * gpu_groupby.c - */ -extern int pgstrom_hll_register_bits; -extern void xpupreagg_add_custompath(PlannerInfo *root, - RelOptInfo *input_rel, - RelOptInfo *group_rel, - void *extra, - uint32_t task_kind, - const CustomPathMethods *methods); -extern void ExecFallbackCpuPreAgg(pgstromTaskState *pts, - kern_data_store *kds, - HeapTuple tuple); -extern void pgstrom_init_gpu_preagg(void); - -/* - * arrow_fdw.c and arrow_read.c - */ -extern bool baseRelIsArrowFdw(RelOptInfo *baserel); -extern bool RelationIsArrowFdw(Relation frel); -extern const Bitmapset *GetOptimalGpusForArrowFdw(PlannerInfo *root, - RelOptInfo *baserel); -extern const DpuStorageEntry *GetOptimalDpuForArrowFdw(PlannerInfo *root, - RelOptInfo *baserel); -extern bool pgstromArrowFdwExecInit(pgstromTaskState *pts, - List *outer_quals, - const Bitmapset *outer_refs); -extern XpuCommand *pgstromScanChunkArrowFdw(pgstromTaskState *pts, - struct iovec *xcmd_iov, - int *xcmd_iovcnt); -extern void pgstromArrowFdwExecEnd(ArrowFdwState *arrow_state); -extern void pgstromArrowFdwExecReset(ArrowFdwState *arrow_state); -extern void pgstromArrowFdwInitDSM(ArrowFdwState *arrow_state, - pgstromSharedState *ps_state); -extern void pgstromArrowFdwAttachDSM(ArrowFdwState *arrow_state, - pgstromSharedState *ps_state); -extern void pgstromArrowFdwShutdown(ArrowFdwState *arrow_state); -extern void pgstromArrowFdwExplain(ArrowFdwState *arrow_state, - Relation frel, - ExplainState *es, - List *dcontext); -extern bool kds_arrow_fetch_tuple(TupleTableSlot *slot, - kern_data_store *kds, - size_t index, - const Bitmapset *referenced); -extern void pgstrom_init_arrow_fdw(void); - -/* - * dpu_device.c - */ -extern double pgstrom_dpu_setup_cost; -extern double pgstrom_dpu_operator_cost; -extern double pgstrom_dpu_seq_page_cost; -extern double pgstrom_dpu_tuple_cost; -extern bool pgstrom_dpu_handle_cached_pages; -extern double pgstrom_dpu_operator_ratio(void); - -extern const DpuStorageEntry *GetOptimalDpuForFile(const char *filename, - const char **p_dpu_pathname); -extern const DpuStorageEntry *GetOptimalDpuForBaseRel(PlannerInfo *root, - RelOptInfo *baserel); -extern const DpuStorageEntry *GetOptimalDpuForRelation(Relation relation, - const char **p_dpu_pathname); -extern const char *DpuStorageEntryBaseDir(const DpuStorageEntry *ds_entry); -extern bool DpuStorageEntryIsEqual(const DpuStorageEntry *ds_entry1, - const DpuStorageEntry *ds_entry2); -extern int DpuStorageEntryGetEndpointId(const DpuStorageEntry *ds_entry); -extern const DpuStorageEntry *DpuStorageEntryByEndpointId(int endpoint_id); -extern int DpuStorageEntryCount(void); -extern void DpuClientOpenSession(pgstromTaskState *pts, - const XpuCommand *session); -extern void explainDpuStorageEntry(const DpuStorageEntry *ds_entry, - ExplainState *es); -extern bool pgstrom_init_dpu_device(void); - -/* - * dpu_scan.c - */ -extern CustomPathMethods dpuscan_path_methods; -extern void pgstrom_init_dpu_scan(void); - -/* - * dpu_join.c - */ -extern bool pgstrom_enable_dpujoin; -extern bool pgstrom_enable_dpuhashjoin; -extern bool pgstrom_enable_dpugistindex; -extern void pgstrom_init_dpu_join(void); - -/* - * dpu_preagg.c - */ -extern void pgstrom_init_dpu_preagg(void); - -/* - * misc.c - */ -extern Node *fixup_varnode_to_origin(Node *node, List *cscan_tlist); -extern int __appendBinaryStringInfo(StringInfo buf, - const void *data, int datalen); -extern int __appendZeroStringInfo(StringInfo 
buf, int nbytes); -extern char *get_type_name(Oid type_oid, bool missing_ok); -extern Oid get_relation_am(Oid rel_oid, bool missing_ok); -extern List *bms_to_pglist(const Bitmapset *bms); -extern Bitmapset *bms_from_pglist(List *pglist); -extern Float *__makeFloat(double fval); -extern Const *__makeByteaConst(bytea *data); -extern bytea *__getByteaConst(Const *con); -extern ssize_t __readFile(int fdesc, void *buffer, size_t nbytes); -extern ssize_t __preadFile(int fdesc, void *buffer, size_t nbytes, off_t f_pos); -extern ssize_t __writeFile(int fdesc, const void *buffer, size_t nbytes); -extern ssize_t __pwriteFile(int fdesc, const void *buffer, size_t nbytes, off_t f_pos); - -extern uint32_t __shmemCreate(const DpuStorageEntry *ds_entry); -extern void __shmemDrop(uint32_t shmem_handle); -extern void *__mmapShmem(uint32_t shmem_handle, - size_t shmem_length, - const DpuStorageEntry *ds_entry); -extern bool __munmapShmem(void *mmap_addr); - -extern Path *pgstrom_copy_pathnode(const Path *pathnode); - -/* - * main.c - */ -extern bool pgstrom_enabled; -extern bool pgstrom_cpu_fallback_enabled; -extern bool pgstrom_regression_test_mode; -extern int pgstrom_max_async_tasks; -extern CustomPath *custom_path_find_cheapest(PlannerInfo *root, - RelOptInfo *rel, - bool parallel_aware, - uint32_t devkind); -extern bool custom_path_remember(PlannerInfo *root, - RelOptInfo *rel, - bool parallel_aware, - uint32_t devkind, - const CustomPath *cpath); -extern Path *pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath); -extern void _PG_init(void); - -#endif /* PG_STROM_H */ diff --git a/next/relscan.c b/next/relscan.c deleted file mode 100644 index 6e49914c5..000000000 --- a/next/relscan.c +++ /dev/null @@ -1,918 +0,0 @@ -/* - * relscan.c - * - * Routines related to outer relation scan - * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the PostgreSQL License. - */ -#include "pg_strom.h" - -/* ---------------------------------------------------------------- - * - * Routines to support optimization / path or plan construction - * - * ---------------------------------------------------------------- - */ -Bitmapset * -pickup_outer_referenced(PlannerInfo *root, - RelOptInfo *base_rel, - Bitmapset *referenced) -{ - ListCell *lc; - int j, k; - - if (base_rel->reloptkind == RELOPT_BASEREL) - { - for (j=base_rel->min_attr; j <= base_rel->max_attr; j++) - { - if (j <= 0 || !base_rel->attr_needed[j - base_rel->min_attr]) - continue; - k = j - FirstLowInvalidHeapAttributeNumber; - referenced = bms_add_member(referenced, k); - } - } - else if (base_rel->reloptkind == RELOPT_OTHER_MEMBER_REL) - { - foreach (lc, root->append_rel_list) - { - AppendRelInfo *apinfo = lfirst(lc); - RelOptInfo *parent_rel; - Bitmapset *parent_refs; - Var *var; - - if (apinfo->child_relid != base_rel->relid) - continue; - Assert(apinfo->parent_relid < root->simple_rel_array_size); - parent_rel = root->simple_rel_array[apinfo->parent_relid]; - parent_refs = pickup_outer_referenced(root, parent_rel, NULL); - - for (k = bms_next_member(parent_refs, -1); - k >= 0; - k = bms_next_member(parent_refs, k)) - { - j = k + FirstLowInvalidHeapAttributeNumber; - if (j <= 0) - bms_add_member(referenced, k); - else if (j > list_length(apinfo->translated_vars)) - elog(ERROR, "Bug? 
column reference out of range"); - else - { - var = list_nth(apinfo->translated_vars, j-1); - Assert(IsA(var, Var)); - j = var->varattno - FirstLowInvalidHeapAttributeNumber; - referenced = bms_add_member(referenced, j); - } - } - break; - } - if (!lc) - elog(ERROR, "Bug? AppendRelInfo not found (relid=%u)", - base_rel->relid); - } - else - { - elog(ERROR, "Bug? outer relation is not a simple relation"); - } - return referenced; -} - -/* ---------------------------------------------------------------- - * - * Routines to setup kern_data_store - * - * ---------------------------------------------------------------- - */ -static int -count_num_of_subfields(Oid type_oid) -{ - TypeCacheEntry *tcache; - int j, count = 0; - - tcache = lookup_type_cache(type_oid, TYPECACHE_TUPDESC); - if (OidIsValid(tcache->typelem) && tcache->typlen == -1) - { - /* array type */ - count = 1 + count_num_of_subfields(tcache->typelem); - } - else if (tcache->tupDesc) - { - /* composite type */ - TupleDesc tupdesc = tcache->tupDesc; - - for (j=0; j < tupdesc->natts; j++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, j); - - count += count_num_of_subfields(attr->atttypid); - } - } - return count; -} - -static void -__setup_kern_colmeta(kern_data_store *kds, - int column_index, - const char *attname, - int attnum, - bool attbyval, - char attalign, - int16 attlen, - Oid atttypid, - int atttypmod, - int *p_attcacheoff) -{ - kern_colmeta *cmeta = &kds->colmeta[column_index]; - TypeCacheEntry *tcache; - - memset(cmeta, 0, sizeof(kern_colmeta)); - cmeta->attbyval = attbyval; - cmeta->attalign = typealign_get_width(attalign); - cmeta->attlen = attlen; - if (attlen == 0 || attlen < -1) - elog(ERROR, "attribute %s has unexpected length (%d)", attname, attlen); - else if (attlen == -1) - kds->has_varlena = true; - cmeta->attnum = attnum; - - if (!p_attcacheoff || *p_attcacheoff < 0) - cmeta->attcacheoff = -1; - else if (attlen > 0) - { - cmeta->attcacheoff = att_align_nominal(*p_attcacheoff, attalign); - *p_attcacheoff = cmeta->attcacheoff + attlen; - } - else if (attlen == -1) - { - /* - * Note that attcacheoff is also available on varlena datum - * only if it appeared at the first, and its offset is aligned. - * Elsewhere, we cannot utilize the attcacheoff for varlena - */ - uint32_t __off = att_align_nominal(*p_attcacheoff, attalign); - - if (*p_attcacheoff == __off) - cmeta->attcacheoff = __off; - else - cmeta->attcacheoff = -1; - *p_attcacheoff = -1; - } - else - { - cmeta->attcacheoff = *p_attcacheoff = -1; - } - cmeta->atttypid = atttypid; - cmeta->atttypmod = atttypmod; - strncpy(cmeta->attname, attname, NAMEDATALEN); - - /* array? composite type? 
*/ - tcache = lookup_type_cache(atttypid, TYPECACHE_TUPDESC); - if (OidIsValid(tcache->typelem) && tcache->typlen == -1) - { - char elem_name[NAMEDATALEN+10]; - int16 elem_len; - bool elem_byval; - char elem_align; - - cmeta->atttypkind = TYPE_KIND__ARRAY; - cmeta->idx_subattrs = kds->nr_colmeta++; - cmeta->num_subattrs = 1; - - snprintf(elem_name, sizeof(elem_name), "__%s", attname); - get_typlenbyvalalign(tcache->typelem, - &elem_len, - &elem_byval, - &elem_align); - __setup_kern_colmeta(kds, - cmeta->idx_subattrs, - elem_name, /* attname */ - 1, /* attnum */ - elem_byval, /* attbyval */ - elem_align, /* attalign */ - elem_len, /* attlen */ - tcache->typelem, /* atttypid */ - -1, /* atttypmod */ - NULL); /* attcacheoff */ - } - else if (tcache->tupDesc) - { - TupleDesc tupdesc = tcache->tupDesc; - int j, attcacheoff = -1; - - cmeta->atttypkind = TYPE_KIND__COMPOSITE; - cmeta->idx_subattrs = kds->nr_colmeta; - cmeta->num_subattrs = tupdesc->natts; - kds->nr_colmeta += tupdesc->natts; - - for (j=0; j < tupdesc->natts; j++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, j); - - __setup_kern_colmeta(kds, - cmeta->idx_subattrs + j, - NameStr(attr->attname), - attr->attnum, - attr->attbyval, - attr->attalign, - attr->attlen, - attr->atttypid, - attr->atttypmod, - &attcacheoff); - } - } - else - { - switch (tcache->typtype) - { - case TYPTYPE_BASE: - cmeta->atttypkind = TYPE_KIND__BASE; - break; - case TYPTYPE_DOMAIN: - cmeta->atttypkind = TYPE_KIND__DOMAIN; - break; - case TYPTYPE_ENUM: - cmeta->atttypkind = TYPE_KIND__ENUM; - break; - case TYPTYPE_PSEUDO: - cmeta->atttypkind = TYPE_KIND__PSEUDO; - break; - case TYPTYPE_RANGE: - cmeta->atttypkind = TYPE_KIND__RANGE; - break; - default: - elog(ERROR, "Unexpected typtype ('%c')", tcache->typtype); - break; - } - } - /* - * for the reverse references to KDS - */ - cmeta->kds_format = kds->format; - cmeta->kds_offset = (char *)cmeta - (char *)kds; -} - -size_t -setup_kern_data_store(kern_data_store *kds, - TupleDesc tupdesc, - size_t length, - char format) -{ - int j, attcacheoff = -1; - - memset(kds, 0, offsetof(kern_data_store, colmeta)); - kds->length = length; - kds->nitems = 0; - kds->usage = 0; - kds->ncols = tupdesc->natts; - kds->format = format; - kds->tdhasoid = false; /* PG12 removed 'oid' system column */ - kds->tdtypeid = tupdesc->tdtypeid; - kds->tdtypmod = tupdesc->tdtypmod; - kds->table_oid = InvalidOid; /* to be set by the caller */ - kds->hash_nslots = 0; /* to be set by the caller, if any */ - kds->nr_colmeta = tupdesc->natts; - - if (format == KDS_FORMAT_ROW || - format == KDS_FORMAT_HASH || - format == KDS_FORMAT_BLOCK) - attcacheoff = 0; - - for (j=0; j < tupdesc->natts; j++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, j); - - __setup_kern_colmeta(kds, j, - NameStr(attr->attname), - attr->attnum, - attr->attbyval, - attr->attalign, - attr->attlen, - attr->atttypid, - attr->atttypmod, - &attcacheoff); - } - /* internal system attribute */ - if (format == KDS_FORMAT_COLUMN) - { - kern_colmeta *cmeta = &kds->colmeta[kds->nr_colmeta++]; - - memset(cmeta, 0, sizeof(kern_colmeta)); - cmeta->attbyval = true; - cmeta->attalign = sizeof(int32_t); - cmeta->attlen = sizeof(GpuCacheSysattr); - cmeta->attnum = -1; - cmeta->attcacheoff = -1; - cmeta->atttypid = InvalidOid; - cmeta->atttypmod = -1; - cmeta->atttypkind = TYPE_KIND__BASE; - strcpy(cmeta->attname, "__gcache_sysattr__"); - } - return MAXALIGN(offsetof(kern_data_store, colmeta[kds->nr_colmeta])); -} - -size_t -estimate_kern_data_store(TupleDesc tupdesc) -{ - int 
j, nr_colmeta = tupdesc->natts;
-
-    for (j=0; j < tupdesc->natts; j++)
-    {
-        Form_pg_attribute attr = TupleDescAttr(tupdesc, j);
-
-        nr_colmeta += count_num_of_subfields(attr->atttypid);
-    }
-    /* internal system attribute if KDS_FORMAT_COLUMN */
-    nr_colmeta++;
-    return MAXALIGN(offsetof(kern_data_store, colmeta[nr_colmeta]));
-}
-
-/*
- * Routines to store/fetch fallback tuples
- */
-void
-pgstromStoreFallbackTuple(pgstromTaskState *pts, HeapTuple htuple)
-{
-    MemoryContext memcxt = pts->css.ss.ps.state->es_query_cxt;
-    kern_tupitem *titem;
-    size_t      sz;
-
-    if (!pts->fallback_tuples)
-    {
-        pts->fallback_index = 0;
-        pts->fallback_nitems = 0;
-        pts->fallback_nrooms = 1000;
-        pts->fallback_tuples =
-            MemoryContextAlloc(memcxt, sizeof(off_t) * pts->fallback_nrooms);
-    }
-    if (!pts->fallback_buffer)
-    {
-        pts->fallback_usage = 0;
-        pts->fallback_bufsz = 8 * BLCKSZ;
-        pts->fallback_buffer =
-            MemoryContextAlloc(memcxt, pts->fallback_bufsz);
-    }
-    sz = MAXALIGN(offsetof(kern_tupitem, htup) + htuple->t_len);
-    while (pts->fallback_usage + sz > pts->fallback_bufsz)
-    {
-        pts->fallback_bufsz = 2 * pts->fallback_bufsz + BLCKSZ;
-        pts->fallback_buffer = repalloc_huge(pts->fallback_buffer,
-                                             pts->fallback_bufsz);
-    }
-    while (pts->fallback_nitems >= pts->fallback_nrooms)
-    {
-        pts->fallback_nrooms = 2 * pts->fallback_nrooms + 100;
-        pts->fallback_tuples = repalloc_huge(pts->fallback_tuples,
-                                             sizeof(off_t) * pts->fallback_nrooms);
-    }
-    titem = (kern_tupitem *)(pts->fallback_buffer +
-                             pts->fallback_usage);
-    titem->t_len = htuple->t_len;
-    titem->rowid = pts->fallback_nitems++;
-    memcpy(&titem->htup, htuple->t_data, htuple->t_len);
-
-    pts->fallback_tuples[titem->rowid] = pts->fallback_usage;
-    pts->fallback_usage += sz;
-}
-
-TupleTableSlot *
-pgstromFetchFallbackTuple(pgstromTaskState *pts)
-{
-    if (pts->fallback_tuples &&
-        pts->fallback_buffer &&
-        pts->fallback_index < pts->fallback_nitems)
-    {
-        TupleTableSlot *slot = pts->css.ss.ss_ScanTupleSlot;
-        HeapTuple   htuple = palloc0(sizeof(HeapTupleData));
-        kern_tupitem *titem;
-
-        titem = (kern_tupitem *)(pts->fallback_buffer +
-                                 pts->fallback_tuples[pts->fallback_index++]);
-        htuple->t_len = titem->t_len;
-        htuple->t_data = &titem->htup;
-        ExecForceStoreHeapTuple(htuple, slot, true);
-        /* reset the buffer if this was the last one */
-        if (pts->fallback_index == pts->fallback_nitems)
-        {
-            pts->fallback_index = 0;
-            pts->fallback_nitems = 0;
-            pts->fallback_usage = 0;
-        }
-        return slot;
-    }
-    return NULL;
-}
-
-/* ----------------------------------------------------------------
- *
- * Routines to load chunks from storage
- *
- * ----------------------------------------------------------------
- */
-#define __XCMD_KDS_SRC_OFFSET(buf)                          \
-    (((XpuCommand *)((buf)->data))->u.task.kds_src_offset)
-#define __XCMD_GET_KDS_SRC(buf)                             \
-    ((kern_data_store *)((buf)->data + __XCMD_KDS_SRC_OFFSET(buf)))
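pgstromStoreFallbackTuple() above appends tuples to a single growable buffer and records per-tuple offsets rather than pointers, so repalloc_huge() can move the buffer without invalidating any stored entry. A self-contained sketch of that offset-indexed append buffer in plain C; malloc/realloc stand in for the memory-context allocators, and all names are illustrative:

#include <stdlib.h>
#include <string.h>

typedef struct
{
    char   *buf;        /* one growable byte buffer */
    size_t  bufsz;
    size_t  usage;
    size_t *offsets;    /* per-item offset into buf; survives realloc */
    size_t  nitems;
    size_t  nrooms;
} append_buf;

/* append one item; returns its index, or -1 on allocation failure */
static long
append_item(append_buf *ab, const void *data, size_t len)
{
    while (ab->usage + len > ab->bufsz)
    {
        size_t  newsz = 2 * ab->bufsz + 8192;   /* geometric growth */
        char   *newbuf = realloc(ab->buf, newsz);

        if (!newbuf)
            return -1;
        ab->buf = newbuf;                       /* offsets remain valid */
        ab->bufsz = newsz;
    }
    if (ab->nitems >= ab->nrooms)
    {
        size_t  newrooms = 2 * ab->nrooms + 100;
        size_t *newoffs = realloc(ab->offsets, sizeof(size_t) * newrooms);

        if (!newoffs)
            return -1;
        ab->offsets = newoffs;
        ab->nrooms = newrooms;
    }
    memcpy(ab->buf + ab->usage, data, len);
    ab->offsets[ab->nitems] = ab->usage;
    ab->usage += len;
    return (long)ab->nitems++;
}

A zero-initialized append_buf works as-is: the first append grows both the buffer and the offset array on demand, just as the PG-Strom routine lazily allocates fallback_buffer and fallback_tuples.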
BufferGetPage(buffer);
-	lines = PageGetMaxOffsetNumber(page);
-	for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(page, lineoff);
-		 lineoff <= lines;
-		 lineoff++, lpp++)
-	{
-		HeapTupleData htup;
-		bool		valid;
-
-		if (!ItemIdIsNormal(lpp))
-			continue;
-
-		htup.t_tableOid = RelationGetRelid(relation);
-		htup.t_data = (HeapTupleHeader) PageGetItem((Page)page, lpp);
-		htup.t_len = ItemIdGetLength(lpp);
-		ItemPointerSet(&htup.t_self, block_num, lineoff);
-
-		valid = HeapTupleSatisfiesVisibility(&htup, snapshot, buffer);
-		HeapCheckForSerializableConflictOut(valid, relation, &htup,
-											buffer, snapshot);
-		if (valid)
-			pts->cb_cpu_fallback(pts, kds, &htup);
-	}
-	UnlockReleaseBuffer(buffer);
-	pg_atomic_fetch_add_u32(&ps_state->heap_fallback_nblocks, 1);
-}
-
-static void
-__relScanDirectCachedBlock(pgstromTaskState *pts, BlockNumber block_num)
-{
-	Relation	relation = pts->css.ss.ss_currentRelation;
-	HeapScanDesc h_scan = (HeapScanDesc)pts->css.ss.ss_currentScanDesc;
-	Snapshot	snapshot = pts->css.ss.ps.state->es_snapshot;
-	kern_data_store *kds;
-	Buffer		buffer;
-	Page		spage;
-	Page		dpage;
-	bool		has_valid_tuples = false;
-
-	/*
-	 * Load the source buffer with synchronous read
-	 */
-	buffer = ReadBufferExtended(relation,
-								MAIN_FORKNUM,
-								block_num,
-								RBM_NORMAL,
-								h_scan->rs_strategy);
-	/* prune the old items, if any */
-	heap_page_prune_opt(relation, buffer);
-	/* then check visibility for each tuple */
-	LockBuffer(buffer, BUFFER_LOCK_SHARE);
-	spage = (Page) BufferGetPage(buffer);
-	appendBinaryStringInfo(&pts->xcmd_buf, (const char *)spage, BLCKSZ);
-	UnlockReleaseBuffer(buffer);
-	kds = __XCMD_GET_KDS_SRC(&pts->xcmd_buf);
-	dpage = (Page) KDS_BLOCK_PGPAGE(kds, kds->block_nloaded);
-	Assert(dpage >= pts->xcmd_buf.data &&
-		   dpage + BLCKSZ <= pts->xcmd_buf.data + pts->xcmd_buf.len);
-	KDS_BLOCK_BLCKNR(kds, kds->block_nloaded) = block_num;
-
-	/*
-	 * Logic is almost equivalent to what heapgetpage() does.
-	 * We have to invalidate the tuples prior to GPU kernel
-	 * execution, if the page is not all-visible.
-	 */
-	if (!PageIsAllVisible(dpage) || snapshot->takenDuringRecovery)
-	{
-		int			lines = PageGetMaxOffsetNumber(dpage);
-		ItemId		lpp;
-		OffsetNumber lineoff;
-
-		for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dpage, lineoff);
-			 lineoff <= lines;
-			 lineoff++, lpp++)
-		{
-			HeapTupleData htup;
-			bool		valid;
-
-			if (!ItemIdIsNormal(lpp))
-				continue;
-			htup.t_tableOid = RelationGetRelid(relation);
-			htup.t_data = (HeapTupleHeader) PageGetItem((Page) dpage, lpp);
-			Assert((((uintptr_t)htup.t_data - (uintptr_t)dpage) & 7) == 0);
-			htup.t_len = ItemIdGetLength(lpp);
-			ItemPointerSet(&htup.t_self, block_num, lineoff);
-
-			valid = HeapTupleSatisfiesVisibility(&htup, snapshot, buffer);
-			HeapCheckForSerializableConflictOut(valid, relation, &htup,
-												buffer, snapshot);
-			if (valid)
-				has_valid_tuples = true;
-			else
-				ItemIdSetUnused(lpp);
-		}
-	}
-	else
-	{
-		has_valid_tuples = true;
-	}
-
-	/*
-	 * If no tuples in this block are visible, there is no need to load
-	 * it onto the xPU device (that would be just a waste of memory and
-	 * bandwidth), so the block is reverted from the xcmd-buffer.
-	 */
-	if (!has_valid_tuples)
-	{
-		pts->xcmd_buf.len -= BLCKSZ;
-		return;
-	}
-	/* dpage became all-visible also */
-	PageSetAllVisible(dpage);
-	kds->nitems++;
-	kds->block_nloaded++;
-}
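The chunk builder below splits each PGSTROM_CHUNK_SIZE buffer into the KDS head, an array of BlockNumbers, and the page images themselves, so each loaded block pays for one BlockNumber slot plus one BLCKSZ page. A sketch of that arithmetic under assumed constants (the chunk size and head length are placeholders for PGSTROM_CHUNK_SIZE and KDS_HEAD_LENGTH(kds); BlockNumber is a 32-bit value):

#include <stdint.h>
#include <stdio.h>

#define BLCKSZ              8192
#define PGSTROM_CHUNK_SIZE  (64UL << 20)   /* assumption: 64MB chunk */

int main(void)
{
    size_t head_len = 1024;    /* placeholder for KDS_HEAD_LENGTH(kds) */
    /* each room consumes one BlockNumber slot plus one page image */
    size_t nrooms = (PGSTROM_CHUNK_SIZE - head_len)
                  / (sizeof(uint32_t) + BLCKSZ);
    /* page images start after the MAXALIGN'ed BlockNumber array */
    size_t block_offset = head_len + ((sizeof(uint32_t) * nrooms + 7) & ~7UL);

    printf("rooms=%zu, block_offset=%zu\n", nrooms, block_offset);
    return 0;
}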
-
-XpuCommand *
-pgstromRelScanChunkDirect(pgstromTaskState *pts,
-						  struct iovec *xcmd_iov, int *xcmd_iovcnt)
-{
-	pgstromSharedState *ps_state = pts->ps_state;
-	Relation	relation = pts->css.ss.ss_currentRelation;
-	HeapScanDesc h_scan = (HeapScanDesc)pts->css.ss.ss_currentScanDesc;
-	/* NOTE: 'smgr_rnode' always locates on the head of SMgrRelationData */
-	RelFileNodeBackend *smgr_rnode = (RelFileNodeBackend *)RelationGetSmgr(relation);
-	XpuCommand *xcmd;
-	kern_data_store *kds;
-	unsigned long m_offset = 0UL;
-	BlockNumber	segment_id = InvalidBlockNumber;
-	strom_io_vector *strom_iovec;
-	strom_io_chunk *strom_ioc = NULL;
-	BlockNumber *strom_blknums;
-	uint32_t	strom_nblocks = 0;
-	uint32_t	kds_src_pathname = 0;
-	uint32_t	kds_src_iovec = 0;
-	uint32_t	kds_nrooms;
-
-	kds = __XCMD_GET_KDS_SRC(&pts->xcmd_buf);
-	kds_nrooms = (PGSTROM_CHUNK_SIZE -
-				  KDS_HEAD_LENGTH(kds)) / (sizeof(BlockNumber) + BLCKSZ);
-	kds->nitems = 0;
-	kds->usage = 0;
-	kds->block_offset = (KDS_HEAD_LENGTH(kds) +
-						 MAXALIGN(sizeof(BlockNumber) * kds_nrooms));
-	kds->block_nloaded = 0;
-	pts->xcmd_buf.len = __XCMD_KDS_SRC_OFFSET(&pts->xcmd_buf) + kds->block_offset;
-	Assert(pts->xcmd_buf.len == MAXALIGN(pts->xcmd_buf.len));
-	enlargeStringInfo(&pts->xcmd_buf, 0);
-	kds = __XCMD_GET_KDS_SRC(&pts->xcmd_buf);
-
-	strom_iovec = alloca(offsetof(strom_io_vector, ioc[kds_nrooms]));
-	strom_iovec->nr_chunks = 0;
-	strom_blknums = alloca(sizeof(BlockNumber) * kds_nrooms);
-	strom_nblocks = 0;
-	while (!pts->scan_done)
-	{
-		while (pts->curr_block_num < pts->curr_block_tail &&
-			   kds->nitems < kds_nrooms)
-		{
-			BlockNumber	block_num
-				= (pts->curr_block_num + h_scan->rs_startblock) % h_scan->rs_nblocks;
-			/*
-			 * MEMO: Usually, the CPU is (much) more powerful than DPUs.
-			 * If the source page is already cached on the shared buffer,
-			 * it makes no sense to handle it on the DPU device.
-			 */
-			if (pts->ds_entry && !pgstrom_dpu_handle_cached_pages)
-			{
-				BufferTag	bufTag;
-				uint32		bufHash;
-				LWLock	   *bufLock;
-				int			buf_id;
-
-				INIT_BUFFERTAG(bufTag, smgr_rnode->node, MAIN_FORKNUM, block_num);
-				bufHash = BufTableHashCode(&bufTag);
-				bufLock = BufMappingPartitionLock(bufHash);
-
-				/* check whether the block exists on the shared buffer */
-				LWLockAcquire(bufLock, LW_SHARED);
-				buf_id = BufTableLookup(&bufTag, bufHash);
-				if (buf_id >= 0)
-				{
-					LWLockRelease(bufLock);
-					__relScanDirectFallbackBlock(pts, kds, block_num);
-					pts->curr_block_num++;
-					continue;
-				}
-				LWLockRelease(bufLock);
-			}
-
-			/*
-			 * MEMO: Right now, we allow GPU Direct SQL only for the
-			 * all-visible pages, due to the restrictions of MVCC checks.
-			 * This is stricter than necessary, though: if all the tuples
-			 * in a page carried correct HEAP_XMIN_* / HEAP_XMAX_* hint
-			 * bits, the MVCC logic could run in the device code without
-			 * consulting the commit log.
-			 */
-			if (VM_ALL_VISIBLE(relation, block_num, &pts->curr_vm_buffer))
-			{
-				/*
-				 * We don't allow xPU Direct SQL to cross multiple heap
-				 * segments (for code simplification). So, once the scan
-				 * reaches a segment boundary, it breaks out and restarts
-				 * with a new KDS buffer.
- */ - unsigned int fchunk_id; - - if (segment_id == InvalidBlockNumber) - segment_id = block_num / RELSEG_SIZE; - else if (segment_id != block_num / RELSEG_SIZE) - goto out; - - fchunk_id = (block_num % RELSEG_SIZE) * PAGES_PER_BLOCK; - if (strom_ioc != NULL && (strom_ioc->fchunk_id + - strom_ioc->nr_pages) == fchunk_id) - { - /* expand the iovec entry */ - strom_ioc->nr_pages += PAGES_PER_BLOCK; - } - else - { - /* add the next iovec entry */ - strom_ioc = &strom_iovec->ioc[strom_iovec->nr_chunks++]; - strom_ioc->m_offset = m_offset; - strom_ioc->fchunk_id = fchunk_id; - strom_ioc->nr_pages = PAGES_PER_BLOCK; - } - kds->nitems++; - strom_blknums[strom_nblocks++] = block_num; - m_offset += BLCKSZ; - } - else if (pts->ds_entry) - { - /* - * For DPU devices, it makes no sense to move the data blocks - * to the (relatively) poor performance devices instead of CPUs. - * So, we run CPU fallback for the tuples in dirty pages. - */ - __relScanDirectFallbackBlock(pts, kds, block_num); - } - else - { - __relScanDirectCachedBlock(pts, block_num); - } - pts->curr_block_num++; - } - - if (kds->nitems >= kds_nrooms) - { - /* ok, we cannot load more pages in this chunk */ - break; - } - else if (pts->br_state) - { - if (!pgstromBrinIndexNextChunk(pts)) - pts->scan_done = true; - } - else if (!h_scan->rs_base.rs_parallel) - { - /* single process scan */ - BlockNumber num_blocks = kds_nrooms - kds->nitems; - - if (!h_scan->rs_inited) - { - h_scan->rs_cblock = 0; - h_scan->rs_inited = true; - } - pts->curr_block_num = h_scan->rs_cblock; - if (pts->curr_block_num >= h_scan->rs_nblocks) - pts->scan_done = true; - else if (pts->curr_block_num + num_blocks > h_scan->rs_nblocks) - num_blocks = h_scan->rs_nblocks - pts->curr_block_num; - h_scan->rs_cblock += num_blocks; - pts->curr_block_tail = pts->curr_block_num + num_blocks; - } - else - { - /* parallel processes scan */ - ParallelBlockTableScanDesc pb_scan = - (ParallelBlockTableScanDesc)h_scan->rs_base.rs_parallel; - BlockNumber num_blocks = kds_nrooms - kds->nitems; - - if (!h_scan->rs_inited) - { - /* see table_block_parallelscan_startblock_init */ - BlockNumber start_block = InvalidBlockNumber; - - retry_parallel_init: - SpinLockAcquire(&pb_scan->phs_mutex); - if (pb_scan->phs_startblock == InvalidBlockNumber) - { - if (!pb_scan->base.phs_syncscan) - pb_scan->phs_startblock = 0; - else if (start_block != InvalidBlockNumber) - pb_scan->phs_startblock = start_block; - else - { - SpinLockRelease(&pb_scan->phs_mutex); - start_block = ss_get_location(relation, pb_scan->phs_nblocks); - goto retry_parallel_init; - } - } - h_scan->rs_nblocks = pb_scan->phs_nblocks; - h_scan->rs_startblock = pb_scan->phs_startblock; - SpinLockRelease(&pb_scan->phs_mutex); - h_scan->rs_inited = true; - } - pts->curr_block_num = pg_atomic_fetch_add_u64(&pb_scan->phs_nallocated, - num_blocks); - if (pts->curr_block_num >= h_scan->rs_nblocks) - pts->scan_done = true; - else if (pts->curr_block_num + num_blocks > h_scan->rs_nblocks) - num_blocks = h_scan->rs_nblocks - pts->curr_block_num; - pts->curr_block_tail = pts->curr_block_num + num_blocks; - } - } -out: - Assert(kds->nitems == kds->block_nloaded + strom_nblocks); - pg_atomic_fetch_add_u32(&ps_state->heap_normal_nblocks, kds->block_nloaded); - pg_atomic_fetch_add_u32(&ps_state->heap_direct_nblocks, strom_nblocks); - kds->length = kds->block_offset + BLCKSZ * kds->nitems; - if (kds->nitems == 0) - return NULL; - if (strom_iovec->nr_chunks > 0) - { - size_t sz; - - kds_src_pathname = pts->xcmd_buf.len; - 
appendStringInfoString(&pts->xcmd_buf, pts->kds_pathname); - if (segment_id > 0) - appendStringInfo(&pts->xcmd_buf, ".%u", segment_id); - appendStringInfoChar(&pts->xcmd_buf, '\0'); - - sz = offsetof(strom_io_vector, ioc[strom_iovec->nr_chunks]); - kds_src_iovec = __appendBinaryStringInfo(&pts->xcmd_buf, - (const char *)strom_iovec, sz); - } - else - { - Assert(segment_id == InvalidBlockNumber); - } - xcmd = (XpuCommand *)pts->xcmd_buf.data; - xcmd->u.task.kds_src_pathname = kds_src_pathname; - xcmd->u.task.kds_src_iovec = kds_src_iovec; - xcmd->length = pts->xcmd_buf.len; - - xcmd_iov[0].iov_base = xcmd; - xcmd_iov[0].iov_len = xcmd->length; - *xcmd_iovcnt = 1; - - return xcmd; -} - -static bool -__kds_row_insert_tuple(kern_data_store *kds, TupleTableSlot *slot) -{ - uint32_t *rowindex = KDS_GET_ROWINDEX(kds); - HeapTuple tuple; - size_t sz, __usage; - bool should_free; - kern_tupitem *titem; - - Assert(kds->format == KDS_FORMAT_ROW && kds->hash_nslots == 0); - tuple = ExecFetchSlotHeapTuple(slot, false, &should_free); - - __usage = (__kds_unpack(kds->usage) + - MAXALIGN(offsetof(kern_tupitem, htup) + tuple->t_len)); - sz = KDS_HEAD_LENGTH(kds) + sizeof(uint32_t) * (kds->nitems + 1) + __usage; - if (sz > kds->length) - return false; /* no more items! */ - titem = (kern_tupitem *)((char *)kds + kds->length - __usage); - titem->t_len = tuple->t_len; - titem->rowid = kds->nitems; - memcpy(&titem->htup, tuple->t_data, tuple->t_len); - kds->usage = rowindex[kds->nitems++] = __kds_packed(__usage); - - if (should_free) - heap_freetuple(tuple); - ExecClearTuple(slot); - - return true; -} - -XpuCommand * -pgstromRelScanChunkNormal(pgstromTaskState *pts, - struct iovec *xcmd_iov, int *xcmd_iovcnt) -{ - EState *estate = pts->css.ss.ps.state; - TableScanDesc scan = pts->css.ss.ss_currentScanDesc; - TupleTableSlot *slot = pts->base_slot; - kern_data_store *kds; - XpuCommand *xcmd; - size_t sz1, sz2; - - pts->xcmd_buf.len = __XCMD_KDS_SRC_OFFSET(&pts->xcmd_buf) + PGSTROM_CHUNK_SIZE; - enlargeStringInfo(&pts->xcmd_buf, 0); - kds = __XCMD_GET_KDS_SRC(&pts->xcmd_buf); - kds->nitems = 0; - kds->usage = 0; - kds->length = PGSTROM_CHUNK_SIZE; - - if (pts->br_state) - { - /* scan by BRIN index */ - while (!pts->scan_done) - { - if (!pts->curr_tbm) - { - TBMIterateResult *next_tbm = pgstromBrinIndexNextBlock(pts); - - if (!next_tbm) - { - pts->scan_done = true; - break; - } - if (!table_scan_bitmap_next_block(scan, next_tbm)) - elog(ERROR, "failed on table_scan_bitmap_next_block"); - pts->curr_tbm = next_tbm; - } - if (!TTS_EMPTY(slot) && - !__kds_row_insert_tuple(kds, slot)) - break; - if (!table_scan_bitmap_next_tuple(scan, pts->curr_tbm, slot)) - pts->curr_tbm = NULL; - else if (!__kds_row_insert_tuple(kds, slot)) - break; - } - } - else - { - /* full table scan */ - while (!pts->scan_done) - { - if (!TTS_EMPTY(slot) && - !__kds_row_insert_tuple(kds, slot)) - break; - if (!table_scan_getnextslot(scan, estate->es_direction, slot)) - { - pts->scan_done = true; - break; - } - if (!__kds_row_insert_tuple(kds, slot)) - break; - } - } - - if (kds->nitems == 0) - return NULL; - - /* setup iovec that may skip the hole between row-index and tuples-buffer */ - sz1 = ((KDS_BODY_ADDR(kds) - pts->xcmd_buf.data) + - MAXALIGN(sizeof(uint32_t) * kds->nitems)); - sz2 = __kds_unpack(kds->usage); - Assert(sz1 + sz2 <= pts->xcmd_buf.len); - kds->length = (KDS_HEAD_LENGTH(kds) + - MAXALIGN(sizeof(uint32_t) * kds->nitems) + sz2); - xcmd = (XpuCommand *)pts->xcmd_buf.data; - xcmd->length = sz1 + sz2; - xcmd_iov[0].iov_base = 
xcmd; - xcmd_iov[0].iov_len = sz1; - xcmd_iov[1].iov_base = (pts->xcmd_buf.data + pts->xcmd_buf.len - sz2); - xcmd_iov[1].iov_len = sz2; - *xcmd_iovcnt = 2; - - return xcmd; -} - -void -pgstrom_init_relscan(void) -{ - /* nothing to do */ -} diff --git a/Makefile b/old/Makefile similarity index 95% rename from Makefile rename to old/Makefile index 5c4525c3b..58f5e4bc5 100644 --- a/Makefile +++ b/old/Makefile @@ -39,7 +39,7 @@ __STROM_OBJS = main.o nvrtc.o extra.o \ gpuscan.o gpujoin.o gpupreagg.o \ arrow_fdw.o arrow_nodes.o arrow_write.o arrow_pgsql.o \ aggfuncs.o float2.o tinyint.o misc.o -STROM_OBJS = $(addprefix $(STROM_BUILD_ROOT)/src/, $(__STROM_OBJS)) +STROM_OBJS = $(addprefix $(STROM_BUILD_ROOT)/, $(__STROM_OBJS)) # # Source file of GPU portion @@ -49,26 +49,25 @@ __GPU_FATBIN := cuda_common cuda_numeric cuda_primitive \ cuda_jsonlib cuda_rangetype cuda_postgis \ cuda_gpuscan cuda_gpujoin cuda_gpupreagg cuda_gpusort __GPU_HEADERS := $(__GPU_FATBIN) cuda_utils cuda_basetype cuda_gcache arrow_defs -GPU_HEADERS := $(addprefix $(STROM_BUILD_ROOT)/src/, \ +GPU_HEADERS := $(addprefix $(STROM_BUILD_ROOT)/, \ $(addsuffix .h, $(__GPU_HEADERS))) -GPU_FATBIN := $(addprefix $(STROM_BUILD_ROOT)/src/, \ +GPU_FATBIN := $(addprefix $(STROM_BUILD_ROOT)/, \ $(addsuffix .fatbin, $(__GPU_FATBIN))) GPU_DEBUG_FATBIN := $(GPU_FATBIN:.fatbin=.gfatbin) -GPU_CACHE_FATBIN := $(STROM_BUILD_ROOT)/src/cuda_gcache.fatbin -GPU_CACHE_DEBUG_FATBIN := $(STROM_BUILD_ROOT)/src/cuda_gcache.gfatbin +GPU_CACHE_FATBIN := $(STROM_BUILD_ROOT)/cuda_gcache.fatbin +GPU_CACHE_DEBUG_FATBIN := $(STROM_BUILD_ROOT)/cuda_gcache.gfatbin # # Source file of utilities # -__STROM_UTILS = gpuinfo dbgen-ssbm -STROM_UTILS = $(addprefix $(STROM_BUILD_ROOT)/utils/, $(__STROM_UTILS)) +__STROM_UTILS = gpuinfo +STROM_UTILS = $(addprefix $(STROM_BUILD_ROOT)/, $(__STROM_UTILS)) -GPUINFO := $(STROM_BUILD_ROOT)/utils/gpuinfo -GPUINFO_SOURCE := $(STROM_BUILD_ROOT)/utils/gpuinfo.c +GPUINFO := $(STROM_BUILD_ROOT)/gpuinfo +GPUINFO_SOURCE := $(STROM_BUILD_ROOT)/gpuinfo.c GPUINFO_DEPEND := $(GPUINFO_SOURCE) GPUINFO_CFLAGS = $(PGSTROM_FLAGS) -I $(CUDA_IPATH) -L $(CUDA_LPATH) \ - -I $(STROM_BUILD_ROOT)/src \ - -I $(STROM_BUILD_ROOT)/utils \ + -I $(STROM_BUILD_ROOT) \ $(shell $(PG_CONFIG) --ldflags) SSBM_DBGEN = $(STROM_BUILD_ROOT)/utils/dbgen-ssbm diff --git a/Makefile.cuda b/old/Makefile.cuda similarity index 100% rename from Makefile.cuda rename to old/Makefile.cuda diff --git a/old/aggfuncs.c b/old/aggfuncs.c new file mode 100644 index 000000000..7cc812141 --- /dev/null +++ b/old/aggfuncs.c @@ -0,0 +1,1407 @@ +/* + * aggfuncs.c + * + * Definition of self-defined aggregate functions, used by GpuPreAgg + * ---- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. 
+ */ +#include "pg_strom.h" +#include "cuda_numeric.h" + +/* + * declarations + */ +PG_FUNCTION_INFO_V1(pgstrom_partial_nrows); +PG_FUNCTION_INFO_V1(pgstrom_partial_avg_int8); +PG_FUNCTION_INFO_V1(pgstrom_partial_avg_float8); +PG_FUNCTION_INFO_V1(pgstrom_final_avg_int8_accum); +PG_FUNCTION_INFO_V1(pgstrom_final_avg_int8_final); +PG_FUNCTION_INFO_V1(pgstrom_final_avg_float8_accum); +PG_FUNCTION_INFO_V1(pgstrom_final_avg_float8_final); +PG_FUNCTION_INFO_V1(pgstrom_final_avg_numeric_final); +PG_FUNCTION_INFO_V1(pgstrom_partial_min_any); +PG_FUNCTION_INFO_V1(pgstrom_partial_max_any); +PG_FUNCTION_INFO_V1(pgstrom_partial_sum_any); +PG_FUNCTION_INFO_V1(pgstrom_partial_sum_x2_float4); +PG_FUNCTION_INFO_V1(pgstrom_partial_sum_x2_float8); +PG_FUNCTION_INFO_V1(pgstrom_partial_sum_x2_numeric); +PG_FUNCTION_INFO_V1(pgstrom_partial_cov_x); +PG_FUNCTION_INFO_V1(pgstrom_partial_cov_y); +PG_FUNCTION_INFO_V1(pgstrom_partial_cov_x2); +PG_FUNCTION_INFO_V1(pgstrom_partial_cov_y2); +PG_FUNCTION_INFO_V1(pgstrom_partial_cov_xy); +PG_FUNCTION_INFO_V1(pgstrom_partial_variance_float8); +PG_FUNCTION_INFO_V1(pgstrom_partial_covariance_float8); +PG_FUNCTION_INFO_V1(pgstrom_float8_combine); +PG_FUNCTION_INFO_V1(pgstrom_float8_stddev_samp); +PG_FUNCTION_INFO_V1(pgstrom_float8_stddev_pop); +PG_FUNCTION_INFO_V1(pgstrom_float8_stddev_samp_numeric); +PG_FUNCTION_INFO_V1(pgstrom_float8_stddev_pop_numeric); +PG_FUNCTION_INFO_V1(pgstrom_float8_var_samp); +PG_FUNCTION_INFO_V1(pgstrom_float8_var_pop); +PG_FUNCTION_INFO_V1(pgstrom_float8_var_samp_numeric); +PG_FUNCTION_INFO_V1(pgstrom_float8_var_pop_numeric); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_combine); +PG_FUNCTION_INFO_V1(pgstrom_float8_corr); +PG_FUNCTION_INFO_V1(pgstrom_float8_covar_pop); +PG_FUNCTION_INFO_V1(pgstrom_float8_covar_samp); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_avgx); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_avgy); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_intercept); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_r2); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_slope); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_sxx); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_sxy); +PG_FUNCTION_INFO_V1(pgstrom_float8_regr_syy); +PG_FUNCTION_INFO_V1(pgstrom_hll_sketch_new); +PG_FUNCTION_INFO_V1(pgstrom_hll_sketch_merge); +PG_FUNCTION_INFO_V1(pgstrom_hll_count_final); +PG_FUNCTION_INFO_V1(pgstrom_hll_sketch_histogram); + +/* utility to reference numeric[] */ +static inline Datum +numeric_array_ref(ArrayType *array, int index, bool *p_isnull) +{ + return array_ref(array, 1, &index, -1, -1, false, 'i', p_isnull); +} + +Datum +pgstrom_partial_nrows(PG_FUNCTION_ARGS) +{ + int i; + + for (i=0; i < PG_NARGS(); i++) + { + if (PG_ARGISNULL(i) || !PG_GETARG_BOOL(i)) + PG_RETURN_INT64(0); + } + PG_RETURN_INT64(1); +} + +Datum +pgstrom_partial_avg_int8(PG_FUNCTION_ARGS) +{ + ArrayType *result; + Datum items[2]; + + items[0] = PG_GETARG_DATUM(0); /* nrows(int8) */ + items[1] = PG_GETARG_DATUM(1); /* p_sum(int8) */ + result = construct_array(items, 2, INT8OID, + sizeof(int64), FLOAT8PASSBYVAL, 'd'); + PG_RETURN_ARRAYTYPE_P(result); +} + +Datum +pgstrom_partial_avg_float8(PG_FUNCTION_ARGS) +{ + int64 nrows = PG_GETARG_INT64(0); + ArrayType *result; + Datum items[2]; + + items[0] = Float8GetDatum((float8)nrows); + items[1] = PG_GETARG_DATUM(1); /* p_sum(float8) */ + result = construct_array(items, 2, FLOAT8OID, + sizeof(float8), FLOAT8PASSBYVAL, 'd'); + PG_RETURN_ARRAYTYPE_P(result); +} + +Datum +pgstrom_final_avg_int8_accum(PG_FUNCTION_ARGS) +{ + MemoryContext aggcxt; + MemoryContext oldcxt; + 
ArrayType *xarray; + ArrayType *yarray; + int64 *x, *y; + + if (!AggCheckCallContext(fcinfo, &aggcxt)) + elog(ERROR, "aggregate function called in non-aggregate context"); + if (PG_ARGISNULL(1)) + elog(ERROR, "Null state was supplied"); + + if (PG_ARGISNULL(0)) + { + oldcxt = MemoryContextSwitchTo(aggcxt); + xarray = PG_GETARG_ARRAYTYPE_P_COPY(1); + MemoryContextSwitchTo(oldcxt); + } + else + { + xarray = PG_GETARG_ARRAYTYPE_P(0); + yarray = PG_GETARG_ARRAYTYPE_P(1); + x = (int64 *)ARR_DATA_PTR(xarray); + y = (int64 *)ARR_DATA_PTR(yarray); + + x[0] += y[0]; + x[1] += y[1]; + } + PG_RETURN_POINTER(xarray); +} + +Datum +pgstrom_final_avg_int8_final(PG_FUNCTION_ARGS) +{ + ArrayType *xarray = PG_GETARG_ARRAYTYPE_P(0); + int64 *x = (int64 *)ARR_DATA_PTR(xarray); + + return DirectFunctionCall2(numeric_div, + DirectFunctionCall1(int8_numeric, + Int64GetDatum(x[1])), + DirectFunctionCall1(int8_numeric, + Int64GetDatum(x[0]))); +} + +Datum +pgstrom_final_avg_float8_accum(PG_FUNCTION_ARGS) +{ + MemoryContext aggcxt; + MemoryContext oldcxt; + ArrayType *xarray; + ArrayType *yarray; + float8 *x, *y; + + if (!AggCheckCallContext(fcinfo, &aggcxt)) + elog(ERROR, "aggregate function called in non-aggregate context"); + if (PG_ARGISNULL(1)) + elog(ERROR, "Null state was supplied"); + + if (PG_ARGISNULL(0)) + { + oldcxt = MemoryContextSwitchTo(aggcxt); + xarray = PG_GETARG_ARRAYTYPE_P_COPY(1); + MemoryContextSwitchTo(oldcxt); + } + else + { + xarray = PG_GETARG_ARRAYTYPE_P(0); + yarray = PG_GETARG_ARRAYTYPE_P(1); + x = (float8 *)ARR_DATA_PTR(xarray); + y = (float8 *)ARR_DATA_PTR(yarray); + + x[0] += y[0]; + x[1] += y[1]; + } + PG_RETURN_POINTER(xarray); +} + +Datum +pgstrom_final_avg_float8_final(PG_FUNCTION_ARGS) +{ + ArrayType *xarray = PG_GETARG_ARRAYTYPE_P(0); + float8 *x = (float8 *)ARR_DATA_PTR(xarray); + + PG_RETURN_FLOAT8(x[1] / x[0]); +} + +Datum +pgstrom_final_avg_numeric_final(PG_FUNCTION_ARGS) +{ + ArrayType *xarray = PG_GETARG_ARRAYTYPE_P(0); + float8 *x = (float8 *)ARR_DATA_PTR(xarray); + Datum nrows, sum; + + nrows = DirectFunctionCall1(float8_numeric, Float8GetDatum(x[0])); + sum = DirectFunctionCall1(float8_numeric, Float8GetDatum(x[1])); + + return DirectFunctionCall2(numeric_div, sum, nrows); +} + +/* + * pgstrom.pmin(anyelement) + */ +Datum +pgstrom_partial_min_any(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); +} + +/* + * pgstrom.pmax(anyelement) + */ +Datum +pgstrom_partial_max_any(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); +} + +/* + * pgstrom.psum(anyelement) + */ +Datum +pgstrom_partial_sum_any(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); +} + +/* + * pgstrom.psum_x2(float4) + */ +Datum +pgstrom_partial_sum_x2_float4(PG_FUNCTION_ARGS) +{ + float4 value = (PG_ARGISNULL(0) ? 0.0 : PG_GETARG_FLOAT4(0)); + + PG_RETURN_FLOAT4(value * value); +} + +/* + * pgstrom.psum_x2(float8) + */ +Datum +pgstrom_partial_sum_x2_float8(PG_FUNCTION_ARGS) +{ + float8 value = (PG_ARGISNULL(0) ? 
0.0 : PG_GETARG_FLOAT8(0)); + + PG_RETURN_FLOAT8(value * value); +} + +/* + * pgstrom.psum_x2(numeric) + */ +Datum +pgstrom_partial_sum_x2_numeric(PG_FUNCTION_ARGS) +{ + Datum value; + + if (!PG_ARGISNULL(0)) + value = PG_GETARG_DATUM(0); /* a valid numeric value */ + else + value = DirectFunctionCall3(numeric_in, + CStringGetDatum("0"), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1)); + return DirectFunctionCall2(numeric_mul, value, value); +} + +/* + * pgstrom.pcov_x(float8) + */ +Datum +pgstrom_partial_cov_x(PG_FUNCTION_ARGS) +{ + if (!PG_GETARG_BOOL(0)) + PG_RETURN_NULL(); + PG_RETURN_DATUM(PG_GETARG_DATUM(1)); +} + +/* + * pgstrom.pcov_y(float8) + */ +Datum +pgstrom_partial_cov_y(PG_FUNCTION_ARGS) +{ + if (!PG_GETARG_BOOL(0)) + PG_RETURN_NULL(); + PG_RETURN_DATUM(PG_GETARG_DATUM(2)); +} + +/* + * pgstrom.pcov_x2(float8) + */ +Datum +pgstrom_partial_cov_x2(PG_FUNCTION_ARGS) +{ + float8 value = PG_GETARG_FLOAT8(1); + + if (!PG_GETARG_BOOL(0)) + PG_RETURN_NULL(); + PG_RETURN_FLOAT8(value * value); +} + +/* + * pgstrom.pcov_y2(float8) + */ +Datum +pgstrom_partial_cov_y2(PG_FUNCTION_ARGS) +{ + float8 value = PG_GETARG_FLOAT8(2); + + if (!PG_GETARG_BOOL(0)) + PG_RETURN_NULL(); + PG_RETURN_FLOAT8(value * value); +} + +/* + * pgstrom.pcov_xy(float8) + */ +Datum +pgstrom_partial_cov_xy(PG_FUNCTION_ARGS) +{ + float8 x_value = PG_GETARG_FLOAT8(1); + float8 y_value = PG_GETARG_FLOAT8(2); + + if (!PG_GETARG_BOOL(0)) + PG_RETURN_NULL(); + PG_RETURN_FLOAT8(x_value * y_value); +} + +/* + * pgstrom_partial_variance_float8 + */ +Datum +pgstrom_partial_variance_float8(PG_FUNCTION_ARGS) +{ + ArrayType *state; + Datum items[3]; + + items[0] = Float8GetDatum((double)PG_GETARG_INT64(0)); /* nrows(int8) */ + items[1] = PG_GETARG_DATUM(1); /* sum of X */ + items[2] = PG_GETARG_DATUM(2); /* sum of X^2 */ + state = construct_array(items, 3, FLOAT8OID, + sizeof(float8), FLOAT8PASSBYVAL, 'd'); + PG_RETURN_ARRAYTYPE_P(state); +} + +/* + * pgstrom_partial_covariance_float8 + */ +Datum +pgstrom_partial_covariance_float8(PG_FUNCTION_ARGS) +{ + ArrayType *state; + Datum items[6]; + + items[0] = Float8GetDatum((double)PG_GETARG_INT64(0)); /* nrows(int8) */ + items[1] = PG_GETARG_DATUM(1); /* sum of X */ + items[2] = PG_GETARG_DATUM(2); /* sum of X^2 */ + items[3] = PG_GETARG_DATUM(3); /* sum of Y */ + items[4] = PG_GETARG_DATUM(4); /* sum of Y^2 */ + items[5] = PG_GETARG_DATUM(5); /* sum of X*Y */ + state = construct_array(items, 6, FLOAT8OID, + sizeof(float8), FLOAT8PASSBYVAL, 'd'); + PG_RETURN_ARRAYTYPE_P(state); +} + +/* + * float8 validator + */ +static inline float8 * +check_float8_array(ArrayType *transarray, const char *caller, int n) +{ + if (ARR_NDIM(transarray) != 1 || + ARR_DIMS(transarray)[0] != n || + ARR_HASNULL(transarray) || + ARR_ELEMTYPE(transarray) != FLOAT8OID) + elog(ERROR, "%s: expected %d-element float8 array", caller, n); + return (float8 *) ARR_DATA_PTR(transarray); +} + +static inline void +check_float8_value(float8 value, bool inf_is_valid, bool zero_is_valid) +{ + if (isinf(value) && !inf_is_valid) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: overflow"))); + if (value == 0.0 && !zero_is_valid) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: underflow"))); +} + +/* + * pgstrom_float8_combine + */ +Datum +pgstrom_float8_combine(PG_FUNCTION_ARGS) +{ + ArrayType *transarray1 = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *transarray2 = PG_GETARG_ARRAYTYPE_P(1); + float8 *transvalues1; + float8 
*transvalues2;
+	float8		N, sumX, sumX2;
+
+	if (!AggCheckCallContext(fcinfo, NULL))
+		elog(ERROR, "aggregate function called in non-aggregate context");
+	transvalues1 = check_float8_array(transarray1, __FUNCTION__, 3);
+	N = transvalues1[0];
+	sumX = transvalues1[1];
+	sumX2 = transvalues1[2];
+
+	transvalues2 = check_float8_array(transarray2, __FUNCTION__, 3);
+	N += transvalues2[0];
+	sumX += transvalues2[1];
+	sumX2 += transvalues2[2];
+	check_float8_value(sumX, isinf(transvalues1[1]) || isinf(transvalues2[1]), true);
+	check_float8_value(sumX2, isinf(transvalues1[2]) || isinf(transvalues2[2]), true);
+
+	transvalues1[0] = N;
+	transvalues1[1] = sumX;
+	transvalues1[2] = sumX2;
+
+	PG_RETURN_ARRAYTYPE_P(transarray1);
+}
+
+/*
+ * pgstrom_float8_var_samp
+ */
+Datum
+pgstrom_float8_var_samp(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumX2;
+	float8		numerator;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 3);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumX2 = transvalues[2];
+	/* Sample variance is undefined when N is 0, so return NULL */
+	if (N == 0.0)
+		PG_RETURN_NULL();
+
+	numerator = N * sumX2 - sumX * sumX;
+	check_float8_value(numerator, isinf(sumX2) || isinf(sumX), true);
+
+	/* Watch out for roundoff error producing a negative numerator */
+	if (numerator <= 0.0)
+		PG_RETURN_FLOAT8(0.0);
+
+	PG_RETURN_FLOAT8(numerator / (N * (N - 1.0)));
+}
+
+/*
+ * pgstrom_float8_var_pop
+ */
+Datum
+pgstrom_float8_var_pop(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumX2;
+	float8		numerator;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 3);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumX2 = transvalues[2];
+	/* Population variance is undefined when N is 0, so return NULL */
+	if (N == 0.0)
+		PG_RETURN_NULL();
+
+	numerator = N * sumX2 - sumX * sumX;
+	check_float8_value(numerator, isinf(sumX2) || isinf(sumX), true);
+
+	/* Watch out for roundoff error producing a negative numerator */
+	if (numerator <= 0.0)
+		PG_RETURN_FLOAT8(0.0);
+
+	PG_RETURN_FLOAT8(numerator / (N * N));
+}
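These variance finals reduce to the single-pass identities var_samp = (N*sum(X^2) - sum(X)^2) / (N*(N-1)) and var_pop = (N*sum(X^2) - sum(X)^2) / N^2, computed from the transition array (N, sum(X), sum(X^2)). A tiny standalone self-check of that arithmetic with the sample {1, 2, 3}, so N = 3, sum(X) = 6, sum(X^2) = 14, and the shared numerator is 3*14 - 6*6 = 6:

#include <assert.h>

int main(void)
{
    double N = 3.0, sumX = 6.0, sumX2 = 14.0;   /* sums over {1,2,3} */
    double numerator = N * sumX2 - sumX * sumX; /* = 6.0 */

    assert(numerator / (N * (N - 1.0)) == 1.0);     /* var_samp({1,2,3}) */
    assert(numerator / (N * N) == 2.0 / 3.0);       /* var_pop({1,2,3})  */
    return 0;
}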
+
+/*
+ * pgstrom_float8_stddev_samp
+ */
+Datum
+pgstrom_float8_stddev_samp(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumX2;
+	float8		numerator;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 3);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumX2 = transvalues[2];
+	/* Sample stddev is undefined when N is 0, so return NULL */
+	if (N == 0.0)
+		PG_RETURN_NULL();
+
+	numerator = N * sumX2 - sumX * sumX;
+	check_float8_value(numerator, isinf(sumX2) || isinf(sumX), true);
+
+	/* Watch out for roundoff error producing a negative numerator */
+	if (numerator <= 0.0)
+		PG_RETURN_FLOAT8(0.0);
+
+	PG_RETURN_FLOAT8(sqrt(numerator / (N * (N - 1.0))));
+}
+
+/*
+ * pgstrom_float8_stddev_pop
+ */
+Datum
+pgstrom_float8_stddev_pop(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumX2;
+	float8		numerator;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 3);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumX2 = transvalues[2];
+	/* Population stddev is undefined when N is 0, so return NULL */
+	if (N == 0.0)
+		PG_RETURN_NULL();
+
+	numerator = N * sumX2 - sumX * sumX;
+	check_float8_value(numerator, isinf(sumX2) || isinf(sumX), true);
+
+	/* Watch out for roundoff error producing a negative numerator */
+	if (numerator <= 0.0)
+		PG_RETURN_FLOAT8(0.0);
+
+	PG_RETURN_FLOAT8(sqrt(numerator / (N * N)));
+}
+
+/*
+ * pgstrom_float8_stddev_samp_numeric
+ */
+Datum
+pgstrom_float8_stddev_samp_numeric(PG_FUNCTION_ARGS)
+{
+	Datum	datum = pgstrom_float8_stddev_samp(fcinfo);
+
+	PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum));
+}
+
+/*
+ * pgstrom_float8_stddev_pop_numeric
+ */
+Datum
+pgstrom_float8_stddev_pop_numeric(PG_FUNCTION_ARGS)
+{
+	Datum	datum = pgstrom_float8_stddev_pop(fcinfo);
+
+	PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum));
+}
+
+/*
+ * pgstrom_float8_var_samp_numeric
+ */
+Datum
+pgstrom_float8_var_samp_numeric(PG_FUNCTION_ARGS)
+{
+	Datum	datum = pgstrom_float8_var_samp(fcinfo);
+
+	PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum));
+}
+
+/*
+ * pgstrom_float8_var_pop_numeric
+ */
+Datum
+pgstrom_float8_var_pop_numeric(PG_FUNCTION_ARGS)
+{
+	Datum	datum = pgstrom_float8_var_pop(fcinfo);
+
+	PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum));
+}
+
+/*
+ * pgstrom_float8_regr_combine
+ */
+Datum
+pgstrom_float8_regr_combine(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray1 = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *transarray2 = PG_GETARG_ARRAYTYPE_P(1);
+	float8	   *transvalues1;
+	float8	   *transvalues2;
+	float8		N, sumX, sumX2, sumY, sumY2, sumXY;
+
+	if (!AggCheckCallContext(fcinfo, NULL))
+		elog(ERROR, "aggregate function called in non-aggregate context");
+
+	transvalues1 = check_float8_array(transarray1, __FUNCTION__, 6);
+	transvalues2 = check_float8_array(transarray2, __FUNCTION__, 6);
+	N     = transvalues1[0] + transvalues2[0];
+	sumX  = transvalues1[1] + transvalues2[1];
+	sumX2 = transvalues1[2] + transvalues2[2];
+	sumY  = transvalues1[3] + transvalues2[3];
+	sumY2 = transvalues1[4] + transvalues2[4];
+	sumXY = transvalues1[5] + transvalues2[5];
+
+	check_float8_value(sumX,  isinf(transvalues1[1]) || isinf(transvalues2[1]), true);
+	check_float8_value(sumX2, isinf(transvalues1[2]) || isinf(transvalues2[2]), true);
+	check_float8_value(sumY,  isinf(transvalues1[3]) || isinf(transvalues2[3]), true);
+	check_float8_value(sumY2, isinf(transvalues1[4]) || isinf(transvalues2[4]), true);
+	check_float8_value(sumXY, isinf(transvalues1[5]) || isinf(transvalues2[5]), true);
+
+	transvalues1[0] = N;
+	transvalues1[1] = sumX;
+	transvalues1[2] = sumX2;
+	transvalues1[3] = sumY;
+	transvalues1[4] = sumY2;
+	transvalues1[5] = sumXY;
+
+	PG_RETURN_ARRAYTYPE_P(transarray1);
+}
+
+/*
+ * pgstrom_float8_covar_pop
+ */
+Datum
+pgstrom_float8_covar_pop(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumY, sumXY;
+	float8		numerator;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 6);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumY = transvalues[3];
+	sumXY = transvalues[5];
+
+	/* if N is 0 we should return NULL */
+	if (N < 1.0)
+		PG_RETURN_NULL();
+	numerator = N * sumXY - sumX * sumY;
+	check_float8_value(numerator, isinf(sumXY) || isinf(sumX) || isinf(sumY), true);
+
+	PG_RETURN_FLOAT8(numerator / (N * N));
+}
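The two covariance finals differ only in the divisor: both start from the numerator N*sum(XY) - sum(X)*sum(Y), then covar_pop divides by N^2 while covar_samp divides by N*(N-1). A small self-check with the exactly-linear points (1,2), (2,4), (3,6), where N = 3, sum(X) = 6, sum(Y) = 12, sum(XY) = 28, and the numerator is 3*28 - 6*12 = 12:

#include <assert.h>

int main(void)
{
    /* points (1,2), (2,4), (3,6) -- y = 2x exactly */
    double N = 3.0, sumX = 6.0, sumY = 12.0, sumXY = 28.0;
    double numerator = N * sumXY - sumX * sumY;  /* = 12.0 */

    assert(numerator / (N * N) == 4.0 / 3.0);       /* covar_pop  */
    assert(numerator / (N * (N - 1.0)) == 2.0);     /* covar_samp */
    return 0;
}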
+
+/*
+ * pgstrom_float8_covar_samp
+ */
+Datum
+pgstrom_float8_covar_samp(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumY, sumXY;
+	float8		numerator;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 6);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumY = transvalues[3];
+	sumXY = transvalues[5];
+
+	/* if N is <= 1 we should return NULL */
+	if (N < 2.0)
+		PG_RETURN_NULL();
+	numerator = N * sumXY - sumX * sumY;
+	check_float8_value(numerator, isinf(sumXY) || isinf(sumX) || isinf(sumY), true);
+
+	PG_RETURN_FLOAT8(numerator / (N * (N - 1.0)));
+}
+
+/*
+ * pgstrom_float8_corr
+ */
+Datum
+pgstrom_float8_corr(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumX2, sumY, sumY2, sumXY;
+	float8		numeratorX, numeratorY, numeratorXY;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 6);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumX2 = transvalues[2];
+	sumY = transvalues[3];
+	sumY2 = transvalues[4];
+	sumXY = transvalues[5];
+
+	/* if N is 0 we should return NULL */
+	if (N < 1.0)
+		PG_RETURN_NULL();
+	numeratorX = N * sumX2 - sumX * sumX;
+	numeratorY = N * sumY2 - sumY * sumY;
+	numeratorXY = N * sumXY - sumX * sumY;
+	check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true);
+	check_float8_value(numeratorY, isinf(sumY) || isinf(sumY2), true);
+	check_float8_value(numeratorXY, isinf(sumXY) || isinf(sumX) || isinf(sumY), true);
+
+	if (numeratorX <= 0 || numeratorY <= 0)
+		PG_RETURN_NULL();
+	PG_RETURN_FLOAT8(numeratorXY / sqrt(numeratorX * numeratorY));
+}
+
+/*
+ * pgstrom_float8_regr_avgx
+ */
+Datum
+pgstrom_float8_regr_avgx(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 6);
+	N = transvalues[0];
+	sumX = transvalues[1];
+
+	/* if N is 0 we should return NULL */
+	if (N < 1.0)
+		PG_RETURN_NULL();
+	PG_RETURN_FLOAT8(sumX / N);
+}
+
+/*
+ * pgstrom_float8_regr_avgy
+ */
+Datum
+pgstrom_float8_regr_avgy(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumY;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 6);
+	N = transvalues[0];
+	sumY = transvalues[3];
+
+	/* if N is 0 we should return NULL */
+	if (N < 1.0)
+		PG_RETURN_NULL();
+	PG_RETURN_FLOAT8(sumY / N);
+}
+
+/*
+ * pgstrom_float8_regr_intercept
+ */
+Datum
+pgstrom_float8_regr_intercept(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumX2, sumY, sumXY;
+	float8		numeratorX, numeratorXXY;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 6);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumX2 = transvalues[2];
+	sumY = transvalues[3];
+	sumXY = transvalues[5];
+
+	/* if N is 0 we should return NULL */
+	if (N < 1.0)
+		PG_RETURN_NULL();
+	numeratorX = N * sumX2 - sumX * sumX;
+	numeratorXXY = sumY * sumX2 - sumX * sumXY;
+	check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true);
+	check_float8_value(numeratorXXY, (isinf(sumY) || isinf(sumX2) ||
+									  isinf(sumX) || isinf(sumXY)), true);
+	if (numeratorX <= 0)
+		PG_RETURN_NULL();
+	PG_RETURN_FLOAT8(numeratorXXY / numeratorX);
+}
+
+/*
+ * pgstrom_float8_regr_r2
+ */
+Datum
+pgstrom_float8_regr_r2(PG_FUNCTION_ARGS)
+{
+	ArrayType  *transarray = PG_GETARG_ARRAYTYPE_P(0);
+	float8	   *transvalues;
+	float8		N, sumX, sumX2, sumY, sumY2, sumXY;
+	float8		numeratorX, numeratorY, numeratorXY;
+
+	transvalues = check_float8_array(transarray, __FUNCTION__, 6);
+	N = transvalues[0];
+	sumX = transvalues[1];
+	sumX2 = transvalues[2];
+	sumY = transvalues[3];
+	sumY2 = transvalues[4];
+	sumXY = transvalues[5];
+
+	/* if N is 0 we should return NULL */
+	if (N < 1.0)
+		PG_RETURN_NULL();
+	numeratorX = N * sumX2 - sumX * sumX;
+
numeratorY = N * sumY2 - sumY * sumY; + numeratorXY = N * sumXY - sumX * sumY; + check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true); + check_float8_value(numeratorY, isinf(sumY) || isinf(sumY2), true); + check_float8_value(numeratorXY, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); + + if (numeratorX <= 0.0) + PG_RETURN_NULL(); + if (numeratorY <= 0.0) + PG_RETURN_FLOAT8(1.0); + PG_RETURN_FLOAT8((numeratorXY * numeratorXY) / (numeratorX * numeratorY)); +} + +/* + * pgstrom_float8_regr_slope + */ +Datum +pgstrom_float8_regr_slope(PG_FUNCTION_ARGS) +{ + ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); + float8 *transvalues; + float8 N, sumX, sumX2, sumY, sumXY; + float8 numeratorX, numeratorXY; + + transvalues = check_float8_array(transarray, __FUNCTION__, 6); + N = transvalues[0]; + sumX = transvalues[1]; + sumX2 = transvalues[2]; + sumY = transvalues[3]; + sumXY = transvalues[5]; + + /* if N is 0 we should return NULL */ + if (N < 1.0) + PG_RETURN_NULL(); + numeratorX = N * sumX2 - sumX * sumX; + numeratorXY = N * sumXY - sumX * sumY; + check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true); + check_float8_value(numeratorXY, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); + + if (numeratorX <= 0) + PG_RETURN_NULL(); + PG_RETURN_FLOAT8(numeratorXY / numeratorX); +} + +/* + * pgstrom_float8_regr_sxx + */ +Datum +pgstrom_float8_regr_sxx(PG_FUNCTION_ARGS) +{ + ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); + float8 *transvalues; + float8 N, sumX, sumX2; + float8 numeratorX; + + transvalues = check_float8_array(transarray, __FUNCTION__, 6); + N = transvalues[0]; + sumX = transvalues[1]; + sumX2 = transvalues[2]; + + /* if N is 0 we should return NULL */ + if (N < 1.0) + PG_RETURN_NULL(); + numeratorX = N * sumX2 - sumX * sumX; + check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true); + + if (numeratorX <= 0) + PG_RETURN_NULL(); + PG_RETURN_FLOAT8(numeratorX / N); +} + +/* + * pgstrom_float8_regr_syy + */ +Datum +pgstrom_float8_regr_syy(PG_FUNCTION_ARGS) +{ + ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); + float8 *transvalues; + float8 N, sumY, sumY2; + float8 numeratorY; + + transvalues = check_float8_array(transarray, __FUNCTION__, 6); + N = transvalues[0]; + sumY = transvalues[3]; + sumY2 = transvalues[4]; + + /* if N is 0 we should return NULL */ + if (N < 1.0) + PG_RETURN_NULL(); + numeratorY = N * sumY2 - sumY * sumY; + check_float8_value(numeratorY, isinf(sumY) || isinf(sumY2), true); + + if (numeratorY <= 0) + PG_RETURN_NULL(); + PG_RETURN_FLOAT8(numeratorY / N); +} + +/* + * pgstrom_float8_regr_sxy + */ +Datum +pgstrom_float8_regr_sxy(PG_FUNCTION_ARGS) +{ + ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); + float8 *transvalues; + float8 N, sumX, sumY, sumXY; + float8 numeratorXY; + + transvalues = check_float8_array(transarray, __FUNCTION__, 6); + N = transvalues[0]; + sumX = transvalues[1]; + sumY = transvalues[3]; + sumXY = transvalues[5]; + + /* if N is 0 we should return NULL */ + if (N < 1.0) + PG_RETURN_NULL(); + numeratorXY = N * sumXY - sumX * sumY; + check_float8_value(numeratorXY, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); + + PG_RETURN_FLOAT8(numeratorXY / N); +} + +/* + * ---------------------------------------------------------------- + * + * Hyper-Log-Log support functions + * + * ---------------------------------------------------------------- + */ + +/* + * Hash-function based on Sip-Hash + * + * See https://en.wikipedia.org/wiki/SipHash + * and https://github.com/veorq/SipHash + */ +/* default: SipHash-2-4 */ 
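+/*
+ * NOTE: the 64-bit hash produced here is consumed by the HLL sketch
+ * functions further below as follows: the low pgstrom_hll_register_bits
+ * bits select a register of the sketch (index = hash & (nrooms - 1)),
+ * and the number of trailing zero bits of the remainder, plus one, is
+ * the rank stored into that register
+ * (count = __builtin_ctzll(hash >> pgstrom_hll_register_bits) + 1).
+ */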
+#define cROUNDS 2 +#define dROUNDS 4 +#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))) + +#define U8TO64_LE(p) \ + (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \ + ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \ + ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \ + ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) + +#define SIPROUND \ + do { \ + v0 += v1; \ + v1 = ROTL(v1, 13); \ + v1 ^= v0; \ + v0 = ROTL(v0, 32); \ + v2 += v3; \ + v3 = ROTL(v3, 16); \ + v3 ^= v2; \ + v0 += v3; \ + v3 = ROTL(v3, 21); \ + v3 ^= v0; \ + v2 += v1; \ + v1 = ROTL(v1, 17); \ + v1 ^= v2; \ + v2 = ROTL(v2, 32); \ + } while (0) + +static uint64_t +__pgstrom_hll_siphash_value(const void *ptr, const size_t len) +{ + const unsigned char *ni = (const unsigned char *)ptr; + uint64_t v0 = 0x736f6d6570736575UL; + uint64_t v1 = 0x646f72616e646f6dUL; + uint64_t v2 = 0x6c7967656e657261UL; + uint64_t v3 = 0x7465646279746573UL; + uint64_t k0 = 0x9c38151cda15a76bUL; /* random key-0 */ + uint64_t k1 = 0xfb4ff68fbd3e6658UL; /* random key-1 */ + uint64_t m; + int i; + const unsigned char *end = ni + len - (len % sizeof(uint64_t)); + const int left = len & 7; + uint64_t b = ((uint64_t)len) << 56; + + v3 ^= k1; + v2 ^= k0; + v1 ^= k1; + v0 ^= k0; + + for (; ni != end; ni += 8) + { + m = U8TO64_LE(ni); + v3 ^= m; + + for (i = 0; i < cROUNDS; ++i) + SIPROUND; + + v0 ^= m; + } + +#if 1 + if (left > 0) + { + uint64_t temp = 0; + + memcpy(&temp, ni, left); + b |= (temp & ((1UL << (BITS_PER_BYTE * left)) - 1)); + } +#else + /* original code */ + switch (left) + { + case 7: + b |= ((uint64_t)ni[6]) << 48; __attribute__ ((fallthrough)); + case 6: + b |= ((uint64_t)ni[5]) << 40; __attribute__ ((fallthrough)); + case 5: + b |= ((uint64_t)ni[4]) << 32; __attribute__ ((fallthrough)); + case 4: + b |= ((uint64_t)ni[3]) << 24; __attribute__ ((fallthrough)); + case 3: + b |= ((uint64_t)ni[2]) << 16; __attribute__ ((fallthrough)); + case 2: + b |= ((uint64_t)ni[1]) << 8; __attribute__ ((fallthrough)); + case 1: + b |= ((uint64_t)ni[0]); + break; + case 0: + break; + } +#endif + + v3 ^= b; + for (i = 0; i < cROUNDS; ++i) + SIPROUND; + + v0 ^= b; + + v2 ^= 0xff; + + for (i = 0; i < dROUNDS; ++i) + SIPROUND; + + b = v0 ^ v1 ^ v2 ^ v3; + + return b; +} + +/* + * pgstrom_hll_hash_xxxx functions + */ +static uint64 +__pgstrom_hll_hash_int1(Datum datum) +{ + return __pgstrom_hll_siphash_value(&datum, sizeof(int8)); +} + +static uint64 +__pgstrom_hll_hash_int2(Datum datum) +{ + return __pgstrom_hll_siphash_value(&datum, sizeof(int16)); +} + +static uint64 +__pgstrom_hll_hash_int4(Datum datum) +{ + return __pgstrom_hll_siphash_value(&datum, sizeof(int32)); +} + +static uint64 +__pgstrom_hll_hash_int8(Datum datum) +{ + return __pgstrom_hll_siphash_value(&datum, sizeof(int64)); +} + +static uint64 +__pgstrom_hll_hash_numeric(Datum datum) +{ + kern_context kcxt; + pg_numeric_t num; + size_t sz; + + memset(&kcxt, 0, sizeof(kcxt)); + num = pg_numeric_from_varlena(&kcxt, (struct varlena *)datum); + if (kcxt.errcode != ERRCODE_STROM_SUCCESS) + elog(ERROR, "failed on hash calculation of device numeric: %s", + DatumGetCString(DirectFunctionCall1(numeric_out, datum))); + sz = offsetof(pg_numeric_t, weight) + sizeof(cl_short); + return __pgstrom_hll_siphash_value(&num, sz); +} + +static uint64 +__pgstrom_hll_hash_date(Datum datum) +{ + return __pgstrom_hll_siphash_value(&datum, sizeof(DateADT)); +} + +static uint64 +__pgstrom_hll_hash_time(Datum datum) +{ + return __pgstrom_hll_siphash_value(&datum, 
sizeof(TimeADT)); +} + +static uint64 +__pgstrom_hll_hash_timetz(Datum datum) +{ + return __pgstrom_hll_siphash_value(DatumGetPointer(datum), sizeof(TimeTzADT)); +} + +static uint64 +__pgstrom_hll_hash_timestamp(Datum datum) +{ + return __pgstrom_hll_siphash_value(&datum, sizeof(Timestamp)); +} + +static uint64 +__pgstrom_hll_hash_timestamptz(Datum datum) +{ + return __pgstrom_hll_siphash_value(&datum, sizeof(TimestampTz)); +} + +static uint64 +__pgstrom_hll_hash_bpchar(Datum datum) +{ + BpChar *val = DatumGetBpCharPP(datum); + int len = bpchartruelen(VARDATA_ANY(val), + VARSIZE_ANY_EXHDR(val)); + return __pgstrom_hll_siphash_value(VARDATA_ANY(val), len); +} + +static uint64 +__pgstrom_hll_hash_varlena(Datum datum) +{ + struct varlena *val = PG_DETOAST_DATUM(datum); + + return __pgstrom_hll_siphash_value(VARDATA_ANY(val), VARSIZE_ANY_EXHDR(val)); +} + +static uint64 +__pgstrom_hll_hash_uuid(Datum datum) +{ + return __pgstrom_hll_siphash_value(DatumGetUUIDP(datum), sizeof(pg_uuid_t)); +} + +static bytea * +__pgstrom_hll_sketch_update_common(PG_FUNCTION_ARGS, uint64 hash) +{ + MemoryContext aggcxt; + bytea *hll_state; + uint8 *hll_regs; + uint64 nrooms; + uint32 index; + uint32 count; + + if (!AggCheckCallContext(fcinfo, &aggcxt)) + elog(ERROR, "aggregate function called in non-aggregate context"); + nrooms = (1UL << pgstrom_hll_register_bits); + if (PG_ARGISNULL(0)) + { + size_t sz = VARHDRSZ + sizeof(uint8) * nrooms; + hll_state = MemoryContextAllocZero(aggcxt, sz); + SET_VARSIZE(hll_state, sz); + } + else + { + hll_state = PG_GETARG_BYTEA_P(0); + } + Assert(VARSIZE(hll_state) == VARHDRSZ + sizeof(uint8) * nrooms); + hll_regs = (uint8 *)VARDATA(hll_state); + + index = hash & (nrooms - 1); + count = __builtin_ctzll(hash >> pgstrom_hll_register_bits) + 1; + if (hll_regs[index] < count) + hll_regs[index] = count; + return hll_state; +} + +#define PGSTROM_HLL_HANDLER_TEMPLATE(NAME) \ + PG_FUNCTION_INFO_V1(pgstrom_hll_hash_##NAME); \ + PG_FUNCTION_INFO_V1(pgstrom_hll_sketch_update_##NAME); \ + Datum \ + pgstrom_hll_hash_##NAME(PG_FUNCTION_ARGS) \ + { \ + Datum arg = PG_GETARG_DATUM(0); \ + PG_RETURN_UINT64(__pgstrom_hll_hash_##NAME(arg)); \ + } \ + Datum \ + pgstrom_hll_sketch_update_##NAME(PG_FUNCTION_ARGS) \ + { \ + if (PG_ARGISNULL(1)) \ + { \ + if (PG_ARGISNULL(0)) \ + PG_RETURN_NULL(); \ + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); \ + } \ + else \ + { \ + Datum arg = PG_GETARG_DATUM(1); \ + uint64 hash = __pgstrom_hll_hash_##NAME(arg); \ + bytea *state; \ + \ + state = __pgstrom_hll_sketch_update_common(fcinfo, hash); \ + PG_RETURN_BYTEA_P(state); \ + } \ + } + +PGSTROM_HLL_HANDLER_TEMPLATE(int1) +PGSTROM_HLL_HANDLER_TEMPLATE(int2) +PGSTROM_HLL_HANDLER_TEMPLATE(int4) +PGSTROM_HLL_HANDLER_TEMPLATE(int8) +PGSTROM_HLL_HANDLER_TEMPLATE(numeric) +PGSTROM_HLL_HANDLER_TEMPLATE(date) +PGSTROM_HLL_HANDLER_TEMPLATE(time) +PGSTROM_HLL_HANDLER_TEMPLATE(timetz) +PGSTROM_HLL_HANDLER_TEMPLATE(timestamp) +PGSTROM_HLL_HANDLER_TEMPLATE(timestamptz) +PGSTROM_HLL_HANDLER_TEMPLATE(bpchar) +PGSTROM_HLL_HANDLER_TEMPLATE(varlena) +PGSTROM_HLL_HANDLER_TEMPLATE(uuid) + +/* + * pgstrom_hll_sketch_new + */ +Datum +pgstrom_hll_sketch_new(PG_FUNCTION_ARGS) +{ + uint64 nrooms = (1UL << pgstrom_hll_register_bits); + uint64 hll_hash = DatumGetUInt64(PG_GETARG_DATUM(0)); + bytea *hll_state; + uint8 *hll_regs; + uint32 count; + uint32 index; + + hll_state = palloc0(VARHDRSZ + sizeof(uint8) * nrooms); + SET_VARSIZE(hll_state, VARHDRSZ + sizeof(uint8) * nrooms); + hll_regs = (uint8 *)VARDATA(hll_state); + + index = hll_hash & 
(nrooms - 1);
+	Assert(index < nrooms);
+	count = __builtin_ctzll(hll_hash >> pgstrom_hll_register_bits) + 1;
+	if (hll_regs[index] < count)
+		hll_regs[index] = count;
+
+	PG_RETURN_BYTEA_P(hll_state);
+}
+
+/*
+ * pgstrom_hll_sketch_merge
+ */
+Datum
+pgstrom_hll_sketch_merge(PG_FUNCTION_ARGS)
+{
+	MemoryContext aggcxt;
+	bytea	   *hll_state = NULL;
+	uint8	   *hll_regs;
+	bytea	   *new_state;
+	uint8	   *new_regs;
+	uint32		nrooms;
+	uint32		index;
+
+	if (!AggCheckCallContext(fcinfo, &aggcxt))
+		elog(ERROR, "aggregate function called in non-aggregate context");
+	if (PG_ARGISNULL(0))
+	{
+		if (PG_ARGISNULL(1))
+			PG_RETURN_NULL();
+		new_state = PG_GETARG_BYTEA_P(1);
+		nrooms = VARSIZE_ANY_EXHDR(new_state);
+		if (nrooms < 1 || (nrooms & (nrooms - 1)) != 0)
+			elog(ERROR, "HLL sketch must have 2^N rooms (%u)", nrooms);
+		hll_state = MemoryContextAllocZero(aggcxt, VARHDRSZ + nrooms);
+		SET_VARSIZE(hll_state, VARHDRSZ + nrooms);
+		memcpy(VARDATA_ANY(hll_state), VARDATA_ANY(new_state), nrooms);
+	}
+	else
+	{
+		hll_state = PG_GETARG_BYTEA_P(0);
+		nrooms = VARSIZE_ANY_EXHDR(hll_state);
+		if (nrooms < 1 || (nrooms & (nrooms - 1)) != 0)
+			elog(ERROR, "HLL sketch must have 2^N rooms (%u)", nrooms);
+		if (!PG_ARGISNULL(1))
+		{
+			new_state = PG_GETARG_BYTEA_P(1);
+			if (VARSIZE_ANY_EXHDR(hll_state) != VARSIZE_ANY_EXHDR(new_state))
+				elog(ERROR, "incompatible HLL sketch");
+			hll_regs = (uint8 *)VARDATA_ANY(hll_state);
+			new_regs = (uint8 *)VARDATA_ANY(new_state);
+			for (index=0; index < nrooms; index++)
+			{
+				if (hll_regs[index] < new_regs[index])
+					hll_regs[index] = new_regs[index];
+			}
+		}
+	}
+	PG_RETURN_POINTER(hll_state);
+}
+
+/*
+ * pgstrom_hll_count_final
+ */
+Datum
+pgstrom_hll_count_final(PG_FUNCTION_ARGS)
+{
+	bytea	   *hll_state;
+	uint8	   *hll_regs;
+	uint32		nrooms;
+	uint32		index;
+	double		divider = 0.0;
+	double		weight;
+	double		estimate;
+
+#if 0
+	/*
+	 * MEMO: There is no reason to prohibit pgstrom.hll_count_final()
+	 * on a preliminarily calculated HLL sketch.
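+	 * The final function below only folds the per-register maxima into
+	 * a scalar estimate, so it works just as well on a sketch supplied
+	 * as a plain bytea value; hence this aggregate-context check is
+	 * left disabled.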
+ */ + if (!AggCheckCallContext(fcinfo, NULL)) + elog(ERROR, "aggregate function called in non-aggregate context"); +#endif + if (PG_ARGISNULL(0)) + PG_RETURN_INT64(0); + /* + * MEMO: Hyper-Log-Log merge algorithm + * https://ja.wikiqube.net/wiki/HyperLogLog + */ + hll_state = PG_GETARG_BYTEA_P(0); + nrooms = VARSIZE_ANY_EXHDR(hll_state); + if (nrooms < 1 || (nrooms & (nrooms - 1)) != 0) + elog(ERROR, "HLL sketch must have 2^N rooms (%u)", nrooms); + hll_regs = (uint8 *)VARDATA(hll_state); + + for (index = 0; index < nrooms; index++) + divider += 1.0 / (double)(1UL << hll_regs[index]); + if (nrooms <= 16) + weight = 0.673; + else if (nrooms <= 32) + weight = 0.697; + else if (nrooms <= 64) + weight = 0.709; + else + weight = 0.7213 / (1.0 + 1.079 / (double)nrooms); + + estimate = (weight * (double)nrooms * (double)nrooms) / divider; + PG_RETURN_INT64((int64)estimate); +} + + + +/* + * pgstrom_hll_sketch_histogram + */ +Datum +pgstrom_hll_sketch_histogram(PG_FUNCTION_ARGS) +{ + bytea *hll_state = PG_GETARG_BYTEA_P(0); + uint8 *hll_regs; + uint32 nrooms; + uint32 index; + Datum hll_hist[64]; + int max_hist = -1; + ArrayType *result; + + nrooms = VARSIZE_ANY_EXHDR(hll_state); + if (nrooms < 1 || (nrooms & (nrooms - 1)) != 0) + elog(ERROR, "HLL sketch must have 2^N rooms (%u)", nrooms); + hll_regs = (uint8 *)VARDATA(hll_state); + + memset(hll_hist, 0, sizeof(hll_hist)); + for (index=0; index < nrooms; index++) + { + int value = (int)hll_regs[index]; + + if (value < 0 || value >= 64) + elog(ERROR, "HLL sketch looks corrupted"); + hll_hist[value]++; + if (max_hist < value) + max_hist = value; + } + + if (max_hist < 0) + PG_RETURN_NULL(); + + result = construct_array(hll_hist, + max_hist + 1, + INT4OID, + sizeof(int32), + true, + 'i'); + PG_RETURN_POINTER(result); +} diff --git a/next/arrow_defs.h b/old/arrow_defs.h similarity index 91% rename from next/arrow_defs.h rename to old/arrow_defs.h index b78ca1ae4..3da60b165 100644 --- a/next/arrow_defs.h +++ b/old/arrow_defs.h @@ -126,7 +126,6 @@ typedef enum { ArrowIntervalUnit__Year_Month = 0, ArrowIntervalUnit__Day_Time = 1, - ArrowIntervalUnit__Month_Day_Nano = 2, } ArrowIntervalUnit; /* @@ -193,39 +192,50 @@ typedef enum /* * ArrowTypeOptions - our own definition */ -typedef struct ArrowTypeOptions -{ - ArrowTypeTag tag; - short unitsz; - union { - struct { - unsigned short bitWidth; - __boolean is_signed; - } integer; - struct { - ArrowPrecision precision; - } floating_point; - struct { - unsigned short precision; - unsigned short scale; - unsigned short bitWidth; - } decimal; - struct { - ArrowDateUnit unit; - } date; - struct { - ArrowTimeUnit unit; - } time; - struct { - ArrowTimeUnit unit; - } timestamp; - struct { - ArrowIntervalUnit unit; - } interval; - struct { - unsigned int byteWidth; - } fixed_size_binary; - }; +#define ARROW_TYPE_OPTIONS_COMMON_FIELDS \ + ArrowTypeTag tag; \ + unsigned short unitsz + +typedef union ArrowTypeOptions +{ + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + } common; + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + unsigned short bitWidth; + __boolean is_signed; + } integer; + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + ArrowPrecision precision; + } floating_point; + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + unsigned short precision; + unsigned short scale; + unsigned short bitWidth; + } decimal; + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + ArrowDateUnit unit; + } date; + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + ArrowTimeUnit unit; + } time; + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + 
ArrowTimeUnit unit; + } timestamp; + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + ArrowIntervalUnit unit; + } interval; + struct { + ARROW_TYPE_OPTIONS_COMMON_FIELDS; + int byteWidth; + } fixed_size_binary; } ArrowTypeOptions; #undef ARROW_TYPE_OPTIONS_COMMON_FIELDS diff --git a/old/arrow_fdw.c b/old/arrow_fdw.c new file mode 100644 index 000000000..f7b6f2a32 --- /dev/null +++ b/old/arrow_fdw.c @@ -0,0 +1,6239 @@ +/* + * arrow_fdw.c + * + * Routines to map Apache Arrow files as PG's Foreign-Table. + * ---- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. + */ +#include "pg_strom.h" +#include "arrow_defs.h" +#include "arrow_ipc.h" +#include "cuda_numeric.cu" + +/* + * RecordBatchState + */ +typedef struct RecordBatchFieldState +{ + Oid atttypid; + int atttypmod; + ArrowTypeOptions attopts; + int64 nitems; /* usually, same with rb_nitems */ + int64 null_count; + off_t nullmap_offset; + size_t nullmap_length; + off_t values_offset; + size_t values_length; + off_t extra_offset; + size_t extra_length; + /* min/max statistics */ + SQLstat__datum stat_min; + SQLstat__datum stat_max; + bool stat_isnull; + /* sub-fields if any */ + int num_children; + struct RecordBatchFieldState *children; +} RecordBatchFieldState; + +typedef struct RecordBatchState +{ + File fdesc; + GPUDirectFileDesc *dfile; + struct stat stat_buf; + int rb_index; /* index number in a file */ + off_t rb_offset; /* offset from the head */ + size_t rb_length; /* length of the entire RecordBatch */ + int64 rb_nitems; /* number of items */ + /* per column information */ + int ncols; + RecordBatchFieldState columns[FLEXIBLE_ARRAY_MEMBER]; +} RecordBatchState; + +/* + * metadata cache (on shared memory) + */ +typedef struct +{ + dev_t st_dev; + ino_t st_ino; + uint32 hash; +} MetadataCacheKey; + +typedef struct +{ + dlist_node chain; + dlist_node lru_chain; + dlist_head siblings; /* if two or more record batches per file */ + /* key of RecordBatch metadata cache */ + struct stat stat_buf; + uint32 hash; + /* fields from RecordBatchState */ + int rb_index; /* index of the RecordBatch */ + off_t rb_offset; /* offset from the head */ + size_t rb_length; /* length of the entire RecordBatch */ + int64 rb_nitems; /* number of items */ + int ncols; + int nfields; /* length of fstate[] array */ + RecordBatchFieldState fstate[FLEXIBLE_ARRAY_MEMBER]; +} arrowMetadataCache; + +#define ARROW_METADATA_HASH_NSLOTS 2048 +typedef struct +{ + slock_t lru_lock; + dlist_head lru_list; + pg_atomic_uint64 consumed; + + LWLock lock_slots[ARROW_METADATA_HASH_NSLOTS]; + dlist_head hash_slots[ARROW_METADATA_HASH_NSLOTS]; + dlist_head mvcc_slots[ARROW_METADATA_HASH_NSLOTS]; +} arrowMetadataState; + +/* setup of MetadataCacheKey */ +static inline int +initMetadataCacheKey(MetadataCacheKey *mkey, struct stat *stat_buf) +{ + memset(mkey, 0, sizeof(MetadataCacheKey)); + mkey->st_dev = stat_buf->st_dev; + mkey->st_ino = stat_buf->st_ino; + mkey->hash = hash_any((unsigned char *)mkey, + offsetof(MetadataCacheKey, hash)); + return mkey->hash % ARROW_METADATA_HASH_NSLOTS; +} + +/* + * executor hint by min/max statistics per record batch + */ +typedef struct +{ + List *orig_quals; + List *eval_quals; + ExprState *eval_state; + Bitmapset *stat_attrs; + Bitmapset *load_attrs; + ExprContext *econtext; +} arrowStatsHint; + +/* + * MVCC state for the pending writes + */ +typedef struct +{ + dlist_node 
chain; + MetadataCacheKey key; + TransactionId xid; + CommandId cid; + uint32 record_batch; +} arrowWriteMVCCLog; + +/* + * REDO Log for INSERT/TRUNCATE + */ +typedef struct +{ + dlist_node chain; + MetadataCacheKey key; + TransactionId xid; + CommandId cid; + char *pathname; + bool is_truncate; + /* for TRUNCATE */ + uint32 suffix; + /* for INSERT */ + loff_t footer_offset; + size_t footer_length; + char footer_backup[FLEXIBLE_ARRAY_MEMBER]; +} arrowWriteRedoLog; + +/* + * arrowWriteState + */ +typedef struct +{ + MemoryContext memcxt; + File file; + MetadataCacheKey key; + uint32 hash; + SQLtable sql_table; +} arrowWriteState; + +/* + * ArrowFdwState + */ +struct ArrowFdwState +{ + GpuContext *gcontext; /* valid if owned by GpuXXX plan */ + List *gpuDirectFileDescList; /* list of GPUDirectFileDesc */ + List *fdescList; /* list of File (buffered i/o) */ + Bitmapset *referenced; + arrowStatsHint *stats_hint; + pg_atomic_uint32 *rbatch_index; + pg_atomic_uint32 __rbatch_index_local; /* if single process */ + pg_atomic_uint32 *rbatch_nload; + pg_atomic_uint32 __rbatch_nload_local; /* if single process */ + pg_atomic_uint32 *rbatch_nskip; + pg_atomic_uint32 __rbatch_nskip_local; /* if single process */ + pgstrom_data_store *curr_pds; /* current focused buffer */ + cl_ulong curr_index; /* current index to row on KDS */ + /* state of RecordBatches */ + uint32 num_rbatches; + RecordBatchState *rbatches[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* ---------- static variables ---------- */ +static FdwRoutine pgstrom_arrow_fdw_routine; +static shmem_request_hook_type shmem_request_next = NULL; +static shmem_startup_hook_type shmem_startup_next = NULL; +static arrowMetadataState *arrow_metadata_state = NULL; +static dlist_head arrow_write_redo_list; +static bool arrow_fdw_enabled; /* GUC */ +static bool arrow_fdw_stats_hint_enabled; /* GUC */ +static int arrow_metadata_cache_size_kb; /* GUC */ +static size_t arrow_metadata_cache_size; +static int arrow_record_batch_size_kb; /* GUC */ + +/* ---------- static functions ---------- */ +static Oid arrowTypeToPGTypeOid(ArrowField *field, int *typmod); +static const char *arrowTypeToPGTypeName(ArrowField *field); +static size_t arrowFieldLength(ArrowField *field, int64 nitems); +static bool arrowSchemaCompatibilityCheck(TupleDesc tupdesc, + RecordBatchState *rb_state); +static List *__arrowFdwExtractFilesList(List *options_list, + int *p_parallel_nworkers, + bool *p_writable); +static List *arrowFdwExtractFilesList(List *options_list); +static List *arrowLookupOrBuildMetadataCache(File fdesc, Bitmapset **p_stat_attrs); +static void pg_datum_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, + size_t index, + Datum *p_datum, + bool *p_isnull); +/* routines for writable arrow_fdw foreign tables */ +static void setupArrowSQLbufferSchema(SQLtable *table, TupleDesc tupdesc, + ArrowFileInfo *af_info); +static void setupArrowSQLbufferBatches(SQLtable *table, + ArrowFileInfo *af_info); +static loff_t createArrowWriteRedoLog(File filp, bool is_newfile); +static void writeOutArrowRecordBatch(arrowWriteState *aw_state, + bool with_footer); + +Datum pgstrom_arrow_fdw_handler(PG_FUNCTION_ARGS); +Datum pgstrom_arrow_fdw_validator(PG_FUNCTION_ARGS); +Datum pgstrom_arrow_fdw_precheck_schema(PG_FUNCTION_ARGS); +Datum pgstrom_arrow_fdw_truncate(PG_FUNCTION_ARGS); +Datum pgstrom_arrow_fdw_import_file(PG_FUNCTION_ARGS); + +/* + * timespec_comp - compare timespec values + */ +static inline int +timespec_comp(struct timespec *tv1, struct timespec *tv2) +{ + if (tv1->tv_sec < 
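
initMetadataCacheKey above identifies an Arrow file by its (st_dev, st_ino) pair, hashes that pair, and maps the hash onto one of ARROW_METADATA_HASH_NSLOTS buckets. A stand-alone sketch of the same idea, where FNV-1a stands in for PostgreSQL's hash_any():

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/stat.h>

    #define NSLOTS 2048    /* mirrors ARROW_METADATA_HASH_NSLOTS */

    static uint32_t fnv1a(const unsigned char *buf, size_t len)
    {
        uint32_t h = 2166136261u;
        while (len-- > 0)
            h = (h ^ *buf++) * 16777619u;
        return h;
    }

    static int metadata_cache_slot(const struct stat *st)
    {
        unsigned char key[sizeof(dev_t) + sizeof(ino_t)];

        /* device + inode uniquely identify the file on this host */
        memcpy(key, &st->st_dev, sizeof(dev_t));
        memcpy(key + sizeof(dev_t), &st->st_ino, sizeof(ino_t));
        return (int)(fnv1a(key, sizeof(key)) % NSLOTS);
    }

    int main(void)
    {
        struct stat st;

        if (stat("/etc/hostname", &st) == 0)
            printf("slot = %d\n", metadata_cache_slot(&st));
        return 0;
    }
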
tv2->tv_sec) + return -1; + if (tv1->tv_sec > tv2->tv_sec) + return 1; + if (tv1->tv_nsec < tv2->tv_nsec) + return -1; + if (tv1->tv_nsec > tv2->tv_nsec) + return 1; + return 0; +} + +/* + * baseRelIsArrowFdw + */ +bool +baseRelIsArrowFdw(RelOptInfo *baserel) +{ + if ((baserel->reloptkind == RELOPT_BASEREL || + baserel->reloptkind == RELOPT_OTHER_MEMBER_REL) && + baserel->rtekind == RTE_RELATION && + OidIsValid(baserel->serverid) && + baserel->fdwroutine && + memcmp(baserel->fdwroutine, + &pgstrom_arrow_fdw_routine, + sizeof(FdwRoutine)) == 0) + return true; + + return false; +} + +/* + * RelationIsArrowFdw + */ +bool +RelationIsArrowFdw(Relation frel) +{ + if (RelationGetForm(frel)->relkind == RELKIND_FOREIGN_TABLE) + { + FdwRoutine *routine = GetFdwRoutineForRelation(frel, false); + + if (memcmp(routine, &pgstrom_arrow_fdw_routine, + sizeof(FdwRoutine)) == 0) + return true; + } + return false; +} + +/* + * RecordBatchFieldCount + */ +static int +__RecordBatchFieldCount(RecordBatchFieldState *fstate) +{ + int j, count = 1; + + for (j=0; j < fstate->num_children; j++) + count += __RecordBatchFieldCount(&fstate->children[j]); + + return count; +} + +static int +RecordBatchFieldCount(RecordBatchState *rbstate) +{ + int j, count = 0; + + for (j=0; j < rbstate->ncols; j++) + count += __RecordBatchFieldCount(&rbstate->columns[j]); + + return count; +} + +/* + * RecordBatchFieldLength + */ +static size_t +RecordBatchFieldLength(RecordBatchFieldState *fstate) +{ + size_t len; + int j; + + len = BLCKALIGN(fstate->nullmap_length + + fstate->values_length + + fstate->extra_length); + for (j=0; j < fstate->num_children; j++) + len += RecordBatchFieldLength(&fstate->children[j]); + return len; +} + +/* + * ArrowGetForeignRelSize + */ +static void +ArrowGetForeignRelSize(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid) +{ + ForeignTable *ft = GetForeignTable(foreigntableid); + List *filesList; + Size filesSizeTotal = 0; + Bitmapset *referenced = NULL; + BlockNumber npages = 0; + double ntuples = 0.0; + ListCell *lc; + int parallel_nworkers; + bool writable; + Bitmapset *optimal_gpus = (void *)(~0UL); + int j, k; + + /* columns to be fetched */ + foreach (lc, baserel->baserestrictinfo) + { + RestrictInfo *rinfo = lfirst(lc); + + pull_varattnos((Node *)rinfo->clause, baserel->relid, &referenced); + } + referenced = pgstrom_pullup_outer_refs(root, baserel, referenced); + + filesList = __arrowFdwExtractFilesList(ft->options, + &parallel_nworkers, + &writable); + foreach (lc, filesList) + { + char *fname = strVal(lfirst(lc)); + File fdesc; + List *rb_cached; + ListCell *cell; + Bitmapset *__gpus; + size_t len = 0; + + fdesc = PathNameOpenFile(fname, O_RDONLY | PG_BINARY); + if (fdesc < 0) + { + if (writable && errno == ENOENT) + continue; + elog(ERROR, "failed to open file '%s' on behalf of '%s'", + fname, get_rel_name(foreigntableid)); + } + /* lookup optimal GPUs */ + __gpus = extraSysfsLookupOptimalGpus(fdesc); + if (optimal_gpus == (void *)(~0UL)) + optimal_gpus = __gpus; + else + optimal_gpus = bms_intersect(optimal_gpus, __gpus); + /* lookup or build metadata cache */ + rb_cached = arrowLookupOrBuildMetadataCache(fdesc, NULL); + foreach (cell, rb_cached) + { + RecordBatchState *rb_state = lfirst(cell); + + if (cell == list_head(rb_cached)) + filesSizeTotal += BLCKALIGN(rb_state->stat_buf.st_size); + + if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, referenced)) + { + for (j=0; j < rb_state->ncols; j++) + len += RecordBatchFieldLength(&rb_state->columns[j]); + } + else + { + for (k = 
bms_next_member(referenced, -1); + k >= 0; + k = bms_next_member(referenced, k)) + { + j = k + FirstLowInvalidHeapAttributeNumber; + if (j < 0 || j >= rb_state->ncols) + continue; + len += RecordBatchFieldLength(&rb_state->columns[j]); + } + } + ntuples += rb_state->rb_nitems; + } + npages = len / BLCKSZ; + FileClose(fdesc); + } + bms_free(referenced); + + if (optimal_gpus == (void *)(~0UL) || + filesSizeTotal < pgstrom_gpudirect_threshold()) + optimal_gpus = NULL; + + baserel->rel_parallel_workers = parallel_nworkers; + baserel->fdw_private = list_make1(optimal_gpus); + baserel->pages = npages; + baserel->tuples = ntuples; + baserel->rows = ntuples * + clauselist_selectivity(root, + baserel->baserestrictinfo, + 0, + JOIN_INNER, + NULL); +} + +/* + * GetOptimalGpusForArrowFdw + * + * optimal GPUs bitmap is saved at baserel->fdw_private + */ +Bitmapset * +GetOptimalGpusForArrowFdw(PlannerInfo *root, RelOptInfo *baserel) +{ + if (baserel->fdw_private == NIL) + { + RangeTblEntry *rte = root->simple_rte_array[baserel->relid]; + + ArrowGetForeignRelSize(root, baserel, rte->relid); + } + return linitial(baserel->fdw_private); +} + +static void +cost_arrow_fdw_seqscan(Path *path, + PlannerInfo *root, + RelOptInfo *baserel, + ParamPathInfo *param_info, + int num_workers) +{ + Cost startup_cost = 0.0; + Cost disk_run_cost = 0.0; + Cost cpu_run_cost = 0.0; + QualCost qcost; + double nrows; + double spc_seq_page_cost; + + if (param_info) + nrows = param_info->ppi_rows; + else + nrows = baserel->rows; + + /* arrow_fdw.enabled */ + if (!arrow_fdw_enabled) + startup_cost += disable_cost; + + /* + * Storage costs + * + * XXX - smaller number of columns to read shall have less disk cost + * because of columnar format. Right now, we don't discount cost for + * the pages not to be read. + */ + get_tablespace_page_costs(baserel->reltablespace, + NULL, + &spc_seq_page_cost); + disk_run_cost = spc_seq_page_cost * baserel->pages; + + /* CPU costs */ + if (param_info) + { + cost_qual_eval(&qcost, param_info->ppi_clauses, root); + qcost.startup += baserel->baserestrictcost.startup; + qcost.per_tuple += baserel->baserestrictcost.per_tuple; + } + else + qcost = baserel->baserestrictcost; + startup_cost += qcost.startup; + cpu_run_cost = (cpu_tuple_cost + qcost.per_tuple) * baserel->tuples; + + /* tlist evaluation costs */ + startup_cost += path->pathtarget->cost.startup; + cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows; + + /* adjust cost for CPU parallelism */ + if (num_workers > 0) + { + double leader_contribution; + double parallel_divisor = (double) num_workers; + + /* see get_parallel_divisor() */ + leader_contribution = 1.0 - (0.3 * (double)num_workers); + parallel_divisor += Max(leader_contribution, 0.0); + + /* The CPU cost is divided among all the workers. 
*/ + cpu_run_cost /= parallel_divisor; + + /* Estimated row count per background worker process */ + nrows = clamp_row_est(nrows / parallel_divisor); + } + path->rows = nrows; + path->startup_cost = startup_cost; + path->total_cost = startup_cost + cpu_run_cost + disk_run_cost; + path->parallel_workers = num_workers; +} + +/* + * ArrowGetForeignPaths + */ +static void +ArrowGetForeignPaths(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid) +{ + ForeignPath *fpath; + ParamPathInfo *param_info; + Relids required_outer = baserel->lateral_relids; + + param_info = get_baserel_parampathinfo(root, baserel, required_outer); + + fpath = create_foreignscan_path(root, baserel, + NULL, /* default pathtarget */ + -1, /* dummy */ + -1.0, /* dummy */ + -1.0, /* dummy */ + NIL, /* no pathkeys */ + required_outer, + NULL, /* no extra plan */ + NIL); /* no particular private */ + cost_arrow_fdw_seqscan(&fpath->path, root, baserel, param_info, 0); + add_path(baserel, (Path *)fpath); + + if (baserel->consider_parallel) + { + int num_workers = + compute_parallel_worker(baserel, + baserel->pages, -1.0, + max_parallel_workers_per_gather); + if (num_workers == 0) + return; + + fpath = create_foreignscan_path(root, + baserel, + NULL, /* default pathtarget */ + -1, /* dummy */ + -1.0, /* dummy */ + -1.0, /* dummy */ + NIL, /* no pathkeys */ + required_outer, + NULL, /* no extra plan */ + NIL); /* no particular private */ + fpath->path.parallel_aware = true; + + cost_arrow_fdw_seqscan(&fpath->path, root, baserel, param_info, + num_workers); + add_partial_path(baserel, (Path *)fpath); + } +} + +/* + * ArrowGetForeignPlan + */ +static ForeignScan * +ArrowGetForeignPlan(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid, + ForeignPath *best_path, + List *tlist, + List *scan_clauses, + Plan *outer_plan) +{ + Bitmapset *referenced = NULL; + List *ref_list = NIL; + ListCell *lc; + int j, k; + + foreach (lc, baserel->baserestrictinfo) + { + RestrictInfo *rinfo = lfirst(lc); + + pull_varattnos((Node *)rinfo->clause, baserel->relid, &referenced); + } + referenced = pgstrom_pullup_outer_refs(root, baserel, referenced); + + for (k = bms_next_member(referenced, -1); + k >= 0; + k = bms_next_member(referenced, k)) + { + j = k + FirstLowInvalidHeapAttributeNumber; + ref_list = lappend_int(ref_list, j); + } + bms_free(referenced); + + return make_foreignscan(tlist, + extract_actual_clauses(scan_clauses, false), + baserel->relid, + NIL, /* no expressions to evaluate */ + ref_list, /* list of referenced attnums */ + NIL, /* no custom tlist */ + NIL, /* no remote quals */ + outer_plan); +} + +/* ---------------------------------------------------------------- + * + * Routines related to min/max statistics and scan hint + * + * If mapped Apache Arrow files have custom-metadata of "min_values" and + * "max_values" at the Field, arrow_fdw treats these comma-separated + * integer values as min/max values for each field, if any. + * Once the min/max values of a field are known, we can skip record batches + * that cannot match the WHERE-clause. + * + * This min/max array is expected to have as many integer elements or nulls + * as there are record-batches. + * ---------------------------------------------------------------- + */ + +/* + * buildArrowStatsBinary + * + * It reconstructs binary min/max statistics per record-batch + * from the custom-metadata of ArrowField. 
+ */ +typedef struct arrowFieldStatsBinary +{ + uint32 nrooms; /* number of record-batches */ + int unitsz; /* unit size of min/max statistics */ + bool *isnull; + char *min_values; + char *max_values; + int nfields; /* if List/Struct data type */ + struct arrowFieldStatsBinary *subfields; +} arrowFieldStatsBinary; + +typedef struct +{ + int nitems; /* number of record-batches */ + int ncols; + arrowFieldStatsBinary columns[FLEXIBLE_ARRAY_MEMBER]; +} arrowStatsBinary; + +static void +__releaseArrowFieldStatsBinary(arrowFieldStatsBinary *bstats) +{ + int j; + + if (bstats->subfields) + { + for (j=0; j < bstats->nfields; j++) + __releaseArrowFieldStatsBinary(&bstats->subfields[j]); + pfree(bstats->subfields); + } + if (bstats->isnull) + pfree(bstats->isnull); + if (bstats->min_values) + pfree(bstats->min_values); + if (bstats->max_values) + pfree(bstats->max_values); +} + +static void +releaseArrowStatsBinary(arrowStatsBinary *arrow_bstats) +{ + int j; + + if (arrow_bstats) + { + for (j=0; j < arrow_bstats->ncols; j++) + __releaseArrowFieldStatsBinary(&arrow_bstats->columns[j]); + pfree(arrow_bstats); + } +} + +static int128_t +__atoi128(const char *tok, bool *p_isnull) +{ + int128_t ival = 0; + bool is_minus = false; + + if (*tok == '-') + { + is_minus = true; + tok++; + } + while (isdigit(*tok)) + { + ival = 10 * ival + (*tok - '0'); + tok++; + } + + if (*tok != '\0') + *p_isnull = true; + if (is_minus) + { + if (ival == 0) + *p_isnull = true; + ival = -ival; + } + return ival; +} + +static bool +__parseArrowFieldStatsBinary(arrowFieldStatsBinary *bstats, + ArrowField *field, + const char *min_tokens, + const char *max_tokens) +{ + int unitsz = -1; + char *min_buffer; + char *max_buffer; + char *min_values = NULL; + char *max_values = NULL; + bool *isnull = NULL; + char *tok1, *pos1; + char *tok2, *pos2; + uint32 index; + + /* determine the unitsz of datum */ + switch (field->type.node.tag) + { + case ArrowNodeTag__Int: + switch (field->type.Int.bitWidth) + { + case 8: + unitsz = sizeof(uint8_t); + break; + case 16: + unitsz = sizeof(uint16_t); + break; + case 32: + unitsz = sizeof(uint32_t); + break; + case 64: + unitsz = sizeof(uint64_t); + break; + default: + return false; + } + break; + + case ArrowNodeTag__FloatingPoint: + switch (field->type.FloatingPoint.precision) + { + case ArrowPrecision__Half: + unitsz = sizeof(uint16_t); + break; + case ArrowPrecision__Single: + unitsz = sizeof(uint32_t); + break; + case ArrowPrecision__Double: + unitsz = sizeof(uint64_t); + break; + default: + return false; + } + break; + + case ArrowNodeTag__Decimal: + unitsz = sizeof(int128_t); + break; + + case ArrowNodeTag__Date: + switch (field->type.Date.unit) + { + case ArrowDateUnit__Day: + unitsz = sizeof(uint32_t); + break; + case ArrowDateUnit__MilliSecond: + unitsz = sizeof(uint64_t); + break; + default: + return false; + } + break; + + case ArrowNodeTag__Time: + switch (field->type.Time.unit) + { + case ArrowTimeUnit__Second: + case ArrowTimeUnit__MilliSecond: + unitsz = sizeof(uint32_t); + break; + case ArrowTimeUnit__MicroSecond: + case ArrowTimeUnit__NanoSecond: + unitsz = sizeof(uint64_t); + break; + default: + return false; + } + break; + + case ArrowNodeTag__Timestamp: + switch (field->type.Timestamp.unit) + { + case ArrowTimeUnit__Second: + case ArrowTimeUnit__MilliSecond: + case ArrowTimeUnit__MicroSecond: + case ArrowTimeUnit__NanoSecond: + unitsz = sizeof(uint64_t); + break; + default: + return false; + } + break; + default: + return false; + } + Assert(unitsz > 0); + /* parse the 
min_tokens/max_tokens */ + min_buffer = alloca(strlen(min_tokens) + 1); + max_buffer = alloca(strlen(max_tokens) + 1); + strcpy(min_buffer, min_tokens); + strcpy(max_buffer, max_tokens); + + min_values = palloc0(unitsz * bstats->nrooms); + max_values = palloc0(unitsz * bstats->nrooms); + isnull = palloc0(sizeof(bool) * bstats->nrooms); + for (tok1 = strtok_r(min_buffer, ",", &pos1), + tok2 = strtok_r(max_buffer, ",", &pos2), index = 0; + tok1 != NULL && tok2 != NULL && index < bstats->nrooms; + tok1 = strtok_r(NULL, ",", &pos1), + tok2 = strtok_r(NULL, ",", &pos2), index++) + { + bool __isnull = false; + int128_t __min = __atoi128(__trim(tok1), &__isnull); + int128_t __max = __atoi128(__trim(tok2), &__isnull); + + if (__isnull) + isnull[index] = true; + else + { + memcpy(min_values + unitsz * index, &__min, unitsz); + memcpy(max_values + unitsz * index, &__max, unitsz); + } + } + /* sanity checks */ + if (!tok1 && !tok2 && index == bstats->nrooms) + { + bstats->unitsz = unitsz; + bstats->isnull = isnull; + bstats->min_values = min_values; + bstats->max_values = max_values; + return true; + } + /* elsewhere, something wrong */ + pfree(min_values); + pfree(max_values); + pfree(isnull); + return false; +} + +static bool +__buildArrowFieldStatsBinary(arrowFieldStatsBinary *bstats, + ArrowField *field, + uint32 numRecordBatches) +{ + const char *min_tokens = NULL; + const char *max_tokens = NULL; + int j, k; + bool retval = false; + + for (k=0; k < field->_num_custom_metadata; k++) + { + ArrowKeyValue *kv = &field->custom_metadata[k]; + + if (strcmp(kv->key, "min_values") == 0) + min_tokens = kv->value; + else if (strcmp(kv->key, "max_values") == 0) + max_tokens = kv->value; + } + + bstats->nrooms = numRecordBatches; + bstats->unitsz = -1; + if (min_tokens && max_tokens) + { + if (__parseArrowFieldStatsBinary(bstats, field, + min_tokens, + max_tokens)) + { + retval = true; + } + else + { + /* parse error, ignore the stat */ + if (bstats->isnull) + pfree(bstats->isnull); + if (bstats->min_values) + pfree(bstats->min_values); + if (bstats->max_values) + pfree(bstats->max_values); + bstats->unitsz = -1; + bstats->isnull = NULL; + bstats->min_values = NULL; + bstats->max_values = NULL; + } + } + + if (field->_num_children > 0) + { + bstats->nfields = field->_num_children; + bstats->subfields = palloc0(sizeof(arrowFieldStatsBinary) * bstats->nfields); + for (j=0; j < bstats->nfields; j++) + { + if (__buildArrowFieldStatsBinary(&bstats->subfields[j], + &field->children[j], + numRecordBatches)) + retval = true; + } + } + return retval; +} + +static arrowStatsBinary * +buildArrowStatsBinary(const ArrowFooter *footer, Bitmapset **p_stat_attrs) +{ + arrowStatsBinary *arrow_bstats; + int j, ncols = footer->schema._num_fields; + bool found = false; + + arrow_bstats = palloc0(offsetof(arrowStatsBinary, + columns[ncols])); + arrow_bstats->nitems = footer->_num_recordBatches; + arrow_bstats->ncols = ncols; + for (j=0; j < ncols; j++) + { + if (__buildArrowFieldStatsBinary(&arrow_bstats->columns[j], + &footer->schema.fields[j], + footer->_num_recordBatches)) + { + if (p_stat_attrs) + *p_stat_attrs = bms_add_member(*p_stat_attrs, j+1); + found = true; + } + } + if (!found) + { + releaseArrowStatsBinary(arrow_bstats); + return NULL; + } + return arrow_bstats; +} + +/* + * applyArrowStatsBinary + * + * It applies the fetched min/max values on the cached record-batch metadata + */ +static void +__applyArrowFieldStatsBinary(RecordBatchFieldState *fstate, + arrowFieldStatsBinary *bstats, + int rb_index) +{ + int j; 
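
__parseArrowFieldStatsBinary above walks the "min_values" and "max_values" token lists in lock-step with strtok_r, one token per record-batch, and accepts the result only when both lists are exhausted exactly at nrooms. A stripped-down sketch of that loop (parse_stats is a hypothetical helper; atol stands in for __atoi128):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int parse_stats(const char *min_tokens, const char *max_tokens,
                           long *min_vals, long *max_vals, int nrooms)
    {
        char *min_buf = strdup(min_tokens);
        char *max_buf = strdup(max_tokens);
        char *tok1, *pos1;
        char *tok2, *pos2;
        int   index = 0;
        int   ok;

        for (tok1 = strtok_r(min_buf, ",", &pos1),
             tok2 = strtok_r(max_buf, ",", &pos2);
             tok1 != NULL && tok2 != NULL && index < nrooms;
             tok1 = strtok_r(NULL, ",", &pos1),
             tok2 = strtok_r(NULL, ",", &pos2), index++)
        {
            min_vals[index] = atol(tok1);
            max_vals[index] = atol(tok2);
        }
        /* valid only if both lists carried exactly one token per record-batch */
        ok = (tok1 == NULL && tok2 == NULL && index == nrooms);
        free(min_buf);
        free(max_buf);
        return ok;
    }

    int main(void)
    {
        long mins[3], maxs[3];

        if (parse_stats("1,5,9", "4,8,12", mins, maxs, 3))
            printf("rb0: min=%ld max=%ld\n", mins[0], maxs[0]);
        return 0;
    }
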
+ + if (bstats->unitsz > 0 && + bstats->isnull != NULL && + bstats->min_values != NULL && + bstats->max_values != NULL) + { + size_t off = bstats->unitsz * rb_index; + + memcpy(&fstate->stat_min, + bstats->min_values + off, bstats->unitsz); + memcpy(&fstate->stat_max, + bstats->max_values + off, bstats->unitsz); + fstate->stat_isnull = false; + } + else + { + memset(&fstate->stat_min, 0, sizeof(SQLstat__datum)); + memset(&fstate->stat_max, 0, sizeof(SQLstat__datum)); + fstate->stat_isnull = true; + } + + Assert(fstate->num_children == bstats->nfields); + for (j=0; j < fstate->num_children; j++) + { + RecordBatchFieldState *__fstate = &fstate->children[j]; + arrowFieldStatsBinary *__bstats = &bstats->subfields[j]; + + __applyArrowFieldStatsBinary(__fstate, __bstats, rb_index); + } +} + +static void +applyArrowStatsBinary(RecordBatchState *rb_state, arrowStatsBinary *arrow_bstats) +{ + int j, ncols = rb_state->ncols; + + Assert(rb_state->ncols == arrow_bstats->ncols && + rb_state->rb_index < arrow_bstats->nitems); + for (j=0; j < ncols; j++) + { + RecordBatchFieldState *fstate = &rb_state->columns[j]; + arrowFieldStatsBinary *bstats = &arrow_bstats->columns[j]; + + __applyArrowFieldStatsBinary(fstate, bstats, rb_state->rb_index); + } +} + +static SQLstat * +__buildArrowFieldStatsList(ArrowField *field, uint32 numRecordBatches) +{ + const char *min_tokens = NULL; + const char *max_tokens = NULL; + char *min_buffer; + char *max_buffer; + char *tok1, *pos1; + char *tok2, *pos2; + SQLstat *results = NULL; + int k, index; + + for (k=0; k < field->_num_custom_metadata; k++) + { + ArrowKeyValue *kv = &field->custom_metadata[k]; + + if (strcmp(kv->key, "min_values") == 0) + min_tokens = kv->value; + else if (strcmp(kv->key, "max_values") == 0) + max_tokens = kv->value; + } + if (!min_tokens || !max_tokens) + return NULL; + min_buffer = alloca(strlen(min_tokens) + 1); + max_buffer = alloca(strlen(max_tokens) + 1); + strcpy(min_buffer, min_tokens); + strcpy(max_buffer, max_tokens); + + for (tok1 = strtok_r(min_buffer, ",", &pos1), + tok2 = strtok_r(max_buffer, ",", &pos2), index = 0; + tok1 && tok2; + tok1 = strtok_r(NULL, ",", &pos1), + tok2 = strtok_r(NULL, ",", &pos2), index++) + { + bool __isnull = false; + int128_t __min = __atoi128(__trim(tok1), &__isnull); + int128_t __max = __atoi128(__trim(tok2), &__isnull); + + if (!__isnull) + { + SQLstat *item = palloc0(sizeof(SQLstat)); + + item->next = results; + item->rb_index = index; + item->is_valid = true; + item->min.i128 = __min; + item->max.i128 = __max; + results = item; + } + } + /* sanity checks */ + if (!tok1 && !tok2 && index == numRecordBatches) + return results; + /* ah, error... */ + while (results) + { + SQLstat *next = results->next; + + pfree(results); + results = next; + } + return NULL; +} + +/* + * execInitArrowStatsHint / execCheckArrowStatsHint / execEndArrowStatsHint + * + * ... are executor routines for min/max statistics. + */ +static bool +__buildArrowStatsOper(arrowStatsHint *arange, + ScanState *ss, + OpExpr *op, + bool reverse) +{ + Index scanrelid = ((Scan *)ss->ps.plan)->scanrelid; + Oid opcode; + Var *var; + Node *arg; + Expr *expr; + Oid opfamily = InvalidOid; + StrategyNumber strategy = InvalidStrategy; + CatCList *catlist; + int i; + + if (!reverse) + { + opcode = op->opno; + var = linitial(op->args); + arg = lsecond(op->args); + } + else + { + opcode = get_commutator(op->opno); + var = lsecond(op->args); + arg = linitial(op->args); + } + /* Is it VAR ARG form? 
*/ + if (!IsA(var, Var) || var->varno != scanrelid) + return false; + if (!bms_is_member(var->varattno, arange->stat_attrs)) + return false; + if (contain_var_clause(arg) || + contain_volatile_functions(arg)) + return false; + + catlist = SearchSysCacheList1(AMOPOPID, ObjectIdGetDatum(opcode)); + for (i=0; i < catlist->n_members; i++) + { + HeapTuple tuple = &catlist->members[i]->tuple; + Form_pg_amop amop = (Form_pg_amop) GETSTRUCT(tuple); + + if (amop->amopmethod == BRIN_AM_OID) + { + opfamily = amop->amopfamily; + strategy = amop->amopstrategy; + break; + } + } + ReleaseSysCacheList(catlist); + + if (strategy == BTLessStrategyNumber || + strategy == BTLessEqualStrategyNumber) + { + /* (VAR < ARG) --> (Min < ARG) */ + /* (VAR <= ARG) --> (Min <= ARG) */ + arange->load_attrs = bms_add_member(arange->load_attrs, + var->varattno); + expr = make_opclause(opcode, + op->opresulttype, + op->opretset, + (Expr *)makeVar(INNER_VAR, + var->varattno, + var->vartype, + var->vartypmod, + var->varcollid, + 0), + (Expr *)copyObject(arg), + op->opcollid, + op->inputcollid); + set_opfuncid((OpExpr *)expr); + arange->eval_quals = lappend(arange->eval_quals, expr); + } + else if (strategy == BTGreaterEqualStrategyNumber || + strategy == BTGreaterStrategyNumber) + { + /* (VAR >= ARG) --> (Max >= ARG) */ + /* (VAR > ARG) --> (Max > ARG) */ + arange->load_attrs = bms_add_member(arange->load_attrs, + var->varattno); + expr = make_opclause(opcode, + op->opresulttype, + op->opretset, + (Expr *)makeVar(OUTER_VAR, + var->varattno, + var->vartype, + var->vartypmod, + var->varcollid, + 0), + (Expr *)copyObject(arg), + op->opcollid, + op->inputcollid); + set_opfuncid((OpExpr *)expr); + arange->eval_quals = lappend(arange->eval_quals, expr); + } + else if (strategy == BTEqualStrategyNumber) + { + /* (VAR = ARG) --> (Max >= ARG && Min <= ARG) */ + opcode = get_opfamily_member(opfamily, var->vartype, + exprType((Node *)arg), + BTGreaterEqualStrategyNumber); + expr = make_opclause(opcode, + op->opresulttype, + op->opretset, + (Expr *)makeVar(OUTER_VAR, + var->varattno, + var->vartype, + var->vartypmod, + var->varcollid, + 0), + (Expr *)copyObject(arg), + op->opcollid, + op->inputcollid); + set_opfuncid((OpExpr *)expr); + arange->eval_quals = lappend(arange->eval_quals, expr); + + opcode = get_opfamily_member(opfamily, var->vartype, + exprType((Node *)arg), + BTLessEqualStrategyNumber); + expr = make_opclause(opcode, + op->opresulttype, + op->opretset, + (Expr *)makeVar(INNER_VAR, + var->varattno, + var->vartype, + var->vartypmod, + var->varcollid, + 0), + (Expr *)copyObject(arg), + op->opcollid, + op->inputcollid); + set_opfuncid((OpExpr *)expr); + arange->eval_quals = lappend(arange->eval_quals, expr); + } + else + { + return false; + } + arange->load_attrs = bms_add_member(arange->load_attrs, + var->varattno); + return true; +} + +static arrowStatsHint * +execInitArrowStatsHint(ScanState *ss, + Bitmapset *stat_attrs, + List *outer_quals) +{ + Relation relation = ss->ss_currentRelation; + TupleDesc tupdesc = RelationGetDescr(relation); + ExprContext *econtext; + arrowStatsHint *result, temp; + Expr *eval_expr; + ListCell *lc; + + memset(&temp, 0, sizeof(arrowStatsHint)); + temp.stat_attrs = stat_attrs; + foreach (lc, outer_quals) + { + OpExpr *op = lfirst(lc); + + if (IsA(op, OpExpr) && list_length(op->args) == 2 && + (__buildArrowStatsOper(&temp, ss, op, false) || + __buildArrowStatsOper(&temp, ss, op, true))) + { + temp.orig_quals = lappend(temp.orig_quals, copyObject(op)); + } + } + if (!temp.orig_quals) + return 
NULL; + + Assert(list_length(temp.eval_quals) > 0); + if (list_length(temp.eval_quals) == 1) + eval_expr = linitial(temp.eval_quals); + else + eval_expr = make_andclause(temp.eval_quals); + + econtext = CreateExprContext(ss->ps.state); + econtext->ecxt_innertuple = MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual); + econtext->ecxt_outertuple = MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual); + + result = palloc0(sizeof(arrowStatsHint)); + result->orig_quals = temp.orig_quals; + result->eval_quals = temp.eval_quals; + result->eval_state = ExecInitExpr(eval_expr, &ss->ps); + result->stat_attrs = bms_copy(stat_attrs); + result->load_attrs = temp.load_attrs; + result->econtext = econtext; + + return result; +} + +static bool +__fetchArrowStatsDatum(RecordBatchFieldState *fstate, + SQLstat__datum *sval, + Datum *p_datum, bool *p_isnull) +{ + Datum datum; + int64 shift; + + switch (fstate->atttypid) + { + case INT1OID: + datum = Int8GetDatum(sval->i8); + break; + case INT2OID: + case FLOAT2OID: + datum = Int16GetDatum(sval->i16); + break; + case INT4OID: + case FLOAT4OID: + datum = Int32GetDatum(sval->i32); + break; + case INT8OID: + case FLOAT8OID: + datum = Int64GetDatum(sval->i64); + break; + case NUMERICOID: + { + Int128_t decimal; + int dscale = fstate->attopts.decimal.scale; + char *result = palloc0(sizeof(struct NumericData)); + + decimal.ival = sval->i128; + while (dscale > 0 && decimal.ival % 10 == 0) + { + decimal.ival /= 10; + dscale--; + } + pg_numeric_to_varlena(result, dscale, decimal); + + datum = PointerGetDatum(result); + } + break; + case DATEOID: + shift = POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE; + switch (fstate->attopts.date.unit) + { + case ArrowDateUnit__Day: + datum = DateADTGetDatum((DateADT)sval->i32 - shift); + break; + case ArrowDateUnit__MilliSecond: + datum = DateADTGetDatum((DateADT)sval->i64 / 1000L - shift); + break; + default: + return false; + } + break; + + case TIMEOID: + switch (fstate->attopts.time.unit) + { + case ArrowTimeUnit__Second: + datum = TimeADTGetDatum((TimeADT)sval->u32 * 1000000L); + break; + case ArrowTimeUnit__MilliSecond: + datum = TimeADTGetDatum((TimeADT)sval->u32 * 1000L); + break; + case ArrowTimeUnit__MicroSecond: + datum = TimeADTGetDatum((TimeADT)sval->u64); + break; + case ArrowTimeUnit__NanoSecond: + datum = TimeADTGetDatum((TimeADT)sval->u64 / 1000L); + break; + default: + return false; + } + break; + case TIMESTAMPOID: + case TIMESTAMPTZOID: + shift = (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * USECS_PER_DAY; + switch (fstate->attopts.timestamp.unit) + { + case ArrowTimeUnit__Second: + datum = TimestampGetDatum((Timestamp)sval->i64 * 1000000L - shift); + break; + case ArrowTimeUnit__MilliSecond: + datum = TimestampGetDatum((Timestamp)sval->i64 * 1000L - shift); + break; + case ArrowTimeUnit__MicroSecond: + datum = TimestampGetDatum((Timestamp)sval->i64 - shift); + break; + case ArrowTimeUnit__NanoSecond: + datum = TimestampGetDatum((Timestamp)sval->i64 / 1000L - shift); + break; + default: + return false; + } + break; + default: + return false; + } + *p_datum = datum; + *p_isnull = false; + return true; +} + +static bool +execCheckArrowStatsHint(arrowStatsHint *stats_hint, + RecordBatchState *rb_state) +{ + ExprContext *econtext = stats_hint->econtext; + TupleTableSlot *min_values = econtext->ecxt_innertuple; + TupleTableSlot *max_values = econtext->ecxt_outertuple; + int anum; + Datum datum; + bool isnull; + + /* load the min/max statistics */ + ExecStoreAllNullTuple(min_values); + ExecStoreAllNullTuple(max_values); + for 
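
__buildArrowStatsOper above rewrites a qual on VAR into a qual on the per-batch minimum/maximum: VAR < ARG becomes Min < ARG, VAR > ARG becomes Max > ARG, and VAR = ARG becomes Min <= ARG AND Max >= ARG. The integer-only sketch below (hypothetical BatchStats type, not part of the patch) shows the equality case that lets a whole record-batch be skipped:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct { long min; long max; } BatchStats;

    /* VAR = ARG may only match when ARG falls into [min, max] */
    static bool batch_may_match_eq(const BatchStats *st, long arg)
    {
        return st->min <= arg && st->max >= arg;
    }

    int main(void)
    {
        BatchStats rb = { .min = 100, .max = 199 };

        /* WHERE x = 250 -> prints 0, so this record-batch can be skipped */
        printf("may match: %d\n", batch_may_match_eq(&rb, 250));
        return 0;
    }
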
(anum = bms_next_member(stats_hint->load_attrs, -1); + anum >= 0; + anum = bms_next_member(stats_hint->load_attrs, anum)) + { + RecordBatchFieldState *fstate = &rb_state->columns[anum-1]; + + Assert(anum > 0 && anum <= rb_state->ncols); + /* + * In case when min/max statistics are missing, we cannot determine + * whether we can skip the current record-batch. + */ + if (fstate->stat_isnull) + return false; + + if (!__fetchArrowStatsDatum(fstate, &fstate->stat_min, + &min_values->tts_values[anum-1], + &min_values->tts_isnull[anum-1])) + return false; + + if (!__fetchArrowStatsDatum(fstate, &fstate->stat_max, + &max_values->tts_values[anum-1], + &max_values->tts_isnull[anum-1])) + return false; + } + datum = ExecEvalExprSwitchContext(stats_hint->eval_state, econtext, &isnull); + +// elog(INFO, "file [%s] rb_index=%u datum=%lu isnull=%d", +// FilePathName(rb_state->fdesc), rb_state->rb_index, datum, (int)isnull); + if (!isnull && DatumGetBool(datum)) + return true; + return false; +} + +static void +execEndArrowStatsHint(arrowStatsHint *stats_hint) +{ + ExprContext *econtext = stats_hint->econtext; + + ExecDropSingleTupleTableSlot(econtext->ecxt_innertuple); + ExecDropSingleTupleTableSlot(econtext->ecxt_outertuple); + econtext->ecxt_innertuple = NULL; + econtext->ecxt_outertuple = NULL; + + FreeExprContext(econtext, true); +} + +/* + * Routines to setup record-batches + */ +typedef struct +{ + ArrowBuffer *buffer_curr; + ArrowBuffer *buffer_tail; + ArrowFieldNode *fnode_curr; + ArrowFieldNode *fnode_tail; +} setupRecordBatchContext; + +static void +assignArrowTypeOptions(ArrowTypeOptions *attopts, const ArrowType *atype) +{ + memset(attopts, 0, sizeof(ArrowTypeOptions)); + switch (atype->node.tag) + { + case ArrowNodeTag__Decimal: + if (atype->Decimal.precision < SHRT_MIN || + atype->Decimal.precision > SHRT_MAX) + elog(ERROR, "Decimal precision is out of range"); + if (atype->Decimal.scale < SHRT_MIN || + atype->Decimal.scale > SHRT_MAX) + elog(ERROR, "Decimal scale is out of range"); + attopts->decimal.precision = atype->Decimal.precision; + attopts->decimal.scale = atype->Decimal.scale; + break; + case ArrowNodeTag__Date: + if (atype->Date.unit == ArrowDateUnit__Day || + atype->Date.unit == ArrowDateUnit__MilliSecond) + attopts->date.unit = atype->Date.unit; + else + elog(ERROR, "unknown unit of Date"); + break; + case ArrowNodeTag__Time: + if (atype->Time.unit == ArrowTimeUnit__Second || + atype->Time.unit == ArrowTimeUnit__MilliSecond || + atype->Time.unit == ArrowTimeUnit__MicroSecond || + atype->Time.unit == ArrowTimeUnit__NanoSecond) + attopts->time.unit = atype->Time.unit; + else + elog(ERROR, "unknown unit of Time"); + break; + case ArrowNodeTag__Timestamp: + if (atype->Timestamp.unit == ArrowTimeUnit__Second || + atype->Timestamp.unit == ArrowTimeUnit__MilliSecond || + atype->Timestamp.unit == ArrowTimeUnit__MicroSecond || + atype->Timestamp.unit == ArrowTimeUnit__NanoSecond) + attopts->timestamp.unit = atype->Timestamp.unit; + else + elog(ERROR, "unknown unit of Timestamp"); + break; + case ArrowNodeTag__Interval: + if (atype->Interval.unit == ArrowIntervalUnit__Year_Month || + atype->Interval.unit == ArrowIntervalUnit__Day_Time) + attopts->interval.unit = atype->Interval.unit; + else + elog(ERROR, "unknown unit of Interval"); + break; + case ArrowNodeTag__FixedSizeBinary: + attopts->fixed_size_binary.byteWidth = atype->FixedSizeBinary.byteWidth; + break; + default: + /* no extra attributes */ + break; + } +} + +static void +setupRecordBatchField(setupRecordBatchContext *con, + 
RecordBatchFieldState *fstate, + ArrowField *field, + int depth) +{ + ArrowBuffer *buffer_curr; + ArrowFieldNode *fnode; + + if (con->fnode_curr >= con->fnode_tail) + elog(ERROR, "RecordBatch has less ArrowFieldNode than expected"); + fnode = con->fnode_curr++; + fstate->atttypid = arrowTypeToPGTypeOid(field, &fstate->atttypmod); + fstate->nitems = fnode->length; + fstate->null_count = fnode->null_count; + fstate->stat_isnull = true; + + switch (field->type.node.tag) + { + case ArrowNodeTag__Int: + case ArrowNodeTag__FloatingPoint: + case ArrowNodeTag__Bool: + case ArrowNodeTag__Decimal: + case ArrowNodeTag__Date: + case ArrowNodeTag__Time: + case ArrowNodeTag__Timestamp: + case ArrowNodeTag__Interval: + case ArrowNodeTag__FixedSizeBinary: + /* fixed length values */ + if (con->buffer_curr + 2 > con->buffer_tail) + elog(ERROR, "RecordBatch has less buffers than expected"); + buffer_curr = con->buffer_curr++; + if (fstate->null_count > 0) + { + fstate->nullmap_offset = buffer_curr->offset; + fstate->nullmap_length = buffer_curr->length; + if (fstate->nullmap_length < BITMAPLEN(fstate->nitems)) + elog(ERROR, "nullmap length is smaller than expected"); + if ((fstate->nullmap_offset & (MAXIMUM_ALIGNOF - 1)) != 0) + elog(ERROR, "nullmap is not aligned well"); + } + buffer_curr = con->buffer_curr++; + fstate->values_offset = buffer_curr->offset; + fstate->values_length = buffer_curr->length; + if (fstate->values_length < arrowFieldLength(field,fstate->nitems)) + elog(ERROR, "values array is smaller than expected"); + if ((fstate->values_offset & (MAXIMUM_ALIGNOF - 1)) != 0) + elog(ERROR, "values array is not aligned well"); + break; + + case ArrowNodeTag__List: + if (field->_num_children != 1) + elog(ERROR, "Bug? List of arrow type is corrupted"); + if (depth > 0) + elog(ERROR, "nested array type is not supported"); + /* nullmap */ + if (con->buffer_curr + 1 > con->buffer_tail) + elog(ERROR, "RecordBatch has less buffers than expected"); + buffer_curr = con->buffer_curr++; + if (fstate->null_count > 0) + { + fstate->nullmap_offset = buffer_curr->offset; + fstate->nullmap_length = buffer_curr->length; + if (fstate->nullmap_length < BITMAPLEN(fstate->nitems)) + elog(ERROR, "nullmap length is smaller than expected"); + if ((fstate->nullmap_offset & (MAXIMUM_ALIGNOF - 1)) != 0) + elog(ERROR, "nullmap is not aligned well"); + } + /* offset values */ + buffer_curr = con->buffer_curr++; + fstate->values_offset = buffer_curr->offset; + fstate->values_length = buffer_curr->length; + if (fstate->values_length < arrowFieldLength(field,fstate->nitems)) + elog(ERROR, "offset array is smaller than expected"); + if ((fstate->values_offset & (MAXIMUM_ALIGNOF - 1)) != 0) + elog(ERROR, "offset array is not aligned well"); + /* setup array element */ + fstate->children = palloc0(sizeof(RecordBatchFieldState)); + setupRecordBatchField(con, + &fstate->children[0], + &field->children[0], + depth+1); + fstate->num_children = 1; + break; + + case ArrowNodeTag__Utf8: + case ArrowNodeTag__Binary: + /* variable length values */ + if (con->buffer_curr + 3 > con->buffer_tail) + elog(ERROR, "RecordBatch has less buffers than expected"); + buffer_curr = con->buffer_curr++; + if (fstate->null_count > 0) + { + fstate->nullmap_offset = buffer_curr->offset; + fstate->nullmap_length = buffer_curr->length; + if (fstate->nullmap_length < BITMAPLEN(fstate->nitems)) + elog(ERROR, "nullmap length is smaller than expected"); + if ((fstate->nullmap_offset & (MAXIMUM_ALIGNOF - 1)) != 0) + elog(ERROR, "nullmap is not aligned well"); + } + 
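
The sanity checks above mirror the Arrow buffer layout: fixed-width fields carry a validity bitmap plus a values buffer, while Utf8/Binary fields add a 32-bit offset buffer and an extra data buffer, and the bitmap needs at least one bit per row. A stand-alone sketch of the minimum lengths being checked, assuming the usual Arrow encoding of n strings by n+1 offsets:

    #include <stdint.h>
    #include <stdio.h>

    /* validity bitmap: one bit per row, rounded up to bytes (BITMAPLEN) */
    static size_t nullmap_min_length(int64_t nitems)
    {
        return (size_t)((nitems + 7) / 8);
    }

    /* Utf8/Binary: n values are bounded by n+1 uint32 offsets */
    static size_t utf8_offsets_min_length(int64_t nitems)
    {
        return sizeof(uint32_t) * (size_t)(nitems + 1);
    }

    int main(void)
    {
        printf("1000 rows: nullmap >= %zu bytes, utf8 offsets >= %zu bytes\n",
               nullmap_min_length(1000), utf8_offsets_min_length(1000));
        return 0;
    }
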
+ buffer_curr = con->buffer_curr++; + fstate->values_offset = buffer_curr->offset; + fstate->values_length = buffer_curr->length; + if (fstate->values_length < arrowFieldLength(field,fstate->nitems)) + elog(ERROR, "offset array is smaller than expected"); + if ((fstate->values_offset & (MAXIMUM_ALIGNOF - 1)) != 0) + elog(ERROR, "offset array is not aligned well (%lu %lu)", fstate->values_offset, fstate->values_length); + + buffer_curr = con->buffer_curr++; + fstate->extra_offset = buffer_curr->offset; + fstate->extra_length = buffer_curr->length; + if ((fstate->extra_offset & (MAXIMUM_ALIGNOF - 1)) != 0) + elog(ERROR, "extra buffer is not aligned well"); + break; + + case ArrowNodeTag__Struct: + if (depth > 0) + elog(ERROR, "nested composite type is not supported"); + /* only nullmap */ + if (con->buffer_curr + 1 > con->buffer_tail) + elog(ERROR, "RecordBatch has less buffers than expected"); + buffer_curr = con->buffer_curr++; + if (fstate->null_count > 0) + { + fstate->nullmap_offset = buffer_curr->offset; + fstate->nullmap_length = buffer_curr->length; + if (fstate->nullmap_length < BITMAPLEN(fstate->nitems)) + elog(ERROR, "nullmap length is smaller than expected"); + if ((fstate->nullmap_offset & (MAXIMUM_ALIGNOF - 1)) != 0) + elog(ERROR, "nullmap is not aligned well"); + } + + if (field->_num_children > 0) + { + int i; + + fstate->children = palloc0(sizeof(RecordBatchFieldState) * + field->_num_children); + for (i=0; i < field->_num_children; i++) + { + setupRecordBatchField(con, + &fstate->children[i], + &field->children[i], + depth+1); + } + } + fstate->num_children = field->_num_children; + break; + default: + elog(ERROR, "Bug? ArrowSchema contains unsupported types"); + } + /* assign extra attributes (precision, unitsz, ...) */ + assignArrowTypeOptions(&fstate->attopts, &field->type); +} + +static RecordBatchState * +makeRecordBatchState(ArrowSchema *schema, + ArrowBlock *block, + ArrowRecordBatch *rbatch) +{ + setupRecordBatchContext con; + RecordBatchState *result; + int j, ncols = schema->_num_fields; + + /* + * Right now, we have no support for compressed RecordBatches + */ + if (rbatch->compression) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("arrow_fdw: compressed record-batches are not supported"))); + + result = palloc0(offsetof(RecordBatchState, columns[ncols])); + result->ncols = ncols; + result->rb_offset = block->offset + block->metaDataLength; + result->rb_length = block->bodyLength; + result->rb_nitems = rbatch->length; + + memset(&con, 0, sizeof(setupRecordBatchContext)); + con.buffer_curr = rbatch->buffers; + con.buffer_tail = rbatch->buffers + rbatch->_num_buffers; + con.fnode_curr = rbatch->nodes; + con.fnode_tail = rbatch->nodes + rbatch->_num_nodes; + + for (j=0; j < ncols; j++) + { + RecordBatchFieldState *fstate = &result->columns[j]; + ArrowField *field = &schema->fields[j]; + + setupRecordBatchField(&con, fstate, field, 0); + } + if (con.buffer_curr != con.buffer_tail || + con.fnode_curr != con.fnode_tail) + elog(ERROR, "arrow_fdw: RecordBatch may have corruption."); + + return result; +} + +/* + * ExecInitArrowFdw + */ +ArrowFdwState * +ExecInitArrowFdw(ScanState *ss, + GpuContext *gcontext, + List *outer_quals, + Bitmapset *outer_refs) +{ + Relation relation = ss->ss_currentRelation; + TupleDesc tupdesc = RelationGetDescr(relation); + ForeignTable *ft = GetForeignTable(RelationGetRelid(relation)); + List *filesList = NIL; + List *fdescList = NIL; + List *gpuDirectFileDescList = NIL; + Bitmapset *referenced = NULL; + Bitmapset 
*stat_attrs = NULL; + bool whole_row_ref = false; + ArrowFdwState *af_state; + List *rb_state_list = NIL; + ListCell *lc; + bool writable; + int i, num_rbatches; + + Assert(RelationGetForm(relation)->relkind == RELKIND_FOREIGN_TABLE && + memcmp(GetFdwRoutineForRelation(relation, false), + &pgstrom_arrow_fdw_routine, sizeof(FdwRoutine)) == 0); + /* expand 'referenced' if it has whole-row reference */ + if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, outer_refs)) + whole_row_ref = true; + for (i=0; i < tupdesc->natts; i++) + { + Form_pg_attribute attr = tupleDescAttr(tupdesc, i); + int k = attr->attnum - FirstLowInvalidHeapAttributeNumber; + + if (attr->attisdropped) + continue; + if (whole_row_ref || bms_is_member(k, outer_refs)) + referenced = bms_add_member(referenced, k); + } + + filesList = __arrowFdwExtractFilesList(ft->options, + NULL, + &writable); + foreach (lc, filesList) + { + char *fname = strVal(lfirst(lc)); + File fdesc; + List *rb_cached = NIL; + ListCell *cell; + GPUDirectFileDesc *dfile = NULL; + + fdesc = PathNameOpenFile(fname, O_RDONLY | PG_BINARY); + if (fdesc < 0) + { + if (writable && errno == ENOENT) + continue; + elog(ERROR, "failed to open '%s' on behalf of '%s'", + fname, RelationGetRelationName(relation)); + } + fdescList = lappend_int(fdescList, fdesc); + + /* + * Open file for GPUDirect I/O + */ + if (gcontext) + { + dfile = palloc0(sizeof(GPUDirectFileDesc)); + + gpuDirectFileDescOpen(dfile, fdesc); + if (!trackRawFileDesc(gcontext, dfile, __FILE__, __LINE__)) + { + gpuDirectFileDescClose(dfile); + elog(ERROR, "out of memory"); + } + gpuDirectFileDescList = lappend(gpuDirectFileDescList, dfile); + } + + rb_cached = arrowLookupOrBuildMetadataCache(fdesc, &stat_attrs); + /* check schema compatibility */ + foreach (cell, rb_cached) + { + RecordBatchState *rb_state = lfirst(cell); + + if (!arrowSchemaCompatibilityCheck(tupdesc, rb_state)) + elog(ERROR, "arrow file '%s' on behalf of foreign table '%s' has incompatible schema definition", + fname, RelationGetRelationName(relation)); + /* GPUDirect I/O state, if any */ + rb_state->dfile = dfile; + } + rb_state_list = list_concat(rb_state_list, rb_cached); + } + num_rbatches = list_length(rb_state_list); + af_state = palloc0(offsetof(ArrowFdwState, rbatches[num_rbatches])); + af_state->gcontext = gcontext; + af_state->gpuDirectFileDescList = gpuDirectFileDescList; + af_state->fdescList = fdescList; + af_state->referenced = referenced; + if (arrow_fdw_stats_hint_enabled) + af_state->stats_hint = execInitArrowStatsHint(ss, stat_attrs, + outer_quals); + af_state->rbatch_index = &af_state->__rbatch_index_local; + af_state->rbatch_nload = &af_state->__rbatch_nload_local; + af_state->rbatch_nskip = &af_state->__rbatch_nskip_local; + i = 0; + foreach (lc, rb_state_list) + af_state->rbatches[i++] = (RecordBatchState *)lfirst(lc); + af_state->num_rbatches = num_rbatches; + + return af_state; +} + +/* + * ArrowBeginForeignScan + */ +static void +ArrowBeginForeignScan(ForeignScanState *node, int eflags) +{ + Relation relation = node->ss.ss_currentRelation; + TupleDesc tupdesc = RelationGetDescr(relation); + ForeignScan *fscan = (ForeignScan *) node->ss.ps.plan; + ListCell *lc; + Bitmapset *referenced = NULL; + + foreach (lc, fscan->fdw_private) + { + int j = lfirst_int(lc); + + if (j >= 0 && j <= tupdesc->natts) + referenced = bms_add_member(referenced, j - + FirstLowInvalidHeapAttributeNumber); + } + node->fdw_state = ExecInitArrowFdw(&node->ss, + NULL, + fscan->scan.plan.qual, + referenced); +} + +typedef struct +{ + 
off_t rb_offset; + off_t f_offset; + off_t m_offset; + cl_int io_index; + cl_int depth; + strom_io_chunk ioc[FLEXIBLE_ARRAY_MEMBER]; +} arrowFdwSetupIOContext; + +/* + * arrowFdwSetupIOvectorField + */ +static void +__setupIOvectorField(arrowFdwSetupIOContext *con, + off_t chunk_offset, + size_t chunk_length, + cl_uint *p_cmeta_offset, + cl_uint *p_cmeta_length) +{ + off_t f_pos = con->rb_offset + chunk_offset; + size_t __length = MAXALIGN(chunk_length); + + Assert((con->m_offset & (MAXIMUM_ALIGNOF - 1)) == 0); + + if (f_pos == con->f_offset) + { + /* good, buffer is fully continuous */ + *p_cmeta_offset = __kds_packed(con->m_offset); + *p_cmeta_length = __kds_packed(__length); + + con->m_offset += __length; + con->f_offset += __length; + } + else if (f_pos > con->f_offset && + (f_pos & ~PAGE_MASK) == (con->f_offset & ~PAGE_MASK) && + ((f_pos - con->f_offset) & (MAXIMUM_ALIGNOF-1)) == 0) + { + /* + * we can also consolidate the i/o of two chunks, if file position + * of the next chunk (f_pos) and the current file tail position + * (con->f_offset) locate within the same file page, and if gap bytes + * on the file does not break alignment. + */ + size_t __gap = (f_pos - con->f_offset); + + /* put gap bytes */ + Assert(__gap < PAGE_SIZE); + con->m_offset += __gap; + con->f_offset += __gap; + + *p_cmeta_offset = __kds_packed(con->m_offset); + *p_cmeta_length = __kds_packed(__length); + + con->m_offset += __length; + con->f_offset += __length; + } + else + { + /* + * Elsewhere, we have no chance to consolidate this chunk to + * the previous i/o-chunk. So, make a new i/o-chunk. + */ + off_t f_base = TYPEALIGN_DOWN(PAGE_SIZE, f_pos); + off_t f_tail; + off_t shift = f_pos - f_base; + strom_io_chunk *ioc; + + if (con->io_index < 0) + con->io_index = 0; /* no previous i/o chunks */ + else + { + ioc = &con->ioc[con->io_index++]; + + f_tail = TYPEALIGN(PAGE_SIZE, con->f_offset); + ioc->nr_pages = f_tail / PAGE_SIZE - ioc->fchunk_id; + con->m_offset += (f_tail - con->f_offset); //safety margin; + } + ioc = &con->ioc[con->io_index]; + /* adjust position if con->m_offset is not aligned well */ + if (con->m_offset + shift != MAXALIGN(con->m_offset + shift)) + con->m_offset = MAXALIGN(con->m_offset + shift) - shift; + ioc->m_offset = con->m_offset; + ioc->fchunk_id = f_base / PAGE_SIZE; + + *p_cmeta_offset = __kds_packed(con->m_offset + shift); + *p_cmeta_length = __kds_packed(__length); + + con->m_offset += shift + __length; + con->f_offset = f_pos + __length; + } +} + +static void +arrowFdwSetupIOvectorField(arrowFdwSetupIOContext *con, + RecordBatchFieldState *fstate, + kern_data_store *kds, + kern_colmeta *cmeta) +{ + //int index = cmeta - kds->colmeta; + + if (fstate->nullmap_length > 0) + { + Assert(fstate->null_count > 0); + __setupIOvectorField(con, + fstate->nullmap_offset, + fstate->nullmap_length, + &cmeta->nullmap_offset, + &cmeta->nullmap_length); + //elog(INFO, "D%d att[%d] nullmap=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, fstate->nullmap_offset, fstate->nullmap_length, con->m_offset, con->f_offset); + } + if (fstate->values_length > 0) + { + __setupIOvectorField(con, + fstate->values_offset, + fstate->values_length, + &cmeta->values_offset, + &cmeta->values_length); + //elog(INFO, "D%d att[%d] values=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, fstate->values_offset, fstate->values_length, con->m_offset, con->f_offset); + } + if (fstate->extra_length > 0) + { + __setupIOvectorField(con, + fstate->extra_offset, + fstate->extra_length, + &cmeta->extra_offset, + 
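
__setupIOvectorField above merges a chunk into the current i/o range either when it starts exactly at the current file tail, or when it starts later within the same page and the gap preserves MAXALIGN alignment; anything else opens a new i/o chunk. A stand-alone sketch of that decision, assuming PAGE_MASK == PAGE_SIZE - 1 and 8-byte alignment:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE   4096UL
    #define MAX_ALIGN   8UL

    static bool can_consolidate(uint64_t f_offset /* current tail */,
                                uint64_t f_pos    /* next chunk start */)
    {
        if (f_pos == f_offset)
            return true;                                      /* fully continuous */
        return f_pos > f_offset &&
               f_pos / PAGE_SIZE == f_offset / PAGE_SIZE &&   /* same page */
               (f_pos - f_offset) % MAX_ALIGN == 0;           /* aligned gap */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               can_consolidate(8192, 8192),      /* continuous     -> 1 */
               can_consolidate(8192, 8256),      /* same page, gap -> 1 */
               can_consolidate(8192, 16384));    /* next page      -> 0 */
        return 0;
    }

Fewer, larger chunks mean fewer GPUDirect/pread calls per record-batch, which is the whole point of the consolidation.
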
&cmeta->extra_length); + //elog(INFO, "D%d att[%d] extra=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, fstate->extra_offset, fstate->extra_length, con->m_offset, con->f_offset); + } + + /* nested sub-fields if composite types */ + if (cmeta->atttypkind == TYPE_KIND__ARRAY || + cmeta->atttypkind == TYPE_KIND__COMPOSITE) + { + kern_colmeta *subattr; + int j; + + Assert(fstate->num_children == cmeta->num_subattrs); + con->depth++; + for (j=0, subattr = &kds->colmeta[cmeta->idx_subattrs]; + j < cmeta->num_subattrs; + j++, subattr++) + { + RecordBatchFieldState *child = &fstate->children[j]; + + arrowFdwSetupIOvectorField(con, child, kds, subattr); + } + con->depth--; + } +} + +/* + * arrowFdwSetupIOvector + */ +static strom_io_vector * +arrowFdwSetupIOvector(kern_data_store *kds, + RecordBatchState *rb_state, + Bitmapset *referenced) +{ + arrowFdwSetupIOContext *con; + strom_io_vector *iovec = NULL; + int j, nr_chunks = 0; + + Assert(kds->nr_colmeta >= kds->ncols); + con = alloca(offsetof(arrowFdwSetupIOContext, + ioc[3 * kds->nr_colmeta])); + con->rb_offset = rb_state->rb_offset; + con->f_offset = ~0UL; /* invalid offset */ + con->m_offset = TYPEALIGN(PAGE_SIZE, KERN_DATA_STORE_HEAD_LENGTH(kds)); + con->io_index = -1; + for (j=0; j < kds->ncols; j++) + { + RecordBatchFieldState *fstate = &rb_state->columns[j]; + kern_colmeta *cmeta = &kds->colmeta[j]; + int attidx = j + 1 - FirstLowInvalidHeapAttributeNumber; + + if (referenced && bms_is_member(attidx, referenced)) + arrowFdwSetupIOvectorField(con, fstate, kds, cmeta); + else + cmeta->atttypkind = TYPE_KIND__NULL; /* unreferenced */ + } + if (con->io_index >= 0) + { + /* close the last I/O chunks */ + strom_io_chunk *ioc = &con->ioc[con->io_index]; + + ioc->nr_pages = (TYPEALIGN(PAGE_SIZE, con->f_offset) / PAGE_SIZE - + ioc->fchunk_id); + con->m_offset = ioc->m_offset + PAGE_SIZE * ioc->nr_pages; + nr_chunks = con->io_index + 1; + } + kds->length = con->m_offset; + + iovec = palloc0(offsetof(strom_io_vector, ioc[nr_chunks])); + iovec->nr_chunks = nr_chunks; + if (nr_chunks > 0) + memcpy(iovec->ioc, con->ioc, sizeof(strom_io_chunk) * nr_chunks); + return iovec; +} + +/* + * __dump_kds_and_iovec - just for debug + */ +static inline void +__dump_kds_and_iovec(kern_data_store *kds, strom_io_vector *iovec) +{ +#if 0 + int j; + + elog(INFO, "nchunks = %d", iovec->nr_chunks); + for (j=0; j < iovec->nr_chunks; j++) + { + strom_io_chunk *ioc = &iovec->ioc[j]; + + elog(INFO, "io[%d] [ m_offset=%lu, f_read=%lu...%lu, nr_pages=%u}", + j, + ioc->m_offset, + ioc->fchunk_id * PAGE_SIZE, + (ioc->fchunk_id + ioc->nr_pages) * PAGE_SIZE, + ioc->nr_pages); + } + + elog(INFO, "kds {length=%zu nitems=%u typeid=%u typmod=%u table_oid=%u}", + kds->length, kds->nitems, + kds->tdtypeid, kds->tdtypmod, kds->table_oid); + for (j=0; j < kds->nr_colmeta; j++) + { + kern_colmeta *cmeta = &kds->colmeta[j]; + + elog(INFO, "%ccol[%d] nullmap=%lu,%lu values=%lu,%lu extra=%lu,%lu", + j < kds->ncols ? 
' ' : '*', j, + __kds_unpack(cmeta->nullmap_offset), + __kds_unpack(cmeta->nullmap_length), + __kds_unpack(cmeta->values_offset), + __kds_unpack(cmeta->values_length), + __kds_unpack(cmeta->extra_offset), + __kds_unpack(cmeta->extra_length)); + + } +#endif +} + +/* + * arrowFdwLoadRecordBatch + */ +static void +__arrowFdwAssignTypeOptions(kern_data_store *kds, + int base, int ncols, + RecordBatchFieldState *rb_fstate) +{ + int i; + + for (i=0; i < ncols; i++) + { + kern_colmeta *cmeta = &kds->colmeta[base+i]; + + cmeta->attopts = rb_fstate[i].attopts; + if (cmeta->atttypkind == TYPE_KIND__ARRAY) + { + Assert(cmeta->idx_subattrs >= kds->ncols && + cmeta->num_subattrs == 1 && + cmeta->idx_subattrs + cmeta->num_subattrs <= kds->nr_colmeta); + Assert(rb_fstate[i].num_children == 1); + __arrowFdwAssignTypeOptions(kds, + cmeta->idx_subattrs, + cmeta->num_subattrs, + rb_fstate[i].children); + } + else if (cmeta->atttypkind == TYPE_KIND__COMPOSITE) + { + Assert(cmeta->idx_subattrs >= kds->ncols && + cmeta->idx_subattrs + cmeta->num_subattrs <= kds->nr_colmeta); + Assert(rb_fstate[i].num_children == cmeta->num_subattrs); + __arrowFdwAssignTypeOptions(kds, + cmeta->idx_subattrs, + cmeta->num_subattrs, + rb_fstate[i].children); + } + } +} + +static pgstrom_data_store * +__arrowFdwLoadRecordBatch(RecordBatchState *rb_state, + Relation relation, + Bitmapset *referenced, + GpuContext *gcontext, + MemoryContext mcontext, + const Bitmapset *optimal_gpus) +{ + TupleDesc tupdesc = RelationGetDescr(relation); + pgstrom_data_store *pds; + kern_data_store *kds; + strom_io_vector *iovec; + size_t head_sz; + CUresult rc; + + /* setup KDS and I/O-vector */ + head_sz = KDS_calculateHeadSize(tupdesc); + kds = alloca(head_sz); + init_kernel_data_store(kds, tupdesc, 0, KDS_FORMAT_ARROW, 0); + kds->nitems = rb_state->rb_nitems; + kds->nrooms = rb_state->rb_nitems; + kds->table_oid = RelationGetRelid(relation); + Assert(head_sz == KERN_DATA_STORE_HEAD_LENGTH(kds)); + Assert(kds->ncols == rb_state->ncols); + __arrowFdwAssignTypeOptions(kds, 0, kds->ncols, rb_state->columns); + iovec = arrowFdwSetupIOvector(kds, rb_state, referenced); + __dump_kds_and_iovec(kds, iovec); + + /* + * If SSD-to-GPU Direct SQL is available on the arrow file, setup a small + * PDS on host-pinned memory, with strom_io_vector. 
+ */ + if (gcontext && + bms_is_member(gcontext->cuda_dindex, optimal_gpus) && + iovec->nr_chunks > 0 && + kds->length <= gpuMemAllocIOMapMaxLength() && + rb_state->dfile != NULL) + { + size_t iovec_sz = offsetof(strom_io_vector, ioc[iovec->nr_chunks]); + + rc = gpuMemAllocHost(gcontext, (void **)&pds, + offsetof(pgstrom_data_store, kds) + + head_sz + iovec_sz); + if (rc != CUDA_SUCCESS) + elog(ERROR, "failed on gpuMemAllocHost: %s", errorText(rc)); + + pds->gcontext = gcontext; + pg_atomic_init_u32(&pds->refcnt, 1); + pds->nblocks_uncached = 0; + memcpy(&pds->filedesc, rb_state->dfile, sizeof(GPUDirectFileDesc)); + pds->iovec = (strom_io_vector *)((char *)&pds->kds + head_sz); + memcpy(&pds->kds, kds, head_sz); + memcpy(pds->iovec, iovec, iovec_sz); + } + else + { + /* Otherwise, load the RecordBatch via the filesystem */ + int fdesc = FileGetRawDesc(rb_state->fdesc); + + if (gcontext) + { + rc = gpuMemAllocManaged(gcontext, + (CUdeviceptr *)&pds, + offsetof(pgstrom_data_store, + kds) + kds->length, + CU_MEM_ATTACH_GLOBAL); + if (rc != CUDA_SUCCESS) + elog(ERROR, "failed on gpuMemAllocManaged: %s", errorText(rc)); + } + else + { + pds = MemoryContextAllocHuge(mcontext, + offsetof(pgstrom_data_store, + kds) + kds->length); + } + __PDS_fillup_arrow(pds, gcontext, kds, fdesc, iovec); + } + pfree(iovec); + return pds; +} + +static pgstrom_data_store * +arrowFdwLoadRecordBatch(ArrowFdwState *af_state, + Relation relation, + EState *estate, + GpuContext *gcontext, + const Bitmapset *optimal_gpus) +{ + RecordBatchState *rb_state; + uint32 rb_index; + +retry: + /* fetch next RecordBatch */ + rb_index = pg_atomic_fetch_add_u32(af_state->rbatch_index, 1); + if (rb_index >= af_state->num_rbatches) + return NULL; /* no more RecordBatch to read */ + rb_state = af_state->rbatches[rb_index]; + + if (af_state->stats_hint) + { + if (execCheckArrowStatsHint(af_state->stats_hint, rb_state)) + pg_atomic_fetch_add_u32(af_state->rbatch_nload, 1); + else + { + pg_atomic_fetch_add_u32(af_state->rbatch_nskip, 1); + goto retry; + } + } + return __arrowFdwLoadRecordBatch(rb_state, + relation, + af_state->referenced, + gcontext, + estate->es_query_cxt, + optimal_gpus); +} + +/* + * ExecScanChunkArrowFdw + */ +pgstrom_data_store * +ExecScanChunkArrowFdw(GpuTaskState *gts) +{ + pgstrom_data_store *pds; + + InstrStartNode(&gts->outer_instrument); + pds = arrowFdwLoadRecordBatch(gts->af_state, + gts->css.ss.ss_currentRelation, + gts->css.ss.ps.state, + gts->gcontext, + gts->optimal_gpus); + InstrStopNode(&gts->outer_instrument, + !pds ? 
0.0 : (double)pds->kds.nitems); + return pds; +} + +/* + * ArrowIterateForeignScan + */ +static TupleTableSlot * +ArrowIterateForeignScan(ForeignScanState *node) +{ + ArrowFdwState *af_state = node->fdw_state; + Relation relation = node->ss.ss_currentRelation; + TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; + pgstrom_data_store *pds; + + while ((pds = af_state->curr_pds) == NULL || + af_state->curr_index >= pds->kds.nitems) + { + EState *estate = node->ss.ps.state; + + /* unload the previous RecordBatch, if any */ + if (pds) + PDS_release(pds); + af_state->curr_index = 0; + af_state->curr_pds = arrowFdwLoadRecordBatch(af_state, + relation, + estate, + NULL, + NULL); + if (!af_state->curr_pds) + return NULL; + } + Assert(pds && af_state->curr_index < pds->kds.nitems); + if (KDS_fetch_tuple_arrow(slot, &pds->kds, af_state->curr_index++)) + return slot; + return NULL; +} + +/* + * ArrowReScanForeignScan + */ +void +ExecReScanArrowFdw(ArrowFdwState *af_state) +{ + /* rewind the current scan state */ + pg_atomic_write_u32(af_state->rbatch_index, 0); + if (af_state->curr_pds) + PDS_release(af_state->curr_pds); + af_state->curr_pds = NULL; + af_state->curr_index = 0; +} + +static void +ArrowReScanForeignScan(ForeignScanState *node) +{ + ExecReScanArrowFdw((ArrowFdwState *)node->fdw_state); +} + +/* + * ArrowEndForeignScan + */ +void +ExecEndArrowFdw(ArrowFdwState *af_state) +{ + ListCell *lc; + + foreach (lc, af_state->fdescList) + FileClose((File)lfirst_int(lc)); + foreach (lc, af_state->gpuDirectFileDescList) + { + GPUDirectFileDesc *dfile = lfirst(lc); + + untrackRawFileDesc(af_state->gcontext, dfile); + gpuDirectFileDescClose(dfile); + } + if (af_state->stats_hint) + execEndArrowStatsHint(af_state->stats_hint); +} + +static void +ArrowEndForeignScan(ForeignScanState *node) +{ + ExecEndArrowFdw((ArrowFdwState *)node->fdw_state); +} + +/* + * ArrowExplainForeignScan + */ +void +ExplainArrowFdw(ArrowFdwState *af_state, + Relation frel, + ExplainState *es, + List *dcontext) +{ + TupleDesc tupdesc = RelationGetDescr(frel); + ListCell *lc; + int fcount = 0; + char label[80]; + size_t *chunk_sz = alloca(sizeof(size_t) * tupdesc->natts); + int i, j, k; + StringInfoData buf; + + /* shows referenced columns */ + initStringInfo(&buf); + for (k = bms_next_member(af_state->referenced, -1); + k >= 0; + k = bms_next_member(af_state->referenced, k)) + { + j = k + FirstLowInvalidHeapAttributeNumber - 1; + + if (j >= 0) + { + Form_pg_attribute attr = tupleDescAttr(tupdesc, j); + const char *attName = NameStr(attr->attname); + if (buf.len > 0) + appendStringInfoString(&buf, ", "); + appendStringInfoString(&buf, quote_identifier(attName)); + } + } + ExplainPropertyText("referenced", buf.data, es); + + /* shows stats hint if any */ + if (af_state->stats_hint) + { + arrowStatsHint *stats_hint = af_state->stats_hint; + + resetStringInfo(&buf); + + if (dcontext == NIL) + { + int anum; + + for (anum = bms_next_member(stats_hint->load_attrs, -1); + anum >= 0; + anum = bms_next_member(stats_hint->load_attrs, anum)) + { + Form_pg_attribute attr = tupleDescAttr(tupdesc, anum-1); + const char *attName = NameStr(attr->attname); + + if (buf.len > 0) + appendStringInfoString(&buf, ", "); + appendStringInfoString(&buf, quote_identifier(attName)); + } + } + else + { + ListCell *lc; + + foreach (lc, stats_hint->orig_quals) + { + Node *qual = lfirst(lc); + char *temp; + + temp = deparse_expression(qual, dcontext, es->verbose, false); + if (buf.len > 0) + appendStringInfoString(&buf, ", "); + appendStringInfoString(&buf, 
temp); + pfree(temp); + } + } + if (es->analyze) + appendStringInfo(&buf, " [loaded: %u, skipped: %u]", + pg_atomic_read_u32(af_state->rbatch_nload), + pg_atomic_read_u32(af_state->rbatch_nskip)); + ExplainPropertyText("Stats-Hint", buf.data, es); + } + + /* shows files on behalf of the foreign table */ + foreach (lc, af_state->fdescList) + { + File fdesc = (File)lfirst_int(lc); + const char *fname = FilePathName(fdesc); + int rbcount = 0; + size_t read_sz = 0; + char *pos = label; + struct stat st_buf; + + pos += snprintf(label, sizeof(label), "files%d", fcount++); + if (fstat(FileGetRawDesc(fdesc), &st_buf) != 0) + memset(&st_buf, 0, sizeof(struct stat)); + + /* size count per chunk */ + memset(chunk_sz, 0, sizeof(size_t) * tupdesc->natts); + for (i=0; i < af_state->num_rbatches; i++) + { + RecordBatchState *rb_state = af_state->rbatches[i]; + size_t sz; + + if (rb_state->fdesc != fdesc) + continue; + + for (k = bms_next_member(af_state->referenced, -1); + k >= 0; + k = bms_next_member(af_state->referenced, k)) + { + j = k + FirstLowInvalidHeapAttributeNumber - 1; + if (j < 0 || j >= tupdesc->natts) + continue; + sz = RecordBatchFieldLength(&rb_state->columns[j]); + read_sz += sz; + chunk_sz[j] += sz; + } + rbcount++; + } + + /* file size and read size */ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + resetStringInfo(&buf); + if (st_buf.st_size == 0) + appendStringInfoString(&buf, fname); + else + appendStringInfo(&buf, "%s (read: %s, size: %s)", + fname, + format_bytesz(read_sz), + format_bytesz(st_buf.st_size)); + ExplainPropertyText(label, buf.data, es); + } + else + { + ExplainPropertyText(label, fname, es); + + sprintf(pos, "-size"); + ExplainPropertyText(label, format_bytesz(st_buf.st_size), es); + + sprintf(pos, "-read"); + ExplainPropertyText(label, format_bytesz(read_sz), es); + } + + /* read-size per column (verbose mode only) */ + if (es->verbose && rbcount >= 0) + { + for (k = bms_next_member(af_state->referenced, -1); + k >= 0; + k = bms_next_member(af_state->referenced, k)) + { + Form_pg_attribute attr; + + j = k + FirstLowInvalidHeapAttributeNumber - 1; + if (j < 0 || j >= tupdesc->natts) + continue; + attr = tupleDescAttr(tupdesc, j); + snprintf(label, sizeof(label), + " %s", NameStr(attr->attname)); + ExplainPropertyText(label, format_bytesz(chunk_sz[j]), es); + } + } + } + pfree(buf.data); +} + +static void +ArrowExplainForeignScan(ForeignScanState *node, ExplainState *es) +{ + Relation frel = node->ss.ss_currentRelation; + + ExplainArrowFdw((ArrowFdwState *)node->fdw_state, frel, es, NIL); +} + +/* + * readArrowFile + */ +static bool +readArrowFile(const char *pathname, ArrowFileInfo *af_info, bool missing_ok) +{ + File filp = PathNameOpenFile(pathname, O_RDONLY | PG_BINARY); + + if (filp < 0) + { + if (missing_ok && errno == ENOENT) + return false; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", pathname))); + } + readArrowFileDesc(FileGetRawDesc(filp), af_info); + FileClose(filp); + return true; +} + +/* + * RecordBatchAcquireSampleRows - random sampling + */ +static int +RecordBatchAcquireSampleRows(Relation relation, + RecordBatchState *rb_state, + HeapTuple *rows, + int nsamples) +{ + TupleDesc tupdesc = RelationGetDescr(relation); + pgstrom_data_store *pds; + Bitmapset *referenced = NULL; + Datum *values; + bool *isnull; + int count; + int i, j, nwords; + + /* ANALYZE needs to fetch all the attributes */ + nwords = (tupdesc->natts - FirstLowInvalidHeapAttributeNumber + + BITS_PER_BITMAPWORD - 1) / BITS_PER_BITMAPWORD; + 
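+    /*
+     * The Bitmapset is built directly on the stack: nwords covers the
+     * system attributes too, and every word is set to -1 (all bits on),
+     * so the loader below treats all the columns as referenced.
+     */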
referenced = alloca(offsetof(Bitmapset, words[nwords])); + referenced->nwords = nwords; + memset(referenced->words, -1, sizeof(bitmapword) * nwords); + + pds = __arrowFdwLoadRecordBatch(rb_state, + relation, + referenced, + NULL, + CurrentMemoryContext, + NULL); + values = alloca(sizeof(Datum) * tupdesc->natts); + isnull = alloca(sizeof(bool) * tupdesc->natts); + for (count = 0; count < nsamples; count++) + { + /* fetch a row randomly */ + i = (double)pds->kds.nitems * drand48(); + Assert(i < pds->kds.nitems); + + for (j=0; j < pds->kds.ncols; j++) + { + kern_colmeta *cmeta = &pds->kds.colmeta[j]; + + pg_datum_arrow_ref(&pds->kds, + cmeta, + i, + values + j, + isnull + j); + } + rows[count] = heap_form_tuple(tupdesc, values, isnull); + } + PDS_release(pds); + + return count; +} + +/* + * ArrowAcquireSampleRows + */ +static int +ArrowAcquireSampleRows(Relation relation, + int elevel, + HeapTuple *rows, + int nrooms, + double *p_totalrows, + double *p_totaldeadrows) +{ + TupleDesc tupdesc = RelationGetDescr(relation); + ForeignTable *ft = GetForeignTable(RelationGetRelid(relation)); + List *filesList = NIL; + List *fdescList = NIL; + List *rb_state_list = NIL; + ListCell *lc; + bool writable; + int64 total_nrows = 0; + int64 count_nrows = 0; + int nsamples_min = nrooms / 100; + int nitems = 0; + + filesList = __arrowFdwExtractFilesList(ft->options, + NULL, + &writable); + foreach (lc, filesList) + { + char *fname = strVal(lfirst(lc)); + File fdesc; + List *rb_cached; + ListCell *cell; + + fdesc = PathNameOpenFile(fname, O_RDONLY | PG_BINARY); + if (fdesc < 0) + { + if (writable && errno == ENOENT) + continue; + elog(ERROR, "failed to open file '%s' on behalf of '%s'", + fname, RelationGetRelationName(relation)); + } + fdescList = lappend_int(fdescList, fdesc); + + rb_cached = arrowLookupOrBuildMetadataCache(fdesc, NULL); + foreach (cell, rb_cached) + { + RecordBatchState *rb_state = lfirst(cell); + + if (!arrowSchemaCompatibilityCheck(tupdesc, rb_state)) + elog(ERROR, "arrow file '%s' on behalf of foreign table '%s' has incompatible schema definition", + fname, RelationGetRelationName(relation)); + if (rb_state->rb_nitems == 0) + continue; /* not reasonable to sample, skipped */ + total_nrows += rb_state->rb_nitems; + + rb_state_list = lappend(rb_state_list, rb_state); + } + } + nrooms = Min(nrooms, total_nrows); + + /* fetch samples for each record-batch */ + foreach (lc, rb_state_list) + { + RecordBatchState *rb_state = lfirst(lc); + int nsamples; + + count_nrows += rb_state->rb_nitems; + nsamples = (double)nrooms * ((double)count_nrows / + (double)total_nrows) - nitems; + if (nitems + nsamples > nrooms) + nsamples = nrooms - nitems; + if (nsamples > nsamples_min) + nitems += RecordBatchAcquireSampleRows(relation, + rb_state, + rows + nitems, + nsamples); + } + foreach (lc, fdescList) + FileClose((File)lfirst_int(lc)); + + *p_totalrows = total_nrows; + *p_totaldeadrows = 0.0; + + return nitems; +} + +/* + * ArrowAnalyzeForeignTable + */ +static bool +ArrowAnalyzeForeignTable(Relation frel, + AcquireSampleRowsFunc *p_sample_rows_func, + BlockNumber *p_totalpages) +{ + ForeignTable *ft = GetForeignTable(RelationGetRelid(frel)); + List *filesList = arrowFdwExtractFilesList(ft->options); + ListCell *lc; + Size totalpages = 0; + + foreach (lc, filesList) + { + const char *fname = strVal(lfirst(lc)); + struct stat statbuf; + + if (stat(fname, &statbuf) != 0) + { + elog(NOTICE, "failed on stat('%s') on behalf of '%s', skipped", + fname, get_rel_name(ft->relid)); + continue; + } + totalpages += 
(statbuf.st_size + BLCKSZ - 1) / BLCKSZ;
+    }
+
+    if (totalpages > MaxBlockNumber)
+        totalpages = MaxBlockNumber;
+
+    *p_sample_rows_func = ArrowAcquireSampleRows;
+    *p_totalpages = totalpages;
+
+    return true;
+}
+
+/*
+ * ArrowImportForeignSchema
+ */
+static List *
+ArrowImportForeignSchema(ImportForeignSchemaStmt *stmt, Oid serverOid)
+{
+    ArrowSchema schema;
+    List       *filesList;
+    ListCell   *lc;
+    int         j;
+    StringInfoData cmd;
+
+    /* sanity checks */
+    switch (stmt->list_type)
+    {
+        case FDW_IMPORT_SCHEMA_ALL:
+            break;
+        case FDW_IMPORT_SCHEMA_LIMIT_TO:
+            elog(ERROR, "arrow_fdw does not support LIMIT TO clause");
+            break;
+        case FDW_IMPORT_SCHEMA_EXCEPT:
+            elog(ERROR, "arrow_fdw does not support EXCEPT clause");
+            break;
+        default:
+            elog(ERROR, "arrow_fdw: Bug? unknown list-type");
+            break;
+    }
+    filesList = arrowFdwExtractFilesList(stmt->options);
+    if (filesList == NIL)
+        ereport(ERROR,
+                (errmsg("No valid Apache Arrow files are specified"),
+                 errhint("Use 'file' or 'dir' option to specify Apache Arrow files on behalf of the foreign table")));
+
+    /* read the schema */
+    memset(&schema, 0, sizeof(ArrowSchema));
+    foreach (lc, filesList)
+    {
+        const char *fname = strVal(lfirst(lc));
+        ArrowFileInfo af_info;
+
+        readArrowFile(fname, &af_info, false);
+        if (lc == list_head(filesList))
+        {
+            copyArrowNode(&schema.node, &af_info.footer.schema.node);
+        }
+        else
+        {
+            /* compatibility checks */
+            ArrowSchema *stemp = &af_info.footer.schema;
+
+            if (schema.endianness != stemp->endianness ||
+                schema._num_fields != stemp->_num_fields)
+                elog(ERROR, "file '%s' has incompatible schema definition", fname);
+            for (j=0; j < schema._num_fields; j++)
+            {
+                if (!arrowFieldTypeIsEqual(&schema.fields[j],
+                                           &stemp->fields[j]))
+                    elog(ERROR, "file '%s' has incompatible schema definition", fname);
+            }
+        }
+    }
+
+    /* makes up the command to define the foreign table */
+    initStringInfo(&cmd);
+    appendStringInfo(&cmd, "CREATE FOREIGN TABLE %s (\n",
+                     quote_identifier(stmt->remote_schema));
+    for (j=0; j < schema._num_fields; j++)
+    {
+        ArrowField *field = &schema.fields[j];
+        const char *type_name = arrowTypeToPGTypeName(field);
+
+        if (j > 0)
+            appendStringInfo(&cmd, ",\n");
+        if (!field->name || field->_name_len == 0)
+        {
+            elog(NOTICE, "field %d has no name, so \"__col%02d\" is used",
+                 j+1, j+1);
+            appendStringInfo(&cmd, " __col%02d %s", j+1, type_name);
+        }
+        else
+            appendStringInfo(&cmd, " %s %s",
+                             quote_identifier(field->name), type_name);
+    }
+    appendStringInfo(&cmd,
+                     "\n"
+                     ") SERVER %s\n"
+                     " OPTIONS (", stmt->server_name);
+    foreach (lc, stmt->options)
+    {
+        DefElem *defel = lfirst(lc);
+
+        if (lc != list_head(stmt->options))
+            appendStringInfo(&cmd, ",\n ");
+        appendStringInfo(&cmd, "%s '%s'",
+                         defel->defname,
+                         strVal(defel->arg));
+    }
+    appendStringInfo(&cmd, ")");
+
+    return list_make1(cmd.data);
+}
+
+/*
+ * pgstrom_arrow_fdw_import_file
+ *
+ * NOTE: For historical reasons, PostgreSQL does not allow more than
+ * MaxHeapAttributeNumber (1600) columns to be defined, on foreign
+ * tables as well as heap tables. The restriction comes from the length
+ * of the NULL-bitmap in HeapTupleHeaderData and the width of t_hoff.
+ * However, it is not a reasonable restriction for foreign tables,
+ * because they do not use the heap format internally.
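+ *
+ * As a workaround, pgstrom_arrow_fdw_import_file() defines the foreign
+ * table with the leading columns through the regular DDL path, then
+ * appends the remaining fields by inserting pg_attribute tuples directly
+ * and bumping relnatts of the pg_class entry; see the code below.
+ * A usage sketch (the file path is only an example):
+ *   SELECT pgstrom_arrow_fdw_import_file('f_mytable', '/tmp/mydata.arrow');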
+ */
+static void
+__insertPgAttributeTuple(Relation pg_attr_rel,
+                         CatalogIndexState pg_attr_index,
+                         Oid ftable_oid,
+                         AttrNumber attnum,
+                         ArrowField *field)
+{
+    Oid         type_oid;
+    int32       type_mod;
+    int16       type_len;
+    bool        type_byval;
+    char        type_align;
+    int32       type_ndims;
+    char        type_storage;
+    Datum       values[Natts_pg_attribute];
+    bool        isnull[Natts_pg_attribute];
+    HeapTuple   tup;
+    ObjectAddress myself, referenced;
+
+    type_oid = arrowTypeToPGTypeOid(field, &type_mod);
+    get_typlenbyvalalign(type_oid,
+                         &type_len,
+                         &type_byval,
+                         &type_align);
+    type_ndims = (type_is_array(type_oid) ? 1 : 0);
+    type_storage = get_typstorage(type_oid);
+
+    memset(values, 0, sizeof(values));
+    memset(isnull, 0, sizeof(isnull));
+
+    values[Anum_pg_attribute_attrelid - 1] = ObjectIdGetDatum(ftable_oid);
+    values[Anum_pg_attribute_attname - 1] = CStringGetDatum(field->name);
+    values[Anum_pg_attribute_atttypid - 1] = ObjectIdGetDatum(type_oid);
+    values[Anum_pg_attribute_attstattarget - 1] = Int32GetDatum(-1);
+    values[Anum_pg_attribute_attlen - 1] = Int16GetDatum(type_len);
+    values[Anum_pg_attribute_attnum - 1] = Int16GetDatum(attnum);
+    values[Anum_pg_attribute_attndims - 1] = Int32GetDatum(type_ndims);
+    values[Anum_pg_attribute_attcacheoff - 1] = Int32GetDatum(-1);
+    values[Anum_pg_attribute_atttypmod - 1] = Int32GetDatum(type_mod);
+    values[Anum_pg_attribute_attbyval - 1] = BoolGetDatum(type_byval);
+    values[Anum_pg_attribute_attstorage - 1] = CharGetDatum(type_storage);
+    values[Anum_pg_attribute_attalign - 1] = CharGetDatum(type_align);
+    values[Anum_pg_attribute_attnotnull - 1] = BoolGetDatum(!field->nullable);
+    values[Anum_pg_attribute_attislocal - 1] = BoolGetDatum(true);
+    isnull[Anum_pg_attribute_attacl - 1] = true;
+    isnull[Anum_pg_attribute_attoptions - 1] = true;
+    isnull[Anum_pg_attribute_attfdwoptions - 1] = true;
+    isnull[Anum_pg_attribute_attmissingval - 1] = true;
+
+    tup = heap_form_tuple(RelationGetDescr(pg_attr_rel), values, isnull);
+    CatalogTupleInsertWithInfo(pg_attr_rel, tup, pg_attr_index);
+
+    /* add dependency */
+    myself.classId = RelationRelationId;
+    myself.objectId = ftable_oid;
+    myself.objectSubId = attnum;
+    referenced.classId = TypeRelationId;
+    referenced.objectId = type_oid;
+    referenced.objectSubId = 0;
+    recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
+
+    heap_freetuple(tup);
+}
+
+Datum
+pgstrom_arrow_fdw_import_file(PG_FUNCTION_ARGS)
+{
+    CreateForeignTableStmt stmt;
+    ArrowSchema schema;
+    List       *tableElts = NIL;
+    char       *ftable_name;
+    char       *file_name;
+    char       *namespace_name;
+    DefElem    *defel;
+    int         j, nfields;
+    Oid         ftable_oid;
+    Oid         type_oid;
+    int         type_mod;
+    ObjectAddress myself;
+    ArrowFileInfo af_info;
+
+    /* read schema of the file */
+    if (PG_ARGISNULL(0))
+        elog(ERROR, "foreign table name is not supplied");
+    ftable_name = text_to_cstring(PG_GETARG_TEXT_PP(0));
+
+    if (PG_ARGISNULL(1))
+        elog(ERROR, "arrow filename is not supplied");
+    file_name = text_to_cstring(PG_GETARG_TEXT_PP(1));
+    defel = makeDefElem("file", (Node *)makeString(file_name), -1);
+
+    if (PG_ARGISNULL(2))
+        namespace_name = NULL;
+    else
+        namespace_name = text_to_cstring(PG_GETARG_TEXT_PP(2));
+
+    readArrowFile(file_name, &af_info, false);
+    copyArrowNode(&schema.node, &af_info.footer.schema.node);
+    if (schema._num_fields > SHRT_MAX)
+        Elog("Arrow file '%s' has too many fields: %d",
+             file_name, schema._num_fields);
+
+    /* setup CreateForeignTableStmt */
+    memset(&stmt, 0, sizeof(CreateForeignTableStmt));
+    NodeSetTag(&stmt, T_CreateForeignTableStmt);
+    stmt.base.relation = makeRangeVar(namespace_name, ftable_name, -1);
+
+    nfields = Min(schema._num_fields, 100);
+    for (j=0; j < nfields; j++)
+    {
+        ColumnDef  *cdef;
+
+        type_oid = arrowTypeToPGTypeOid(&schema.fields[j], &type_mod);
+        cdef = makeColumnDef(schema.fields[j].name,
+                             type_oid,
+                             type_mod,
+                             InvalidOid);
+        tableElts = lappend(tableElts, cdef);
+    }
+    stmt.base.tableElts = tableElts;
+    stmt.base.oncommit = ONCOMMIT_NOOP;
+    stmt.servername = "arrow_fdw";
+    stmt.options = list_make1(defel);
+
+    myself = DefineRelation(&stmt.base,
+                            RELKIND_FOREIGN_TABLE,
+                            InvalidOid,
+                            NULL,
+                            __FUNCTION__);
+    ftable_oid = myself.objectId;
+    CreateForeignTable(&stmt, ftable_oid);
+
+    if (nfields < schema._num_fields)
+    {
+        Relation    c_rel = table_open(RelationRelationId, RowExclusiveLock);
+        Relation    a_rel = table_open(AttributeRelationId, RowExclusiveLock);
+        CatalogIndexState c_index = CatalogOpenIndexes(c_rel);
+        CatalogIndexState a_index = CatalogOpenIndexes(a_rel);
+        HeapTuple   tup;
+
+        tup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(ftable_oid));
+        if (!HeapTupleIsValid(tup))
+            elog(ERROR, "cache lookup failed for relation %u", ftable_oid);
+
+        for (j=nfields; j < schema._num_fields; j++)
+        {
+            __insertPgAttributeTuple(a_rel,
+                                     a_index,
+                                     ftable_oid,
+                                     j+1,
+                                     &schema.fields[j]);
+        }
+        /* update relnatts also */
+        ((Form_pg_class) GETSTRUCT(tup))->relnatts = schema._num_fields;
+        CatalogTupleUpdate(c_rel, &tup->t_self, tup);
+
+        CatalogCloseIndexes(a_index);
+        CatalogCloseIndexes(c_index);
+        table_close(a_rel, RowExclusiveLock);
+        table_close(c_rel, RowExclusiveLock);
+
+        CommandCounterIncrement();
+    }
+    PG_RETURN_VOID();
+}
+PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_import_file);
+
+/*
+ * ArrowIsForeignScanParallelSafe
+ */
+static bool
+ArrowIsForeignScanParallelSafe(PlannerInfo *root,
+                               RelOptInfo *rel,
+                               RangeTblEntry *rte)
+{
+    return true;
+}
+
+/*
+ * ArrowEstimateDSMForeignScan
+ */
+static Size
+ArrowEstimateDSMForeignScan(ForeignScanState *node,
+                            ParallelContext *pcxt)
+{
+    return MAXALIGN(sizeof(pg_atomic_uint32) * 3);
+}
+
+/*
+ * ArrowInitializeDSMForeignScan
+ */
+static inline void
+__ExecInitDSMArrowFdw(ArrowFdwState *af_state,
+                      pg_atomic_uint32 *rbatch_index,
+                      pg_atomic_uint32 *rbatch_nload,
+                      pg_atomic_uint32 *rbatch_nskip)
+{
+    pg_atomic_init_u32(rbatch_index, 0);
+    af_state->rbatch_index = rbatch_index;
+    pg_atomic_init_u32(rbatch_nload, 0);
+    af_state->rbatch_nload = rbatch_nload;
+    pg_atomic_init_u32(rbatch_nskip, 0);
+    af_state->rbatch_nskip = rbatch_nskip;
+}
+
+void
+ExecInitDSMArrowFdw(ArrowFdwState *af_state, GpuTaskSharedState *gtss)
+{
+    __ExecInitDSMArrowFdw(af_state,
+                          &gtss->af_rbatch_index,
+                          &gtss->af_rbatch_nload,
+                          &gtss->af_rbatch_nskip);
+}
+
+static void
+ArrowInitializeDSMForeignScan(ForeignScanState *node,
+                              ParallelContext *pcxt,
+                              void *coordinate)
+{
+    pg_atomic_uint32 *atomic_buffer = coordinate;
+
+    __ExecInitDSMArrowFdw((ArrowFdwState *)node->fdw_state,
+                          atomic_buffer,
+                          atomic_buffer + 1,
+                          atomic_buffer + 2);
+}
+
+/*
+ * ArrowReInitializeDSMForeignScan
+ */
+static void
+__ExecReInitDSMArrowFdw(ArrowFdwState *af_state)
+{
+    pg_atomic_write_u32(af_state->rbatch_index, 0);
+}
+
+void
+ExecReInitDSMArrowFdw(ArrowFdwState *af_state)
+{
+    __ExecReInitDSMArrowFdw(af_state);
+}
+
+
+static void
+ArrowReInitializeDSMForeignScan(ForeignScanState *node,
+                                ParallelContext *pcxt,
+                                void *coordinate)
+{
+    __ExecReInitDSMArrowFdw((ArrowFdwState *)node->fdw_state);
+}
+
+/*
+ * ArrowInitializeWorkerForeignScan
+ */
+static inline void
+__ExecInitWorkerArrowFdw(ArrowFdwState *af_state,
+                         pg_atomic_uint32 *rbatch_index,
+                         pg_atomic_uint32 *rbatch_nload,
+                         pg_atomic_uint32 *rbatch_nskip)
+{
+    af_state->rbatch_index = rbatch_index;
+    af_state->rbatch_nload = rbatch_nload;
+    af_state->rbatch_nskip = rbatch_nskip;
+}
+
+void
+ExecInitWorkerArrowFdw(ArrowFdwState *af_state,
+                       GpuTaskSharedState *gtss)
+{
+    __ExecInitWorkerArrowFdw(af_state,
+                             &gtss->af_rbatch_index,
+                             &gtss->af_rbatch_nload,
+                             &gtss->af_rbatch_nskip);
+}
+
+static void
+ArrowInitializeWorkerForeignScan(ForeignScanState *node,
+                                 shm_toc *toc,
+                                 void *coordinate)
+{
+    pg_atomic_uint32 *atomic_buffer = coordinate;
+
+    __ExecInitWorkerArrowFdw((ArrowFdwState *)node->fdw_state,
+                             atomic_buffer,
+                             atomic_buffer + 1,
+                             atomic_buffer + 2);
+}
+
+/*
+ * ArrowShutdownForeignScan
+ */
+static inline void
+__ExecShutdownArrowFdw(ArrowFdwState *af_state)
+{
+    uint32      temp;
+
+    temp = pg_atomic_read_u32(af_state->rbatch_index);
+    pg_atomic_write_u32(&af_state->__rbatch_index_local, temp);
+    af_state->rbatch_index = &af_state->__rbatch_index_local;
+
+    temp = pg_atomic_read_u32(af_state->rbatch_nload);
+    pg_atomic_write_u32(&af_state->__rbatch_nload_local, temp);
+    af_state->rbatch_nload = &af_state->__rbatch_nload_local;
+
+    temp = pg_atomic_read_u32(af_state->rbatch_nskip);
+    pg_atomic_write_u32(&af_state->__rbatch_nskip_local, temp);
+    af_state->rbatch_nskip = &af_state->__rbatch_nskip_local;
+}
+
+void
+ExecShutdownArrowFdw(ArrowFdwState *af_state)
+{
+    __ExecShutdownArrowFdw(af_state);
+}
+
+static void
+ArrowShutdownForeignScan(ForeignScanState *node)
+{
+    __ExecShutdownArrowFdw((ArrowFdwState *)node->fdw_state);
+}
+
+/*
+ * ArrowPlanForeignModify
+ */
+static List *
+ArrowPlanForeignModify(PlannerInfo *root,
+                       ModifyTable *plan,
+                       Index resultRelation,
+                       int subplan_index)
+{
+    RangeTblEntry *rte = planner_rt_fetch(resultRelation, root);
+    ForeignTable *ft = GetForeignTable(rte->relid);
+    List       *filesList __attribute__((unused));
+    bool        writable;
+
+    if (plan->operation != CMD_INSERT)
+        elog(ERROR, "not a supported operation on arrow_fdw foreign tables");
+
+    filesList = __arrowFdwExtractFilesList(ft->options,
+                                           NULL,
+                                           &writable);
+    if (!writable)
+        elog(ERROR, "arrow_fdw: foreign table \"%s\" is not writable",
+             get_rel_name(rte->relid));
+    Assert(list_length(filesList) == 1);
+
+    return NIL;
+}
+
+/*
+ * ArrowBeginForeignModify
+ */
+static void
+__ArrowBeginForeignModify(ResultRelInfo *rrinfo, int eflags)
+{
+    Relation    frel = rrinfo->ri_RelationDesc;
+    TupleDesc   tupdesc = RelationGetDescr(frel);
+    ForeignTable *ft = GetForeignTable(RelationGetRelid(frel));
+    List       *filesList = arrowFdwExtractFilesList(ft->options);
+    const char *fname;
+    File        filp;
+    struct stat stat_buf;
+    ArrowFileInfo *af_info = NULL;
+    arrowWriteState *aw_state;
+    SQLtable   *table;
+    MetadataCacheKey key;
+    off_t       f_pos;
+
+    Assert(list_length(filesList) == 1);
+    fname = strVal(linitial(filesList));
+
+    LockRelation(frel, ShareRowExclusiveLock);
+    filp = PathNameOpenFile(fname, O_RDWR | PG_BINARY);
+    if (filp >= 0)
+    {
+        af_info = alloca(sizeof(ArrowFileInfo));
+        readArrowFileDesc(FileGetRawDesc(filp), af_info);
+        f_pos = createArrowWriteRedoLog(filp, false);
+    }
+    else if (errno == ENOENT)
+    {
+        filp = PathNameOpenFile(fname, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+        if (filp < 0)
+            ereport(ERROR,
+                    (errcode_for_file_access(),
+                     errmsg("could not open file \"%s\": %m", fname)));
+        PG_TRY();
+        {
+            f_pos = createArrowWriteRedoLog(filp, true);
+        }
+        PG_CATCH();
+        {
+            unlink(fname);
+            PG_RE_THROW();
+        }
+        PG_END_TRY();
+    }
+    else
+    {
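+        /* the open failed for a reason other than ENOENT */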
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not open file \"%s\": %m", fname)));
+    }
+
+    if (fstat(FileGetRawDesc(filp), &stat_buf) != 0)
+        elog(ERROR, "failed on fstat('%s'): %m", FilePathName(filp));
+    initMetadataCacheKey(&key, &stat_buf);
+
+    aw_state = palloc0(offsetof(arrowWriteState,
+                                sql_table.columns[tupdesc->natts]));
+    aw_state->memcxt = CurrentMemoryContext;
+    aw_state->file = filp;
+    memcpy(&aw_state->key, &key, sizeof(MetadataCacheKey));
+    aw_state->hash = key.hash;
+    table = &aw_state->sql_table;
+    table->filename = FilePathName(filp);
+    table->fdesc = FileGetRawDesc(filp);
+    table->f_pos = f_pos;
+    if (af_info)
+        setupArrowSQLbufferBatches(table, af_info);
+    setupArrowSQLbufferSchema(table, tupdesc, af_info);
+
+    rrinfo->ri_FdwState = aw_state;
+}
+
+static void
+ArrowBeginForeignModify(ModifyTableState *mtstate,
+                        ResultRelInfo *rrinfo,
+                        List *fdw_private,
+                        int subplan_index,
+                        int eflags)
+{
+    __ArrowBeginForeignModify(rrinfo, eflags);
+}
+
+/*
+ * ArrowExecForeignInsert
+ */
+static TupleTableSlot *
+ArrowExecForeignInsert(EState *estate,
+                       ResultRelInfo *rrinfo,
+                       TupleTableSlot *slot,
+                       TupleTableSlot *planSlot)
+{
+    Relation    frel = rrinfo->ri_RelationDesc;
+    TupleDesc   tupdesc = RelationGetDescr(frel);
+    arrowWriteState *aw_state = rrinfo->ri_FdwState;
+    SQLtable   *table = &aw_state->sql_table;
+    MemoryContext oldcxt;
+    size_t      usage = 0;
+    int         j;
+
+    slot_getallattrs(slot);
+    oldcxt = MemoryContextSwitchTo(aw_state->memcxt);
+    for (j=0; j < tupdesc->natts; j++)
+    {
+        Form_pg_attribute attr = tupleDescAttr(tupdesc, j);
+        SQLfield   *column = &table->columns[j];
+        Datum       datum = slot->tts_values[j];
+        bool        isnull = slot->tts_isnull[j];
+
+        if (isnull)
+        {
+            usage += sql_field_put_value(column, NULL, 0);
+        }
+        else if (attr->attbyval)
+        {
+            Assert(column->sql_type.pgsql.typbyval);
+            usage += sql_field_put_value(column, (char *)&datum, attr->attlen);
+        }
+        else if (attr->attlen == -1)
+        {
+            int         vl_len = VARSIZE_ANY_EXHDR(datum);
+            char       *vl_ptr = VARDATA_ANY(datum);
+
+            Assert(column->sql_type.pgsql.typlen == -1);
+            usage += sql_field_put_value(column, vl_ptr, vl_len);
+        }
+        else
+        {
+            elog(ERROR, "Bug? unsupported type format");
+        }
+    }
+    table->usage = usage;
+    table->nitems++;
+    MemoryContextSwitchTo(oldcxt);
+
+    /*
+     * If the usage exceeds the record-batch size threshold, make a
+     * redo-log on demand, and write out the buffer.
+     */
+    if (usage > table->segment_sz)
+        writeOutArrowRecordBatch(aw_state, false);
+
+    return slot;
+}
+
+/*
+ * ArrowEndForeignModify
+ */
+static void
+ArrowEndForeignModify(EState *estate,
+                      ResultRelInfo *rrinfo)
+{
+    arrowWriteState *aw_state = rrinfo->ri_FdwState;
+
+    writeOutArrowRecordBatch(aw_state, true);
+}
+
+#if PG_VERSION_NUM >= 110000
+/*
+ * MEMO: executor begin/end routines for the case where arrow_fdw is
+ * a partitioned-leaf relation. In this case, ArrowBeginForeignModify
+ * shall not be called.
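+ * Note that both paths go through __ArrowBeginForeignModify() for the
+ * setup and writeOutArrowRecordBatch() for the final flush, so an INSERT
+ * routed via a partitioned parent is expected to behave the same as a
+ * direct INSERT onto the arrow_fdw foreign table.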
+ */ +static void +ArrowBeginForeignInsert(ModifyTableState *mtstate, + ResultRelInfo *rrinfo) +{ + __ArrowBeginForeignModify(rrinfo, 0); +} + +static void +ArrowEndForeignInsert(EState *estate, ResultRelInfo *rrinfo) +{ + arrowWriteState *aw_state = rrinfo->ri_FdwState; + + writeOutArrowRecordBatch(aw_state, true); +} +#endif + +/* + * ArrowExplainForeignModify + */ +static void +ArrowExplainForeignModify(ModifyTableState *mtstate, + ResultRelInfo *rinfo, + List *fdw_private, + int subplan_index, + struct ExplainState *es) +{ + /* print something */ +} + +/* + * handler of Arrow_Fdw + */ +Datum +pgstrom_arrow_fdw_handler(PG_FUNCTION_ARGS) +{ + PG_RETURN_POINTER(&pgstrom_arrow_fdw_routine); +} +PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_handler); + +/* + * arrowFieldGetPGTypeHint + */ +static Oid +arrowFieldGetPGTypeHint(ArrowField *field) +{ + Oid hint_oid = InvalidOid; + int i, j; + + for (i=0; i < field->_num_custom_metadata; i++) + { + ArrowKeyValue *kv = &field->custom_metadata[i]; + char *namebuf, *pos; + Oid namespace_oid; + HeapTuple tup; + + if (kv->_key_len != 7 || strncmp(kv->key, "pg_type", 7) != 0) + continue; + namebuf = alloca(kv->_value_len + 10); + /* namespace name */ + pos = namebuf; + for (j=0; j < kv->_value_len; j++) + { + int c = kv->value[j]; + + if (c == '.') + break; + else if (c == '\\' && ++j < kv->_value_len) + c = kv->value[j]; + *pos++ = c; + } + *pos++ = '\0'; + + namespace_oid = get_namespace_oid(namebuf, true); + if (!OidIsValid(namespace_oid)) + continue; + + /* type name */ + pos = namebuf; + for (j++; j < kv->_value_len; j++) + { + int c = kv->value[j]; + + if (c == '\\' && ++j < kv->_value_len) + c = kv->value[j]; + *pos++ = c; + } + *pos++ = '\0'; + + tup = SearchSysCache2(TYPENAMENSP, + PointerGetDatum(namebuf), + ObjectIdGetDatum(namespace_oid)); + if (!HeapTupleIsValid(tup)) + continue; + hint_oid = PgTypeTupleGetOid(tup); + + ReleaseSysCache(tup); + + return hint_oid; + } + return InvalidOid; +} + +static bool +__arrowStructTypeIsCompatible(ArrowField *field, Oid comp_oid) +{ + TupleDesc tupdesc; + int j; + bool compatible = false; + + if (pg_type_aclcheck(comp_oid, + GetUserId(), + ACL_USAGE) != ACLCHECK_OK) + return false; + + tupdesc = lookup_rowtype_tupdesc_noerror(comp_oid, -1, true); + if (tupdesc && tupdesc->natts == field->_num_children) + { + for (j=0; j < tupdesc->natts; j++) + { + Form_pg_attribute attr = tupleDescAttr(tupdesc, j); + ArrowField *child = &field->children[j]; + Oid typoid; + int typmod; + + typoid = arrowTypeToPGTypeOid(child, &typmod); + if (typoid != attr->atttypid) + break; + } + if (j >= tupdesc->natts) + compatible = true; + } + if (tupdesc) + ReleaseTupleDesc(tupdesc); + + return compatible; +} + +static Oid +arrowTypeToPGTypeOid(ArrowField *field, int *p_type_mod) +{ + ArrowType *t = &field->type; + Oid hint_oid; + int i; + + hint_oid = arrowFieldGetPGTypeHint(field); + + /* extra module may provide own mapping */ + for (i=0; i < pgstrom_num_users_extra; i++) + { + pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; + Oid type_oid; + + if (extra->arrow_lookup_pgtype) + { + type_oid = extra->arrow_lookup_pgtype(field, hint_oid, p_type_mod); + if (OidIsValid(type_oid)) + return type_oid; + } + } + + *p_type_mod = -1; + switch (t->node.tag) + { + case ArrowNodeTag__Int: + switch (t->Int.bitWidth) + { + case 8: + return INT1OID; + case 16: + return INT2OID; + case 32: + return INT4OID; + case 64: + return INT8OID; + default: + elog(ERROR, "%s is not supported", + arrowNodeName(&t->node)); + break; + } + 
break; + case ArrowNodeTag__FloatingPoint: + switch (t->FloatingPoint.precision) + { + case ArrowPrecision__Half: + return FLOAT2OID; + case ArrowPrecision__Single: + return FLOAT4OID; + case ArrowPrecision__Double: + return FLOAT8OID; + default: + elog(ERROR, "%s is not supported", + arrowNodeName(&t->node)); + } + break; + case ArrowNodeTag__Utf8: + return TEXTOID; + case ArrowNodeTag__Binary: + return BYTEAOID; + case ArrowNodeTag__Bool: + return BOOLOID; + case ArrowNodeTag__Decimal: + if (t->Decimal.bitWidth == 128) + return NUMERICOID; + break; + case ArrowNodeTag__Date: + return DATEOID; + case ArrowNodeTag__Time: + return TIMEOID; + case ArrowNodeTag__Timestamp: + if (t->Timestamp.timezone) + return TIMESTAMPTZOID; + return TIMESTAMPOID; + case ArrowNodeTag__Interval: + return INTERVALOID; + case ArrowNodeTag__List: + if (field->_num_children != 1) + elog(ERROR, "arrow_fdw: corrupted List type definition"); + else + { + ArrowField *child = &field->children[0]; + Oid type_oid; + Oid elem_oid; + int elem_mod; + + elem_oid = arrowTypeToPGTypeOid(child, &elem_mod); + type_oid = get_array_type(elem_oid); + if (!OidIsValid(type_oid)) + elog(ERROR, "array of %s type is not defined", + arrowNodeName(&t->node)); + return type_oid; + } + break; + + case ArrowNodeTag__Struct: + if (!OidIsValid(hint_oid) || + !__arrowStructTypeIsCompatible(field, hint_oid)) + { + Relation rel; + ScanKeyData skey[2]; + SysScanDesc sscan; + HeapTuple tup; + + /* + * lookup composite type definition from pg_class + * At least, nattrs == _num_children + */ + rel = table_open(RelationRelationId, AccessShareLock); + ScanKeyInit(&skey[0], + Anum_pg_class_relkind, + BTEqualStrategyNumber, F_CHAREQ, + CharGetDatum(RELKIND_COMPOSITE_TYPE)); + ScanKeyInit(&skey[1], + Anum_pg_class_relnatts, + BTEqualStrategyNumber, F_INT2EQ, + Int16GetDatum(field->_num_children)); + + sscan = systable_beginscan(rel, InvalidOid, false, + NULL, 2, skey); + hint_oid = InvalidOid; + while (!OidIsValid(hint_oid) && + HeapTupleIsValid(tup = systable_getnext(sscan))) + { + Oid reltype = ((Form_pg_class) GETSTRUCT(tup))->reltype; + + if (__arrowStructTypeIsCompatible(field, reltype)) + hint_oid = reltype; + } + systable_endscan(sscan); + table_close(rel, AccessShareLock); + + if (!OidIsValid(hint_oid)) + elog(ERROR, "arrow::%s is not supported", + arrowNodeName(&t->node)); + } + return hint_oid; + + case ArrowNodeTag__FixedSizeBinary: + if (t->FixedSizeBinary.byteWidth < 1 || + t->FixedSizeBinary.byteWidth > BLCKSZ) + elog(ERROR, "arrow_fdw: %s with byteWidth=%d is not supported", + t->node.tagName, + t->FixedSizeBinary.byteWidth); + if (hint_oid == MACADDROID && + t->FixedSizeBinary.byteWidth == sizeof(macaddr)) + { + return MACADDROID; + } + else if (hint_oid == INETOID && + (t->FixedSizeBinary.byteWidth == 4 || + t->FixedSizeBinary.byteWidth == 16)) + { + return INETOID; + } + *p_type_mod = t->FixedSizeBinary.byteWidth; + return BPCHAROID; + default: + elog(ERROR, "arrow_fdw: type '%s' is not supported", + field->type.node.tagName); + } + return InvalidOid; +} + +static const char * +arrowTypeToPGTypeName(ArrowField *field) +{ + Oid typoid; + int typmod; + HeapTuple tup; + Form_pg_type type; + char *schema; + char *result; + + typoid = arrowTypeToPGTypeOid(field, &typmod); + if (!OidIsValid(typoid)) + return NULL; + tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for type %u", typoid); + type = (Form_pg_type) GETSTRUCT(tup); + schema = 
get_namespace_name(type->typnamespace); + if (typmod < 0) + result = psprintf("%s.%s", + quote_identifier(schema), + quote_identifier(NameStr(type->typname))); + else + result = psprintf("%s.%s(%d)", + quote_identifier(schema), + quote_identifier(NameStr(type->typname)), + typmod); + ReleaseSysCache(tup); + + return result; +} + +#if 0 +//no longer needed? + +/* + * arrowTypeIsConvertible + */ +static bool +arrowTypeIsConvertible(Oid type_oid, int typemod) +{ + HeapTuple tup; + Form_pg_type typeForm; + bool retval = false; + + switch (type_oid) + { + case INT1OID: /* Int8 */ + case INT2OID: /* Int16 */ + case INT4OID: /* Int32 */ + case INT8OID: /* Int64 */ + case FLOAT2OID: /* FP16 */ + case FLOAT4OID: /* FP32 */ + case FLOAT8OID: /* FP64 */ + case TEXTOID: /* Utf8 */ + case BYTEAOID: /* Binary */ + case BOOLOID: /* Bool */ + case NUMERICOID: /* Decimal */ + case DATEOID: /* Date */ + case TIMEOID: /* Time */ + case TIMESTAMPOID: /* Timestamp */ + case TIMESTAMPTZOID:/* TimestampTz */ + case INTERVALOID: /* Interval */ + case BPCHAROID: /* FixedSizeBinary */ + return true; + default: + tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type_oid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for type %u", type_oid); + typeForm = (Form_pg_type) GETSTRUCT(tup); + + if (OidIsValid(typeForm->typelem) && typeForm->typlen == -1) + { + retval = arrowTypeIsConvertible(typeForm->typelem, typemod); + } + else if (typeForm->typtype == TYPTYPE_COMPOSITE) + { + Relation rel; + TupleDesc tupdesc; + int j; + + rel = relation_open(typeForm->typrelid, AccessShareLock); + tupdesc = RelationGetDescr(rel); + for (j=0; j < tupdesc->natts; j++) + { + Form_pg_attribute attr = tupleDescAttr(tupdesc, j); + + if (!arrowTypeIsConvertible(attr->atttypid, + attr->atttypmod)) + break; + } + if (j >= tupdesc->natts) + retval = true; + relation_close(rel, AccessShareLock); + } + ReleaseSysCache(tup); + } + return retval; +} +#endif + +/* + * arrowFieldLength + */ +static size_t +arrowFieldLength(ArrowField *field, int64 nitems) +{ + ArrowType *type = &field->type; + size_t length = 0; + + switch (type->node.tag) + { + case ArrowNodeTag__Int: + switch (type->Int.bitWidth) + { + case 8: + length = nitems; + break; + case 16: + length = 2 * nitems; + break; + case 32: + length = 4 * nitems; + break; + case 64: + length = 8 * nitems; + break; + default: + elog(ERROR, "Not a supported Int width: %d", + type->Int.bitWidth); + } + break; + case ArrowNodeTag__FloatingPoint: + switch (type->FloatingPoint.precision) + { + case ArrowPrecision__Half: + length = sizeof(cl_short) * nitems; + break; + case ArrowPrecision__Single: + length = sizeof(cl_float) * nitems; + break; + case ArrowPrecision__Double: + length = sizeof(cl_double) * nitems; + break; + default: + elog(ERROR, "Not a supported FloatingPoint precision"); + } + break; + case ArrowNodeTag__Utf8: + case ArrowNodeTag__Binary: + case ArrowNodeTag__List: + length = sizeof(cl_uint) * (nitems + 1); + break; + case ArrowNodeTag__Bool: + length = BITMAPLEN(nitems); + break; + case ArrowNodeTag__Decimal: + length = sizeof(int128) * nitems; + break; + case ArrowNodeTag__Date: + switch (type->Date.unit) + { + case ArrowDateUnit__Day: + length = sizeof(cl_int) * nitems; + break; + case ArrowDateUnit__MilliSecond: + length = sizeof(cl_long) * nitems; + break; + default: + elog(ERROR, "Not a supported Date unit"); + } + break; + case ArrowNodeTag__Time: + switch (type->Time.unit) + { + case ArrowTimeUnit__Second: + case ArrowTimeUnit__MilliSecond: + length = 
sizeof(cl_int) * nitems; + break; + case ArrowTimeUnit__MicroSecond: + case ArrowTimeUnit__NanoSecond: + length = sizeof(cl_long) * nitems; + break; + default: + elog(ERROR, "Not a supported Time unit"); + } + break; + case ArrowNodeTag__Timestamp: + length = sizeof(cl_long) * nitems; + break; + case ArrowNodeTag__Interval: + switch (type->Interval.unit) + { + case ArrowIntervalUnit__Year_Month: + length = sizeof(cl_uint) * nitems; + break; + case ArrowIntervalUnit__Day_Time: + length = sizeof(cl_long) * nitems; + break; + default: + elog(ERROR, "Not a supported Interval unit"); + } + break; + case ArrowNodeTag__Struct: //to be supported later + length = 0; /* only nullmap */ + break; + case ArrowNodeTag__FixedSizeBinary: + length = (size_t)type->FixedSizeBinary.byteWidth * nitems; + break; + default: + elog(ERROR, "Arrow Type '%s' is not supported now", + type->node.tagName); + break; + } + return length; +} + +/* + * arrowSchemaCompatibilityCheck + */ +static bool +__arrowSchemaCompatibilityCheck(TupleDesc tupdesc, + RecordBatchFieldState *rb_fstate) +{ + int j; + + for (j=0; j < tupdesc->natts; j++) + { + RecordBatchFieldState *fstate = &rb_fstate[j]; + Form_pg_attribute attr = tupleDescAttr(tupdesc, j); + + if (!fstate->children) + { + /* shortcut, it should be a scalar built-in type */ + Assert(fstate->num_children == 0); + if (attr->atttypid != fstate->atttypid) + return false; + } + else + { + Form_pg_type typ; + HeapTuple tup; + bool type_is_ok = true; + + tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(attr->atttypid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for type %u", attr->atttypid); + typ = (Form_pg_type) GETSTRUCT(tup); + if (OidIsValid(typ->typelem) && typ->typlen == -1 && + fstate->num_children == 1) + { + /* Arrow::List */ + RecordBatchFieldState *cstate = &fstate->children[0]; + + if (typ->typelem == cstate->atttypid) + { + /* + * overwrite typoid / typmod because a same arrow file + * can be reused, and it may be on behalf of different + * user defined data type. + */ + fstate->atttypid = attr->atttypid; + fstate->atttypmod = attr->atttypmod; + } + else + { + type_is_ok = false; + } + } + else if (typ->typlen == -1 && OidIsValid(typ->typrelid)) + { + /* Arrow::Struct */ + TupleDesc sdesc = lookup_rowtype_tupdesc(attr->atttypid, + attr->atttypmod); + if (sdesc->natts == fstate->num_children && + __arrowSchemaCompatibilityCheck(sdesc, fstate->children)) + { + /* see comment above */ + fstate->atttypid = attr->atttypid; + fstate->atttypmod = attr->atttypmod; + } + else + { + type_is_ok = false; + } + DecrTupleDescRefCount(sdesc); + + } + else + { + /* unknown */ + type_is_ok = false; + } + ReleaseSysCache(tup); + if (!type_is_ok) + return false; + } + } + return true; +} + +static bool +arrowSchemaCompatibilityCheck(TupleDesc tupdesc, RecordBatchState *rb_state) +{ + if (tupdesc->natts != rb_state->ncols) + return false; + return __arrowSchemaCompatibilityCheck(tupdesc, rb_state->columns); +} + +/* + * pg_XXX_arrow_ref + */ +static Datum +pg_varlena_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + cl_uint *offset = (cl_uint *) + ((char *)kds + __kds_unpack(cmeta->values_offset)); + char *extra = (char *)kds + __kds_unpack(cmeta->extra_offset); + cl_uint len; + struct varlena *res; + + if (sizeof(uint32) * (index+2) > __kds_unpack(cmeta->values_length)) + elog(ERROR, "corruption? 
varlena index out of range"); + len = offset[index+1] - offset[index]; + if (offset[index] > offset[index+1] || + offset[index+1] > __kds_unpack(cmeta->extra_length)) + elog(ERROR, "corruption? varlena points out of extra buffer"); + + res = palloc(VARHDRSZ + len); + SET_VARSIZE(res, VARHDRSZ + len); + memcpy(VARDATA(res), extra + offset[index], len); + + return PointerGetDatum(res); +} + +static Datum +pg_bpchar_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + cl_char *values = ((char *)kds + __kds_unpack(cmeta->values_offset)); + size_t length = __kds_unpack(cmeta->values_length); + cl_int unitsz = cmeta->atttypmod - VARHDRSZ; + struct varlena *res; + + if (unitsz <= 0) + elog(ERROR, "CHAR(%d) is not expected", unitsz); + if (unitsz * index >= length) + elog(ERROR, "corruption? bpchar points out of range"); + res = palloc(VARHDRSZ + unitsz); + memcpy((char *)res + VARHDRSZ, values + unitsz * index, unitsz); + SET_VARSIZE(res, VARHDRSZ + unitsz); + + return PointerGetDatum(res); +} + +static Datum +pg_bool_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + uint8 *bitmap = (uint8 *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + uint8 mask = (1 << (index & 7)); + + index >>= 3; + if (sizeof(uint8) * index >= length) + elog(ERROR, "corruption? bool points out of range"); + return BoolGetDatum((bitmap[index] & mask) != 0 ? true : false); +} + +static Datum +pg_int1_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + int8 *values = (int8 *)((char *)kds + __kds_unpack(cmeta->values_offset)); + size_t length = __kds_unpack(cmeta->values_length); + + if (sizeof(int8) * index >= length) + elog(ERROR, "corruption? int8 points out of range"); + return values[index]; +} + +static Datum +pg_int2_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + int16 *values = (int16 *)((char *)kds + __kds_unpack(cmeta->values_offset)); + size_t length = __kds_unpack(cmeta->values_length); + + if (sizeof(int16) * index >= length) + elog(ERROR, "corruption? int16 points out of range"); + return values[index]; +} + +static Datum +pg_int4_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + int32 *values = (int32 *)((char *)kds + __kds_unpack(cmeta->values_offset)); + size_t length = __kds_unpack(cmeta->values_length); + + if (sizeof(int32) * index >= length) + elog(ERROR, "corruption? int32 points out of range"); + return values[index]; +} + +static Datum +pg_int8_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + int64 *values = (int64 *)((char *)kds + __kds_unpack(cmeta->values_offset)); + size_t length = __kds_unpack(cmeta->values_length); + + if (sizeof(int64) * index >= length) + elog(ERROR, "corruption? int64 points out of range"); + return values[index]; +} + +static Datum +pg_numeric_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + char *result = palloc0(sizeof(struct NumericData)); + char *base = (char *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + int dscale = cmeta->attopts.decimal.scale; + Int128_t decimal; + + if (sizeof(int128) * index >= length) + elog(ERROR, "corruption? 
numeric points out of range"); + decimal.ival = ((int128 *)base)[index]; + + while (dscale > 0 && decimal.ival % 10 == 0) + { + decimal.ival /= 10; + dscale--; + } + pg_numeric_to_varlena(result, dscale, decimal); + + return PointerGetDatum(result); +} + +static Datum +pg_date_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + char *base = (char *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + DateADT dt; + + switch (cmeta->attopts.date.unit) + { + case ArrowDateUnit__Day: + if (sizeof(uint32) * index >= length) + elog(ERROR, "corruption? Date[day] points out of range"); + dt = ((uint32 *)base)[index]; + break; + case ArrowDateUnit__MilliSecond: + if (sizeof(uint64) * index >= length) + elog(ERROR, "corruption? Date[ms] points out of range"); + dt = ((uint64 *)base)[index] / 1000; + break; + default: + elog(ERROR, "Bug? unexpected unit of Date type"); + } + /* convert UNIX epoch to PostgreSQL epoch */ + dt -= (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE); + return DateADTGetDatum(dt); +} + +static Datum +pg_time_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + char *base = (char *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + TimeADT tm; + + switch (cmeta->attopts.time.unit) + { + case ArrowTimeUnit__Second: + if (sizeof(uint32) * index >= length) + elog(ERROR, "corruption? Time[sec] points out of range"); + tm = ((uint32 *)base)[index] * 1000000L; + break; + case ArrowTimeUnit__MilliSecond: + if (sizeof(uint32) * index >= length) + elog(ERROR, "corruption? Time[ms] points out of range"); + tm = ((uint32 *)base)[index] * 1000L; + break; + case ArrowTimeUnit__MicroSecond: + if (sizeof(uint64) * index >= length) + elog(ERROR, "corruption? Time[us] points out of range"); + tm = ((uint64 *)base)[index]; + break; + case ArrowTimeUnit__NanoSecond: + if (sizeof(uint64) * index >= length) + elog(ERROR, "corruption? Time[ns] points out of range"); + tm = ((uint64 *)base)[index] / 1000L; + break; + default: + elog(ERROR, "Bug? unexpected unit of Time type"); + break; + } + return TimeADTGetDatum(tm); +} + +static Datum +pg_timestamp_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + char *base = (char *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + Timestamp ts; + + switch (cmeta->attopts.timestamp.unit) + { + case ArrowTimeUnit__Second: + if (sizeof(uint64) * index >= length) + elog(ERROR, "corruption? Timestamp[sec] points out of range"); + ts = ((uint64 *)base)[index] * 1000000UL; + break; + case ArrowTimeUnit__MilliSecond: + if (sizeof(uint64) * index >= length) + elog(ERROR, "corruption? Timestamp[ms] points out of range"); + ts = ((uint64 *)base)[index] * 1000UL; + break; + case ArrowTimeUnit__MicroSecond: + if (sizeof(uint64) * index >= length) + elog(ERROR, "corruption? Timestamp[us] points out of range"); + ts = ((uint64 *)base)[index]; + break; + case ArrowTimeUnit__NanoSecond: + if (sizeof(uint64) * index >= length) + elog(ERROR, "corruption? Timestamp[ns] points out of range"); + ts = ((uint64 *)base)[index] / 1000UL; + break; + default: + elog(ERROR, "Bug? 
unexpected unit of Timestamp type"); + break; + } + /* convert UNIX epoch to PostgreSQL epoch */ + ts -= (POSTGRES_EPOCH_JDATE - + UNIX_EPOCH_JDATE) * USECS_PER_DAY; + return TimestampGetDatum(ts); +} + +static Datum +pg_interval_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + char *base = (char *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + Interval *iv = palloc0(sizeof(Interval)); + + switch (cmeta->attopts.interval.unit) + { + case ArrowIntervalUnit__Year_Month: + /* 32bit: number of months */ + if (sizeof(uint32) * index >= length) + elog(ERROR, "corruption? Interval[Year/Month] points out of range"); + iv->month = ((uint32 *)base)[index]; + break; + case ArrowIntervalUnit__Day_Time: + /* 32bit+32bit: number of days and milliseconds */ + if (2 * sizeof(uint32) * index >= length) + elog(ERROR, "corruption? Interval[Day/Time] points out of range"); + iv->day = ((int32 *)base)[2 * index]; + iv->time = ((int32 *)base)[2 * index + 1] * 1000; + break; + default: + elog(ERROR, "Bug? unexpected unit of Interval type"); + } + return PointerGetDatum(iv); +} + +static Datum +pg_macaddr_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + char *base = (char *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + + if (cmeta->attopts.fixed_size_binary.byteWidth != sizeof(macaddr)) + elog(ERROR, "Bug? wrong FixedSizeBinary::byteWidth(%d) for macaddr", + cmeta->attopts.fixed_size_binary.byteWidth); + if (sizeof(macaddr) * index >= length) + elog(ERROR, "corruption? Binary[macaddr] points out of range"); + + return PointerGetDatum(base + sizeof(macaddr) * index); +} + +static Datum +pg_inet_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) +{ + char *base = (char *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + inet *ip = palloc(sizeof(inet)); + + if (cmeta->attopts.fixed_size_binary.byteWidth == 4) + { + if (4 * index >= length) + elog(ERROR, "corruption? Binary[inet4] points out of range"); + ip->inet_data.family = PGSQL_AF_INET; + ip->inet_data.bits = 32; + memcpy(ip->inet_data.ipaddr, base + 4 * index, 4); + } + else if (cmeta->attopts.fixed_size_binary.byteWidth == 16) + { + if (16 * index >= length) + elog(ERROR, "corruption? Binary[inet6] points out of range"); + ip->inet_data.family = PGSQL_AF_INET6; + ip->inet_data.bits = 128; + memcpy(ip->inet_data.ipaddr, base + 16 * index, 16); + } + else + elog(ERROR, "Bug? wrong FixedSizeBinary::byteWidth(%d) for inet", + cmeta->attopts.fixed_size_binary.byteWidth); + + SET_INET_VARSIZE(ip); + return PointerGetDatum(ip); +} + +static Datum +pg_array_arrow_ref(kern_data_store *kds, + kern_colmeta *smeta, + cl_uint start, cl_uint end) +{ + ArrayType *res; + size_t sz; + cl_uint i, nitems = end - start; + bits8 *nullmap = NULL; + size_t usage, __usage; + + /* sanity checks */ + if (start > end) + elog(ERROR, "Bug? array index has reversed order [%u..%u]", start, end); + + /* allocation of the result buffer */ + if (smeta->nullmap_offset != 0) + sz = ARR_OVERHEAD_WITHNULLS(1, nitems); + else + sz = ARR_OVERHEAD_NONULLS(1); + + if (smeta->attlen > 0) + { + sz += TYPEALIGN(smeta->attalign, + smeta->attlen) * nitems; + } + else if (smeta->attlen == -1) + { + sz += 400; /* tentative allocation */ + } + else + elog(ERROR, "Bug? 
corrupted kernel column metadata");
+
+    res = palloc0(sz);
+    res->ndim = 1;
+    if (smeta->nullmap_offset != 0)
+    {
+        res->dataoffset = ARR_OVERHEAD_WITHNULLS(1, nitems);
+        nullmap = ARR_NULLBITMAP(res);
+    }
+    res->elemtype = smeta->atttypid;
+    ARR_DIMS(res)[0] = nitems;
+    ARR_LBOUND(res)[0] = 1;
+    usage = ARR_DATA_OFFSET(res);
+    for (i=0; i < nitems; i++)
+    {
+        Datum       datum;
+        bool        isnull;
+
+        pg_datum_arrow_ref(kds, smeta, start+i, &datum, &isnull);
+        if (isnull)
+        {
+            if (!nullmap)
+                elog(ERROR, "Bug? element item should not be NULL");
+        }
+        else if (smeta->attlen > 0)
+        {
+            if (nullmap)
+                nullmap[i>>3] |= (1<<(i&7));
+            __usage = TYPEALIGN(smeta->attalign, usage);
+            while (__usage + smeta->attlen > sz)
+            {
+                sz += sz;
+                res = repalloc(res, sz);
+            }
+            if (__usage > usage)
+                memset((char *)res + usage, 0, __usage - usage);
+            memcpy((char *)res + __usage, &datum, smeta->attlen);
+            usage = __usage + smeta->attlen;
+        }
+        else if (smeta->attlen == -1)
+        {
+            cl_int      vl_len = VARSIZE(datum);
+
+            if (nullmap)
+                nullmap[i>>3] |= (1<<(i&7));
+            __usage = TYPEALIGN(smeta->attalign, usage);
+            while (__usage + vl_len > sz)
+            {
+                sz += sz;
+                res = repalloc(res, sz);
+            }
+            if (__usage > usage)
+                memset((char *)res + usage, 0, __usage - usage);
+            memcpy((char *)res + __usage, DatumGetPointer(datum), vl_len);
+            usage = __usage + vl_len;
+
+            pfree(DatumGetPointer(datum));
+        }
+        else
+            elog(ERROR, "Bug? corrupted kernel column metadata");
+    }
+    SET_VARSIZE(res, usage);
+
+    return PointerGetDatum(res);
+}
+
+/*
+ * pg_datum_arrow_ref
+ */
+static void
+pg_datum_arrow_ref(kern_data_store *kds,
+                   kern_colmeta *cmeta,
+                   size_t index,
+                   Datum *p_datum,
+                   bool *p_isnull)
+{
+    Datum       datum = 0;
+    bool        isnull = true;
+
+    if (cmeta->nullmap_offset != 0)
+    {
+        size_t      nullmap_offset = __kds_unpack(cmeta->nullmap_offset);
+        uint8      *nullmap = (uint8 *)kds + nullmap_offset;
+
+        if (att_isnull(index, nullmap))
+            goto out;
+    }
+
+    if (cmeta->atttypkind == TYPE_KIND__ARRAY)
+    {
+        /* array type */
+        kern_colmeta *smeta;
+        uint32     *offset;
+
+        if (cmeta->num_subattrs != 1 ||
+            cmeta->idx_subattrs < kds->ncols ||
+            cmeta->idx_subattrs >= kds->nr_colmeta)
+            elog(ERROR, "Bug? corrupted kernel column metadata");
+        if (sizeof(uint32) * (index+2) > __kds_unpack(cmeta->values_length))
+            elog(ERROR, "Bug? array index is out of range");
+        smeta = &kds->colmeta[cmeta->idx_subattrs];
+        offset = (uint32 *)((char *)kds + __kds_unpack(cmeta->values_offset));
+        datum = pg_array_arrow_ref(kds, smeta,
+                                   offset[index],
+                                   offset[index+1]);
+        isnull = false;
+    }
+    else if (cmeta->atttypkind == TYPE_KIND__COMPOSITE)
+    {
+        /* composite type */
+        TupleDesc   tupdesc = lookup_rowtype_tupdesc(cmeta->atttypid, -1);
+        Datum      *sub_values = alloca(sizeof(Datum) * tupdesc->natts);
+        bool       *sub_isnull = alloca(sizeof(bool) * tupdesc->natts);
+        HeapTuple   htup;
+        int         j;
+
+        if (tupdesc->natts != cmeta->num_subattrs)
+            elog(ERROR, "Struct definition is corrupted?");
+        if (cmeta->idx_subattrs < kds->ncols ||
+            cmeta->idx_subattrs + cmeta->num_subattrs > kds->nr_colmeta)
+            elog(ERROR, "Bug?
strange kernel column metadata"); + + for (j=0; j < tupdesc->natts; j++) + { + kern_colmeta *sub_meta = &kds->colmeta[cmeta->idx_subattrs + j]; + + pg_datum_arrow_ref(kds, sub_meta, index, + sub_values + j, + sub_isnull + j); + } + htup = heap_form_tuple(tupdesc, sub_values, sub_isnull); + + ReleaseTupleDesc(tupdesc); + + datum = PointerGetDatum(htup->t_data); + isnull = false; + } + else if (cmeta->atttypkind != TYPE_KIND__NULL) + { + /* anything else, except for unreferenced column */ + int i; + + switch (cmeta->atttypid) + { + case INT1OID: + datum = pg_int1_arrow_ref(kds, cmeta, index); + break; + case INT2OID: + case FLOAT2OID: + datum = pg_int2_arrow_ref(kds, cmeta, index); + break; + case INT4OID: + case FLOAT4OID: + datum = pg_int4_arrow_ref(kds, cmeta, index); + break; + case INT8OID: + case FLOAT8OID: + datum = pg_int8_arrow_ref(kds, cmeta, index); + break; + case TEXTOID: + case BYTEAOID: + datum = pg_varlena_arrow_ref(kds, cmeta, index); + break; + case BPCHAROID: + datum = pg_bpchar_arrow_ref(kds, cmeta, index); + break; + case BOOLOID: + datum = pg_bool_arrow_ref(kds, cmeta, index); + break; + case NUMERICOID: + datum = pg_numeric_arrow_ref(kds, cmeta, index); + break; + case DATEOID: + datum = pg_date_arrow_ref(kds, cmeta, index); + break; + case TIMEOID: + datum = pg_time_arrow_ref(kds, cmeta, index); + break; + case TIMESTAMPOID: + case TIMESTAMPTZOID: + datum = pg_timestamp_arrow_ref(kds, cmeta, index); + break; + case INTERVALOID: + datum = pg_interval_arrow_ref(kds, cmeta, index); + break; + case MACADDROID: + datum = pg_macaddr_arrow_ref(kds, cmeta, index); + break; + case INETOID: + datum = pg_inet_arrow_ref(kds, cmeta, index); + break; + default: + for (i=0; i < pgstrom_num_users_extra; i++) + { + pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; + + if (extra->arrow_datum_ref && + extra->arrow_datum_ref(kds, cmeta, index, &datum, &isnull)) + { + goto out; + } + } + elog(ERROR, "Bug? 
unexpected datum type: %u", cmeta->atttypid); + break; + } + isnull = false; + } +out: + *p_datum = datum; + *p_isnull = isnull; +} + +/* + * KDS_fetch_tuple_arrow + */ +bool +KDS_fetch_tuple_arrow(TupleTableSlot *slot, + kern_data_store *kds, + size_t index) +{ + int j; + + if (index >= kds->nitems) + return false; + ExecStoreAllNullTuple(slot); + for (j=0; j < kds->ncols; j++) + { + kern_colmeta *cmeta = &kds->colmeta[j]; + + pg_datum_arrow_ref(kds, cmeta, + index, + slot->tts_values + j, + slot->tts_isnull + j); + } + return true; +} + +/* + * arrowFdwExtractFilesList + */ +static List * +__arrowFdwExtractFilesList(List *options_list, + int *p_parallel_nworkers, + bool *p_writable) +{ + ListCell *lc; + List *filesList = NIL; + char *dir_path = NULL; + char *dir_suffix = NULL; + int parallel_nworkers = -1; + bool writable = false; /* default: read-only */ + + foreach (lc, options_list) + { + DefElem *defel = lfirst(lc); + + Assert(IsA(defel->arg, String)); + if (strcmp(defel->defname, "file") == 0) + { + char *temp = strVal(defel->arg); + filesList = lappend(filesList, makeString(pstrdup(temp))); + } + else if (strcmp(defel->defname, "files") == 0) + { + char *temp = pstrdup(strVal(defel->arg)); + char *saveptr; + char *tok, *pos; + + while ((tok = strtok_r(temp, ",", &saveptr)) != NULL) + { + while (isspace(*tok)) + tok++; + pos = tok + strlen(tok) - 1; + while (pos >= tok && isspace(*pos)) + *pos-- = '\0'; + + filesList = lappend(filesList, makeString(pstrdup(tok))); + + temp = NULL; + } + } + else if (strcmp(defel->defname, "dir") == 0) + { + dir_path = strVal(defel->arg); + } + else if (strcmp(defel->defname, "suffix") == 0) + { + dir_suffix = strVal(defel->arg); + } + else if (strcmp(defel->defname, "parallel_workers") == 0) + { + if (parallel_nworkers >= 0) + elog(ERROR, "'parallel_workers' appeared twice"); + parallel_nworkers = atoi(strVal(defel->arg)); + } + else if (strcmp(defel->defname, "writable") == 0) + { + writable = defGetBoolean(defel); + } + else + elog(ERROR, "arrow: unknown option (%s)", defel->defname); + } + if (dir_suffix && !dir_path) + elog(ERROR, "arrow: cannot use 'suffix' option without 'dir'"); + + if (writable) + { + if (dir_path) + elog(ERROR, "arrow: 'dir_path' and 'writable' options are exclusive"); + if (list_length(filesList) == 0) + elog(ERROR, "arrow: 'writable' needs a backend file specified by 'file' option"); + if (list_length(filesList) > 1) + elog(ERROR, "arrow: 'writable' cannot use multiple backend files"); + } + + if (dir_path) + { + struct dirent *dentry; + DIR *dir; + char *temp; + + dir = AllocateDir(dir_path); + while ((dentry = ReadDir(dir, dir_path)) != NULL) + { + if (strcmp(dentry->d_name, ".") == 0 || + strcmp(dentry->d_name, "..") == 0) + continue; + if (dir_suffix) + { + int dlen = strlen(dentry->d_name); + int slen = strlen(dir_suffix); + int diff; + + if (dlen < 2 + slen) + continue; + diff = dlen - slen; + if (dentry->d_name[diff-1] != '.' 
|| + strcmp(dentry->d_name + diff, dir_suffix) != 0) + continue; + } + temp = psprintf("%s/%s", dir_path, dentry->d_name); + filesList = lappend(filesList, makeString(temp)); + } + FreeDir(dir); + } + + if (filesList == NIL) + elog(ERROR, "no files are configured on behalf of the arrow_fdw foreign table"); + foreach (lc, filesList) + { + const char *fname = strVal(lfirst(lc)); + + if (!writable) + { + if (access(fname, R_OK) != 0) + elog(ERROR, "unable to read '%s': %m", fname); + } + else + { + if (access(fname, R_OK | W_OK) != 0) + { + if (errno != ENOENT) + elog(ERROR, "unable to read/write '%s': %m", fname); + else + { + char *temp = pstrdup(fname); + char *dname = dirname(temp); + + if (access(dname, R_OK | W_OK | X_OK) != 0) + elog(ERROR, "unable to create '%s': %m", fname); + pfree(temp); + } + } + } + } + /* other properties */ + if (p_parallel_nworkers) + *p_parallel_nworkers = parallel_nworkers; + if (p_writable) + *p_writable = writable; + + return filesList; +} + +static List * +arrowFdwExtractFilesList(List *options_list) +{ + return __arrowFdwExtractFilesList(options_list, NULL, NULL); +} + + +/* + * validator of Arrow_Fdw + */ +Datum +pgstrom_arrow_fdw_validator(PG_FUNCTION_ARGS) +{ + List *options_list = untransformRelOptions(PG_GETARG_DATUM(0)); + Oid catalog = PG_GETARG_OID(1); + + if (catalog == ForeignTableRelationId) + { + List *filesList; + ListCell *lc; + + filesList = arrowFdwExtractFilesList(options_list); + foreach (lc, filesList) + { + ArrowFileInfo af_info; + const char *fname = strVal(lfirst(lc)); + + readArrowFile(fname, &af_info, true); + } + } + else if (options_list != NIL) + { + const char *label; + char temp[80]; + + switch (catalog) + { + case ForeignDataWrapperRelationId: + label = "FOREIGN DATA WRAPPER"; + break; + case ForeignServerRelationId: + label = "SERVER"; + break; + case UserMappingRelationId: + label = "USER MAPPING"; + break; + case AttributeRelationId: + label = "attribute of FOREIGN TABLE"; + break; + default: + snprintf(temp, sizeof(temp), + "[unexpected object catalog=%u]", catalog); + label = temp; + break; + } + elog(ERROR, "Arrow_Fdw does not support any options for %s", label); + } + PG_RETURN_VOID(); +} +PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_validator); + +/* + * pgstrom_arrow_fdw_precheck_schema + */ +static void +arrow_fdw_precheck_schema(Relation rel) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + ForeignTable *ft = GetForeignTable(RelationGetRelid(rel)); + List *filesList; + ListCell *lc; + bool writable; +#if 0 + int j; + + /* check schema definition is supported by Apache Arrow */ + for (j=0; j < tupdesc->natts; j++) + { + Form_pg_attribute attr = tupleDescAttr(tupdesc, j); + + if (!arrowTypeIsConvertible(attr->atttypid, + attr->atttypmod)) + elog(ERROR, "column %s of foreign table %s has %s type that is not convertible any supported Apache Arrow types", + NameStr(attr->attname), + RelationGetRelationName(rel), + format_type_be(attr->atttypid)); + } +#endif + filesList = __arrowFdwExtractFilesList(ft->options, + NULL, + &writable); + foreach (lc, filesList) + { + const char *fname = strVal(lfirst(lc)); + File filp; + List *rb_cached = NIL; + ListCell *cell; + + filp = PathNameOpenFile(fname, O_RDONLY | PG_BINARY); + if (filp < 0) + { + if (writable && errno == ENOENT) + continue; + elog(ERROR, "failed to open '%s' on behalf of '%s': %m", + fname, RelationGetRelationName(rel)); + } + /* check schema compatibility */ + rb_cached = arrowLookupOrBuildMetadataCache(filp, NULL); + foreach (cell, rb_cached) + { + RecordBatchState 
*rb_state = lfirst(cell); + + if (!arrowSchemaCompatibilityCheck(tupdesc, rb_state)) + elog(ERROR, "arrow file '%s' on behalf of the foreign table '%s' has incompatible schema definition", + fname, RelationGetRelationName(rel)); + } + list_free(rb_cached); + } +} + +Datum +pgstrom_arrow_fdw_precheck_schema(PG_FUNCTION_ARGS) +{ + EventTriggerData *trigdata; + + if (!CALLED_AS_EVENT_TRIGGER(fcinfo)) + elog(ERROR, "%s: must be called as EventTrigger", + __FUNCTION__); + trigdata = (EventTriggerData *) fcinfo->context; + if (strcmp(trigdata->event, "ddl_command_end") != 0) + elog(ERROR, "%s: must be called on ddl_command_end event", + __FUNCTION__); + if (strcmp(GetCommandTagName(trigdata->tag), + "CREATE FOREIGN TABLE") == 0) + { + CreateStmt *stmt = (CreateStmt *)trigdata->parsetree; + Relation rel; + + rel = relation_openrv_extended(stmt->relation, AccessShareLock, true); + if (!rel) + PG_RETURN_NULL(); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE && + GetFdwRoutineForRelation(rel, false) == &pgstrom_arrow_fdw_routine) + { + arrow_fdw_precheck_schema(rel); + } + relation_close(rel, AccessShareLock); + } + else if (strcmp(GetCommandTagName(trigdata->tag), + "ALTER FOREIGN TABLE") == 0 && + IsA(trigdata->parsetree, AlterTableStmt)) + { + AlterTableStmt *stmt = (AlterTableStmt *)trigdata->parsetree; + Relation rel; + ListCell *lc; + bool has_schema_change = false; + + rel = relation_openrv_extended(stmt->relation, AccessShareLock, true); + if (!rel) + PG_RETURN_NULL(); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE && + GetFdwRoutineForRelation(rel, false) == &pgstrom_arrow_fdw_routine) + { + foreach (lc, stmt->cmds) + { + AlterTableCmd *cmd = lfirst(lc); + + if (cmd->subtype == AT_AddColumn || + cmd->subtype == AT_DropColumn || + cmd->subtype == AT_AlterColumnType) + { + has_schema_change = true; + break; + } + } + if (has_schema_change) + arrow_fdw_precheck_schema(rel); + } + relation_close(rel, AccessShareLock); + } + PG_RETURN_NULL(); +} +PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_precheck_schema); + +/* + * arrowInvalidateMetadataCache + * + * NOTE: caller must have lock_slots[] with EXCLUSIVE mode + */ +static uint64 +__arrowInvalidateMetadataCache(arrowMetadataCache *mcache, bool detach_lru) +{ + arrowMetadataCache *mtemp; + dlist_node *dnode; + uint64 released = 0; + + while (!dlist_is_empty(&mcache->siblings)) + { + dnode = dlist_pop_head_node(&mcache->siblings); + mtemp = dlist_container(arrowMetadataCache, chain, dnode); + Assert(dlist_is_empty(&mtemp->siblings) && + !mtemp->lru_chain.prev && !mtemp->lru_chain.next); + dlist_delete(&mtemp->chain); + released += MAXALIGN(offsetof(arrowMetadataCache, + fstate[mtemp->nfields])); + pfree(mtemp); + } + released += MAXALIGN(offsetof(arrowMetadataCache, + fstate[mcache->nfields])); + if (detach_lru) + { + SpinLockAcquire(&arrow_metadata_state->lru_lock); + dlist_delete(&mcache->lru_chain); + SpinLockRelease(&arrow_metadata_state->lru_lock); + } + dlist_delete(&mcache->chain); + pfree(mcache); + + return pg_atomic_sub_fetch_u64(&arrow_metadata_state->consumed, released); +} + +static void +arrowInvalidateMetadataCache(MetadataCacheKey *mkey, bool detach_lru) +{ + dlist_mutable_iter miter; + int index = mkey->hash % ARROW_METADATA_HASH_NSLOTS; + + dlist_foreach_modify(miter, &arrow_metadata_state->hash_slots[index]) + { + arrowMetadataCache *mcache + = dlist_container(arrowMetadataCache, chain, miter.cur); + + if (mcache->stat_buf.st_dev == mkey->st_dev && + mcache->stat_buf.st_ino == mkey->st_ino) + { + elog(DEBUG2, "arrow_fdw: 
metadata cache invalidation for the file (st_dev=%lu/st_ino=%lu)", + mkey->st_dev, mkey->st_ino); + __arrowInvalidateMetadataCache(mcache, true); + } + } +} + +/* + * copyMetadataFieldCache - copy for nested structure + */ +static int +copyMetadataFieldCache(RecordBatchFieldState *dest_curr, + RecordBatchFieldState *dest_tail, + int nattrs, + RecordBatchFieldState *columns, + Bitmapset **p_stat_attrs) +{ + RecordBatchFieldState *dest_next = dest_curr + nattrs; + int j, k, nslots = nattrs; + + if (dest_next > dest_tail) + return -1; + + for (j=0; j < nattrs; j++) + { + RecordBatchFieldState *__dest = dest_curr + j; + RecordBatchFieldState *__orig = columns + j; + + memcpy(__dest, __orig, sizeof(RecordBatchFieldState)); + if (__dest->num_children == 0) + Assert(__dest->children == NULL); + else + { + __dest->children = dest_next; + k = copyMetadataFieldCache(dest_next, + dest_tail, + __orig->num_children, + __orig->children, + NULL); + if (k < 0) + return -1; + dest_next += k; + nslots += k; + } + if (p_stat_attrs && !__orig->stat_isnull) + *p_stat_attrs = bms_add_member(*p_stat_attrs, j+1); + } + return nslots; +} + +/* + * makeRecordBatchStateFromCache + * - setup RecordBatchState from arrowMetadataCache + */ +static RecordBatchState * +makeRecordBatchStateFromCache(arrowMetadataCache *mcache, + File fdesc, + Bitmapset **p_stat_attrs) +{ + RecordBatchState *rbstate; + + rbstate = palloc0(offsetof(RecordBatchState, + columns[mcache->nfields])); + rbstate->fdesc = fdesc; + memcpy(&rbstate->stat_buf, &mcache->stat_buf, sizeof(struct stat)); + rbstate->rb_index = mcache->rb_index; + rbstate->rb_offset = mcache->rb_offset; + rbstate->rb_length = mcache->rb_length; + rbstate->rb_nitems = mcache->rb_nitems; + rbstate->ncols = mcache->ncols; + copyMetadataFieldCache(rbstate->columns, + rbstate->columns + mcache->nfields, + mcache->ncols, + mcache->fstate, + p_stat_attrs); + return rbstate; +} + +/* + * arrowReclaimMetadataCache + */ +static void +arrowReclaimMetadataCache(void) +{ + arrowMetadataCache *mcache; + LWLock *lock = NULL; + dlist_node *dnode; + uint32 lru_hash; + uint32 lru_index; + uint64 consumed; + + consumed = pg_atomic_read_u64(&arrow_metadata_state->consumed); + if (consumed <= arrow_metadata_cache_size) + return; + + SpinLockAcquire(&arrow_metadata_state->lru_lock); + if (dlist_is_empty(&arrow_metadata_state->lru_list)) + { + SpinLockRelease(&arrow_metadata_state->lru_lock); + return; + } + dnode = dlist_tail_node(&arrow_metadata_state->lru_list); + mcache = dlist_container(arrowMetadataCache, lru_chain, dnode); + lru_hash = mcache->hash; + SpinLockRelease(&arrow_metadata_state->lru_lock); + + do { + lru_index = lru_hash % ARROW_METADATA_HASH_NSLOTS; + lock = &arrow_metadata_state->lock_slots[lru_index]; + + LWLockAcquire(lock, LW_EXCLUSIVE); + SpinLockAcquire(&arrow_metadata_state->lru_lock); + if (dlist_is_empty(&arrow_metadata_state->lru_list)) + { + SpinLockRelease(&arrow_metadata_state->lru_lock); + LWLockRelease(lock); + break; + } + dnode = dlist_tail_node(&arrow_metadata_state->lru_list); + mcache = dlist_container(arrowMetadataCache, lru_chain, dnode); + if (mcache->hash == lru_hash) + { + dlist_delete(&mcache->lru_chain); + memset(&mcache->lru_chain, 0, sizeof(dlist_node)); + SpinLockRelease(&arrow_metadata_state->lru_lock); + consumed = __arrowInvalidateMetadataCache(mcache, false); + } + else + { + /* LRU-tail was referenced by someone, try again */ + lru_hash = mcache->hash; + SpinLockRelease(&arrow_metadata_state->lru_lock); + } + LWLockRelease(lock); + } while 
(consumed > arrow_metadata_cache_size); +} + +/* + * __arrowBuildMetadataCache + * + * NOTE: caller must have exclusive lock on arrow_metadata_state->lock_slots[] + */ +static arrowMetadataCache * +__arrowBuildMetadataCache(List *rb_state_list, uint32 hash) +{ + arrowMetadataCache *mcache = NULL; + arrowMetadataCache *mtemp; + dlist_node *dnode; + Size sz, consumed = 0; + int nfields; + ListCell *lc; + + foreach (lc, rb_state_list) + { + RecordBatchState *rbstate = lfirst(lc); + + if (!mcache) + nfields = RecordBatchFieldCount(rbstate); + else + Assert(nfields == RecordBatchFieldCount(rbstate)); + + sz = offsetof(arrowMetadataCache, fstate[nfields]); + mtemp = MemoryContextAllocZero(TopSharedMemoryContext, sz); + if (!mtemp) + { + /* !!out of memory!! */ + if (mcache) + { + while (!dlist_is_empty(&mcache->siblings)) + { + dnode = dlist_pop_head_node(&mcache->siblings); + mtemp = dlist_container(arrowMetadataCache, + chain, dnode); + pfree(mtemp); + } + pfree(mcache); + } + return NULL; + } + + dlist_init(&mtemp->siblings); + memcpy(&mtemp->stat_buf, &rbstate->stat_buf, sizeof(struct stat)); + mtemp->hash = hash; + mtemp->rb_index = rbstate->rb_index; + mtemp->rb_offset = rbstate->rb_offset; + mtemp->rb_length = rbstate->rb_length; + mtemp->rb_nitems = rbstate->rb_nitems; + mtemp->ncols = rbstate->ncols; + mtemp->nfields = + copyMetadataFieldCache(mtemp->fstate, + mtemp->fstate + nfields, + rbstate->ncols, + rbstate->columns, + NULL); + Assert(mtemp->nfields == nfields); + + if (!mcache) + mcache = mtemp; + else + dlist_push_tail(&mcache->siblings, &mtemp->chain); + consumed += MAXALIGN(sz); + } + pg_atomic_add_fetch_u64(&arrow_metadata_state->consumed, consumed); + + return mcache; +} + + +/* + * checkArrowRecordBatchIsVisible + * + * NOTE: It must be called under shared lock on lock_slots[] + */ +static bool +checkArrowRecordBatchIsVisible(RecordBatchState *rbstate, + dlist_head *mvcc_slot) +{ + dlist_iter iter; + + dlist_foreach(iter, mvcc_slot) + { + arrowWriteMVCCLog *mvcc = dlist_container(arrowWriteMVCCLog, + chain, iter.cur); + if (mvcc->key.st_dev == rbstate->stat_buf.st_dev && + mvcc->key.st_ino == rbstate->stat_buf.st_ino && + mvcc->record_batch == rbstate->rb_index) + { + if (TransactionIdIsCurrentTransactionId(mvcc->xid)) + return true; + else + return false; + } + } + return true; +} + +/* + * arrowLookupOrBuildMetadataCache + */ +List * +arrowLookupOrBuildMetadataCache(File fdesc, Bitmapset **p_stat_attrs) +{ + MetadataCacheKey key; + struct stat stat_buf; + uint32 index; + LWLock *lock; + dlist_head *hash_slot; + dlist_head *mvcc_slot; + dlist_iter iter1, iter2; + bool has_exclusive = false; + List *results = NIL; + + if (fstat(FileGetRawDesc(fdesc), &stat_buf) != 0) + elog(ERROR, "failed on fstat('%s'): %m", FilePathName(fdesc)); + + index = initMetadataCacheKey(&key, &stat_buf); + lock = &arrow_metadata_state->lock_slots[index]; + hash_slot = &arrow_metadata_state->hash_slots[index]; + mvcc_slot = &arrow_metadata_state->mvcc_slots[index]; + + LWLockAcquire(lock, LW_SHARED); +retry: + dlist_foreach(iter1, hash_slot) + { + arrowMetadataCache *mcache + = dlist_container(arrowMetadataCache, chain, iter1.cur); + if (mcache->stat_buf.st_dev == stat_buf.st_dev && + mcache->stat_buf.st_ino == stat_buf.st_ino) + { + RecordBatchState *rbstate; + + Assert(mcache->hash == key.hash); + if (timespec_comp(&mcache->stat_buf.st_mtim, + &stat_buf.st_mtim) < 0 || + timespec_comp(&mcache->stat_buf.st_ctim, + &stat_buf.st_ctim) < 0) + { + char buf1[80], buf2[80], buf3[80], buf4[80]; + char 
*tail; + + if (!has_exclusive) + { + LWLockRelease(lock); + LWLockAcquire(lock, LW_EXCLUSIVE); + has_exclusive = true; + goto retry; + } + ctime_r(&mcache->stat_buf.st_mtime, buf1); + ctime_r(&mcache->stat_buf.st_ctime, buf2); + ctime_r(&stat_buf.st_mtime, buf3); + ctime_r(&stat_buf.st_ctime, buf4); + for (tail=buf1+strlen(buf1)-1; isspace(*tail); *tail--='\0'); + for (tail=buf2+strlen(buf2)-1; isspace(*tail); *tail--='\0'); + for (tail=buf3+strlen(buf3)-1; isspace(*tail); *tail--='\0'); + for (tail=buf4+strlen(buf4)-1; isspace(*tail); *tail--='\0'); + elog(DEBUG2, "arrow_fdw: metadata cache for '%s' (m:%s, c:%s) is older than the latest file (m:%s, c:%s), so it is invalidated", + FilePathName(fdesc), buf1, buf2, buf3, buf4); + __arrowInvalidateMetadataCache(mcache, true); + break; + } + /* + * OK, the arrow file's metadata cache was found and is still valid. + * + * NOTE: we currently support min/max statistics on the top- + * level variables only, not sub-fields of composite values. + */ + rbstate = makeRecordBatchStateFromCache(mcache, fdesc, + p_stat_attrs); + if (checkArrowRecordBatchIsVisible(rbstate, mvcc_slot)) + results = list_make1(rbstate); + dlist_foreach (iter2, &mcache->siblings) + { + arrowMetadataCache *__mcache + = dlist_container(arrowMetadataCache, chain, iter2.cur); + rbstate = makeRecordBatchStateFromCache(__mcache, fdesc, + p_stat_attrs); + if (checkArrowRecordBatchIsVisible(rbstate, mvcc_slot)) + results = lappend(results, rbstate); + } + SpinLockAcquire(&arrow_metadata_state->lru_lock); + dlist_move_head(&arrow_metadata_state->lru_list, + &mcache->lru_chain); + SpinLockRelease(&arrow_metadata_state->lru_lock); + LWLockRelease(lock); + + return results; + } + } + + /* + * Hmm... no valid metadata cache was found, so build a new entry + * under the exclusive lock on the arrow file.
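+ * + * Note that the shared lock cannot be upgraded in place: LWLocks have + * no upgrade operation, so the code below releases the shared lock, + * re-acquires it in exclusive mode, and jumps back to 'retry'; the + * hash slot must be re-scanned because another backend may have built + * or invalidated the same entry while the lock was released.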
+ */ + if (!has_exclusive) + { + LWLockRelease(lock); + LWLockAcquire(lock, LW_EXCLUSIVE); + has_exclusive = true; + goto retry; + } + else + { + ArrowFileInfo af_info; + arrowMetadataCache *mcache; + arrowStatsBinary *arrow_bstats; + List *rb_state_any = NIL; + + readArrowFileDesc(FileGetRawDesc(fdesc), &af_info); + if (af_info.dictionaries != NULL) + elog(ERROR, "DictionaryBatch is not supported"); + Assert(af_info.footer._num_dictionaries == 0); + + if (af_info.recordBatches == NULL) + elog(DEBUG2, "arrow file '%s' contains no RecordBatch", + FilePathName(fdesc)); + + arrow_bstats = buildArrowStatsBinary(&af_info.footer, p_stat_attrs); + for (index = 0; index < af_info.footer._num_recordBatches; index++) + { + RecordBatchState *rb_state; + ArrowBlock *block + = &af_info.footer.recordBatches[index]; + ArrowRecordBatch *rbatch + = &af_info.recordBatches[index].body.recordBatch; + + rb_state = makeRecordBatchState(&af_info.footer.schema, + block, rbatch); + rb_state->fdesc = fdesc; + memcpy(&rb_state->stat_buf, &stat_buf, sizeof(struct stat)); + rb_state->rb_index = index; + + if (arrow_bstats) + applyArrowStatsBinary(rb_state, arrow_bstats); + + if (checkArrowRecordBatchIsVisible(rb_state, mvcc_slot)) + results = lappend(results, rb_state); + rb_state_any = lappend(rb_state_any, rb_state); + } + releaseArrowStatsBinary(arrow_bstats); + /* try to build a metadata cache for further references */ + mcache = __arrowBuildMetadataCache(rb_state_any, key.hash); + if (mcache) + { + dlist_push_head(hash_slot, &mcache->chain); + SpinLockAcquire(&arrow_metadata_state->lru_lock); + dlist_push_head(&arrow_metadata_state->lru_list, + &mcache->lru_chain); + SpinLockRelease(&arrow_metadata_state->lru_lock); + } + } + LWLockRelease(lock); + /* + * reclaim unreferenced metadata cache entries based on LRU, if shared- + * memory consumption exceeds the configured threshold. 
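+ * + * The threshold is the arrow_fdw.metadata_cache_size GUC (defined in + * pgstrom_init_arrow_fdw below, unit: kB); arrowReclaimMetadataCache() + * walks the LRU list from its tail and drops unreferenced entries + * until consumption falls below the limit. A minimal configuration + * sketch, assuming the built-in default: + * + *   # postgresql.conf - needs restart (PGC_POSTMASTER) + *   arrow_fdw.metadata_cache_size = 131072   # 128MB, in kB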
+ */ + arrowReclaimMetadataCache(); + + return results; +} + +/* + * lookup_type_extension_info + */ +static void +lookup_type_extension_info(Oid type_oid, + const char **p_extname, + const char **p_extschema) +{ + Oid ext_oid; + char *extname = NULL; + char *extschema = NULL; + + ext_oid = get_object_extension_oid(TypeRelationId, + type_oid, 0, true); + if (OidIsValid(ext_oid)) + { + Relation rel; + SysScanDesc sscan; + ScanKeyData skey; + HeapTuple tup; + + rel = table_open(ExtensionRelationId, AccessShareLock); + ScanKeyInit(&skey, + Anum_pg_extension_oid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(ext_oid)); + sscan = systable_beginscan(rel, ExtensionOidIndexId, + true, NULL, 1, &skey); + tup = systable_getnext(sscan); + if (HeapTupleIsValid(tup)) + { + Form_pg_extension __ext = (Form_pg_extension) GETSTRUCT(tup); + + extname = pstrdup(NameStr(__ext->extname)); + if (__ext->extrelocatable) + extschema = get_namespace_name(__ext->extnamespace); + } + systable_endscan(sscan); + table_close(rel, AccessShareLock); + } + *p_extname = extname; + *p_extschema = extschema; +} + +/* + * setupArrowSQLbufferSchema + */ +static void +__setupArrowSQLbufferField(SQLtable *table, + SQLfield *column, + const char *attname, + Oid atttypid, + int32 atttypmod, + ArrowField *afield) +{ + HeapTuple tup; + Form_pg_type __type; + const char *typname; + const char *typnamespace; + const char *timezone = show_timezone(); + const char *extname; + const char *extschema; + SQLstat *stat_list; + + /* walk down to the base type, if domain */ + for (;;) + { + tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(atttypid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for type: %u", atttypid); + __type = (Form_pg_type) GETSTRUCT(tup); + if (__type->typtype != TYPTYPE_DOMAIN) + break; + atttypid = __type->typbasetype; + atttypmod = __type->typtypmod; + ReleaseSysCache(tup); + } + typname = NameStr(__type->typname); + typnamespace = get_namespace_name(__type->typnamespace); + lookup_type_extension_info(atttypid, + &extname, + &extschema); + table->numFieldNodes++; + table->numBuffers += + assignArrowTypePgSQL(column, + attname, + atttypid, + atttypmod, + typname, + typnamespace, + __type->typlen, + __type->typbyval, + __type->typtype, + __type->typalign, + __type->typrelid, + __type->typelem, + timezone, + extname, + extschema, + afield); + /* assign existing min/max statistics, if any */ + if (afield) + { + stat_list = __buildArrowFieldStatsList(afield, table->numRecordBatches); + if (stat_list) + { + column->stat_list = stat_list; + column->stat_enabled = true; + table->has_statistics = true; + } + } + + if (OidIsValid(__type->typelem) && __type->typlen == -1) + { + /* array type */ + char elem_name[NAMEDATALEN+10]; + ArrowField *__afield = NULL; + + snprintf(elem_name, sizeof(elem_name), "_%s[]", attname); + column->element = palloc0(sizeof(SQLfield)); + if (afield) + { + if (afield->_num_children != 1) + elog(ERROR, "Arrow::Field (%s) is not compatible", afield->name); + __afield = &afield->children[0]; + } + __setupArrowSQLbufferField(table, + column->element, + elem_name, + __type->typelem, + -1, + __afield); + } + else if (OidIsValid(__type->typrelid)) + { + /* composite type */ + TupleDesc tupdesc = lookup_rowtype_tupdesc(atttypid, atttypmod); + int j; + + if (afield && afield->_num_children != tupdesc->natts) + elog(ERROR, "Arrow::Field (%s) is not compatible", afield->name); + + column->nfields = tupdesc->natts; + column->subfields = palloc0(sizeof(SQLfield) * tupdesc->natts); + for 
(j=0; j < tupdesc->natts; j++) + { + Form_pg_attribute sattr = tupleDescAttr(tupdesc, j); + ArrowField *__afield = NULL; + + if (afield) + __afield = &afield->children[j]; + __setupArrowSQLbufferField(table, + &column->subfields[j], + NameStr(sattr->attname), + sattr->atttypid, + sattr->atttypmod, + __afield); + } + ReleaseTupleDesc(tupdesc); + } + else if (__type->typtype == 'e') + { + elog(ERROR, "Enum types are not supported right now"); + } + ReleaseSysCache(tup); +} + +static void +setupArrowSQLbufferSchema(SQLtable *table, TupleDesc tupdesc, + ArrowFileInfo *af_info) +{ + int j; + + Assert(!af_info || af_info->footer.schema._num_fields == tupdesc->natts); + table->nfields = tupdesc->natts; + for (j=0; j < tupdesc->natts; j++) + { + Form_pg_attribute attr = tupleDescAttr(tupdesc, j); + ArrowField *afield = NULL; + + if (af_info) + afield = &af_info->footer.schema.fields[j]; + __setupArrowSQLbufferField(table, + &table->columns[j], + NameStr(attr->attname), + attr->atttypid, + attr->atttypmod, + afield); + } + table->segment_sz = (size_t)arrow_record_batch_size_kb << 10; +} + +static void +setupArrowSQLbufferBatches(SQLtable *table, ArrowFileInfo *af_info) +{ + loff_t pos = 0; + int i, nitems; + + /* restore DictionaryBatches already in the file */ + nitems = af_info->footer._num_dictionaries; + table->numDictionaries = nitems; + if (nitems > 0) + { + table->dictionaries = palloc(sizeof(ArrowBlock) * nitems); + memcpy(table->dictionaries, + af_info->footer.dictionaries, + sizeof(ArrowBlock) * nitems); + for (i=0; i < nitems; i++) + { + ArrowBlock *block = &table->dictionaries[i]; + + pos = Max(pos, ARROWALIGN(block->offset + + block->metaDataLength + + block->bodyLength)); + } + } + else + table->dictionaries = NULL; + + /* restore RecordBatches already in the file */ + nitems = af_info->footer._num_recordBatches; + table->numRecordBatches = nitems; + if (nitems > 0) + { + table->recordBatches = palloc(sizeof(ArrowBlock) * nitems); + memcpy(table->recordBatches, + af_info->footer.recordBatches, + sizeof(ArrowBlock) * nitems); + for (i=0; i < nitems; i++) + { + ArrowBlock *block = &table->recordBatches[i]; + + pos = Max(pos, ARROWALIGN(block->offset + + block->metaDataLength + + block->bodyLength)); + } + } + else + table->recordBatches = NULL; + + if (lseek(table->fdesc, pos, SEEK_SET) < 0) + elog(ERROR, "failed on lseek('%s',%lu): %m", + table->filename, pos); + table->f_pos = pos; +} + +/* + * createArrowWriteRedoLog + */ +static loff_t +createArrowWriteRedoLog(File filp, bool is_newfile) +{ + arrowWriteRedoLog *redo; + int fdesc = FileGetRawDesc(filp); + const char *fname = FilePathName(filp); + TransactionId curr_xid = GetCurrentTransactionId(); + CommandId curr_cid = GetCurrentCommandId(true); + dlist_iter iter; + MetadataCacheKey key; + struct stat stat_buf; + size_t main_sz; + + if (fstat(fdesc, &stat_buf) != 0) + elog(ERROR, "failed on fstat(2): %m"); + initMetadataCacheKey(&key, &stat_buf); + + dlist_foreach(iter, &arrow_write_redo_list) + { + redo = dlist_container(arrowWriteRedoLog, chain, iter.cur); + + if (redo->key.st_dev == key.st_dev && + redo->key.st_ino == key.st_ino && + redo->xid == curr_xid && + redo->cid <= curr_cid) + { + elog(ERROR,
"'%s' of the arrow_fdw foreign table is concurrently opened for update; please check the configuration", fname); + } + } + + if (is_newfile) + { + main_sz = MAXALIGN(offsetof(arrowWriteRedoLog, footer_backup)); + redo = MemoryContextAllocZero(CacheMemoryContext, + main_sz + strlen(fname) + 1); + memcpy(&redo->key, &key, sizeof(MetadataCacheKey)); + redo->xid = curr_xid; + redo->cid = curr_cid; + redo->pathname = (char *)redo + main_sz; + strcpy(redo->pathname, fname); + redo->is_truncate = false; + redo->footer_offset = 0; + redo->footer_length = 0; + } + else + { + ssize_t nbytes; + off_t offset; + char temp[100]; + + /* make backup image of the Footer section */ + nbytes = sizeof(int32) + 6; /* = strlen("ARROW1") */ + offset = stat_buf.st_size - nbytes; + if (__preadFile(fdesc, temp, nbytes, offset) != nbytes) + elog(ERROR, "failed on pread(2): %m"); + offset -= *((int32 *)temp); + + nbytes = stat_buf.st_size - offset; + if (nbytes <= 0) + elog(ERROR, "unexpected Apache Arrow file format"); + main_sz = MAXALIGN(offsetof(arrowWriteRedoLog, + footer_backup[nbytes])); + redo = MemoryContextAllocZero(CacheMemoryContext, + main_sz + strlen(fname) + 1); + memcpy(&redo->key, &key, sizeof(MetadataCacheKey)); + redo->xid = curr_xid; + redo->cid = curr_cid; + redo->pathname = (char *)redo + main_sz; + strcpy(redo->pathname, fname); + redo->is_truncate = false; + PG_TRY(); + { + if (__preadFile(fdesc, redo->footer_backup, nbytes, offset) != nbytes) + elog(ERROR, "failed on pread(2): %m"); + if (lseek(fdesc, offset, SEEK_SET) < 0) + elog(ERROR, "failed on lseek(2): %m"); + redo->footer_offset = offset; + redo->footer_length = nbytes; + } + PG_CATCH(); + { + pfree(redo); + PG_RE_THROW(); + } + PG_END_TRY(); + } + elog(DEBUG2, "arrow: redo-log on '%s' (st_dev=%u/st_ino=%u) xid=%u cid=%u offset=%lu length=%zu", + redo->pathname, (uint32)redo->key.st_dev, (uint32)redo->key.st_ino, + (uint32)redo->xid, (uint32)redo->cid, + (uint64)redo->footer_offset, + (uint64)redo->footer_length); + + dlist_push_head(&arrow_write_redo_list, &redo->chain); + + return redo->footer_offset; +} + +/* + * writeOutArrowRecordBatch + */ +static void +writeOutArrowRecordBatch(arrowWriteState *aw_state, bool with_footer) +{ + SQLtable *table = &aw_state->sql_table; + int index = aw_state->hash % ARROW_METADATA_HASH_NSLOTS; + arrowWriteMVCCLog *mvcc = NULL; + + if (table->nitems > 0) + { + mvcc = MemoryContextAllocZero(TopSharedMemoryContext, + sizeof(arrowWriteMVCCLog)); + memcpy(&mvcc->key, &aw_state->key, sizeof(MetadataCacheKey)); + mvcc->xid = GetCurrentTransactionId(); + mvcc->cid = GetCurrentCommandId(true); + } + + PG_TRY(); + { + LWLockAcquire(&arrow_metadata_state->lock_slots[index], + LW_EXCLUSIVE); + /* write out an empty arrow file */ + if (table->f_pos == 0) + { + arrowFileWrite(table, "ARROW1\0\0", 8); + writeArrowSchema(table); + } + if (table->nitems > 0) + { + mvcc->record_batch = writeArrowRecordBatch(table); + sql_table_clear(table); + dlist_push_tail(&arrow_metadata_state->mvcc_slots[index], + &mvcc->chain); + elog(DEBUG2, + "arrow-write: '%s' (st_dev=%u, st_ino=%u), xid=%u, cid=%u, record_batch=%u nitems=%lu", + FilePathName(aw_state->file), + (uint32)mvcc->key.st_dev, (uint32)mvcc->key.st_ino, + (uint32)mvcc->xid, (uint32)mvcc->cid, mvcc->record_batch, + table->nitems); + } + if (with_footer) + writeArrowFooter(table); + + /* + * Invalidation of the metadata cache, if any + * + * NOTE: metadata cache shall be invalidated on the next reference, + * if st_mtime of the file is newer than st_mtime of the mcache.
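+ * (The comparison uses timespec_comp() on both st_mtim and st_ctim; + * see arrowLookupOrBuildMetadataCache above.)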
+ * Linux kernel offers nanosecond precision in st_Xtime, but it never + * guarantees that st_Xtime is actually recorded in nanosecond + * precision... + */ + arrowInvalidateMetadataCache(&aw_state->key, true); + + LWLockRelease(&arrow_metadata_state->lock_slots[index]); + } + PG_CATCH(); + { + if (mvcc) + pfree(mvcc); + PG_RE_THROW(); + } + PG_END_TRY(); +} + +/* + * TRUNCATE support + */ +static void +__arrowExecTruncateRelation(Relation frel) +{ + TupleDesc tupdesc = RelationGetDescr(frel); + Oid frel_oid = RelationGetRelid(frel); + ForeignTable *ft = GetForeignTable(frel_oid); + arrowWriteRedoLog *redo; + ArrowFileInfo af_info; + struct stat stat_buf; + MetadataCacheKey key; + int index; + List *filesList; + SQLtable *table; + const char *path_name; + const char *dir_name; + const char *file_name; + size_t main_sz; + int fdesc = -1; + char backup_path[MAXPGPATH]; + bool writable; + + filesList = __arrowFdwExtractFilesList(ft->options, + NULL, + &writable); + if (!writable) + elog(ERROR, "arrow_fdw: foreign table \"%s\" is not writable", + RelationGetRelationName(frel)); + Assert(list_length(filesList) == 1); + path_name = strVal(linitial(filesList)); + readArrowFile(path_name, &af_info, false); + if (stat(path_name, &stat_buf) != 0) + elog(ERROR, "failed on stat('%s'): %m", path_name); + /* metadata cache invalidation */ + index = initMetadataCacheKey(&key, &stat_buf); + LWLockAcquire(&arrow_metadata_state->lock_slots[index], LW_EXCLUSIVE); + arrowInvalidateMetadataCache(&key, true); + LWLockRelease(&arrow_metadata_state->lock_slots[index]); + + /* build SQLtable to write out schema */ + table = palloc0(offsetof(SQLtable, columns[tupdesc->natts])); + setupArrowSQLbufferSchema(table, tupdesc, &af_info); + + /* create REDO log entry */ + main_sz = MAXALIGN(offsetof(arrowWriteRedoLog, footer_backup)); + redo = MemoryContextAllocZero(CacheMemoryContext, + main_sz + strlen(path_name) + 1); + redo->xid = GetCurrentTransactionId(); + redo->cid = GetCurrentCommandId(true); + redo->pathname = (char *)redo + main_sz; + strcpy(redo->pathname, path_name); + redo->is_truncate = true; + + PG_TRY(); + { + /* + * move the current arrow file to a backup + */ + dir_name = dirname(pstrdup(path_name)); + file_name = basename(pstrdup(path_name)); + for (;;) + { + redo->suffix = random(); + snprintf(backup_path, sizeof(backup_path), + "%s/%s.%u.backup", + dir_name, file_name, redo->suffix); + if (stat(backup_path, &stat_buf) != 0) + { + if (errno == ENOENT) + break; + elog(ERROR, "failed on stat('%s'): %m", backup_path); + } + } + if (rename(path_name, backup_path) != 0) + elog(ERROR, "failed on rename('%s','%s'): %m", + path_name, backup_path); + + /* + * create an empty arrow file + */ + PG_TRY(); + { + fdesc = open(path_name, O_RDWR | O_CREAT | O_EXCL, 0600); + if (fdesc < 0) + elog(ERROR, "failed on open('%s'): %m", path_name); + if (fstat(fdesc, &stat_buf) != 0) + elog(ERROR, "failed on fstat('%s'): %m", path_name); + initMetadataCacheKey(&redo->key, &stat_buf); + table->filename = path_name; + table->fdesc = fdesc; + arrowFileWrite(table, "ARROW1\0\0", 8); + writeArrowSchema(table); + writeArrowFooter(table); + } + PG_CATCH(); + { + if (fdesc >= 0) + close(fdesc); + if (rename(backup_path, path_name) != 0) + elog(WARNING, "failed on rename('%s', '%s'): %m", + backup_path, path_name); + PG_RE_THROW(); + } + PG_END_TRY(); + close(fdesc); + } + PG_CATCH(); + { + pfree(redo); + PG_RE_THROW(); + } + PG_END_TRY(); + /* save the REDO log entry */ + dlist_push_head(&arrow_write_redo_list, &redo->chain); +} + +#if
PG_VERSION_NUM >= 140000 +/* + * TRUNCATE support + */ +static void +ArrowExecForeignTruncate(List *rels, DropBehavior behavior, bool restart_seqs) +{ + ListCell *lc; + + foreach (lc, rels) + { + Relation frel = lfirst(lc); + + __arrowExecTruncateRelation(frel); + } +} +#endif + +/* + * pgstrom_arrow_fdw_truncate + */ +Datum +pgstrom_arrow_fdw_truncate(PG_FUNCTION_ARGS) +{ +#if PG_VERSION_NUM < 140000 + Oid frel_oid = PG_GETARG_OID(0); + Relation frel; + FdwRoutine *routine; + + frel = table_open(frel_oid, AccessExclusiveLock); + if (frel->rd_rel->relkind != RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not an arrow_fdw foreign table", + RelationGetRelationName(frel)))); + routine = GetFdwRoutineForRelation(frel, false); + if (memcmp(routine, &pgstrom_arrow_fdw_routine, sizeof(FdwRoutine)) != 0) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not an arrow_fdw foreign table", + RelationGetRelationName(frel)))); + __arrowExecTruncateRelation(frel); + + table_close(frel, NoLock); +#else + elog(ERROR, "PostgreSQL v14 or later supports TRUNCATE on foreign tables; use the standard TRUNCATE statement instead of this legacy interface"); +#endif + PG_RETURN_VOID(); +} +PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_truncate); + +static void +__applyArrowTruncateRedoLog(arrowWriteRedoLog *redo, bool is_commit) +{ + char backup[MAXPGPATH]; + + snprintf(backup, MAXPGPATH, "%s.%u.backup", + redo->pathname, redo->suffix); + if (is_commit) + { + elog(DEBUG2, "arrow-redo: unlink [%s]", backup); + if (unlink(backup) != 0) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove truncated file \"%s\": %m", + backup), + errhint("remove \"%s\" manually", backup))); + } + else + { + elog(DEBUG2, "arrow-redo: rename [%s]->[%s]", backup, redo->pathname); + if (rename(backup, redo->pathname) != 0) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not restore backup file \"%s\": %m", + backup), + errhint("please restore \"%s\" to \"%s\" manually", + backup, redo->pathname))); + arrowInvalidateMetadataCache(&redo->key, true); + } +} + +static void +__applyArrowInsertRedoLog(arrowWriteRedoLog *redo, bool is_commit) +{ + int fdesc; + + if (is_commit) + return; + + /* special case, if it was an empty file */ + if (redo->footer_offset == 0 && + redo->footer_length == 0) + { + if (unlink(redo->pathname) != 0) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("failed on unlink('%s'): %m", redo->pathname), + errdetail("could not apply REDO image, therefore, garbage may still remain"))); + return; + } + + fdesc = open(redo->pathname, O_RDWR); + if (fdesc < 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("failed on open('%s'): %m", redo->pathname), + errdetail("could not apply REDO image, therefore, the arrow file might be corrupted"))); + } + else if (lseek(fdesc, redo->footer_offset, SEEK_SET) < 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("failed on lseek('%s'): %m", redo->pathname), + errdetail("could not apply REDO image, therefore, the arrow file might be corrupted"))); + } + else if (__writeFile(fdesc, + redo->footer_backup, + redo->footer_length) != redo->footer_length) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("failed on write('%s'): %m", redo->pathname), + errdetail("could not apply REDO image, therefore, the arrow file might be corrupted"))); + } + else if (ftruncate(fdesc, (redo->footer_offset + + redo->footer_length)) != 0) + { + ereport(WARNING, + (errcode_for_file_access(), +
errmsg("failed on ftruncate('%s'): %m", redo->pathname), + errdetail("could not apply REDO image, therefore, arrow file might be corrupted"))); + } + /* invalidation of the metadata-cache */ + arrowInvalidateMetadataCache(&redo->key, true); + + close(fdesc); + + elog(DEBUG2, "arrow_fdw: REDO log applied (xid=%u, cid=%u, file=[%s], offset=%zu, length=%zu)", redo->xid, redo->cid, redo->pathname, redo->footer_offset, redo->footer_length); +} + +static void +__cleanupArrowWriteMVCCLog(TransactionId curr_xid, dlist_head *mvcc_slot) +{ + dlist_mutable_iter iter; + + dlist_foreach_modify(iter, mvcc_slot) + { + arrowWriteMVCCLog *mvcc = dlist_container(arrowWriteMVCCLog, + chain, iter.cur); + if (mvcc->xid == curr_xid) + { + dlist_delete(&mvcc->chain); + elog(DEBUG2, "arrow: release mvcc-log (st_dev=%u, st_ino=%u), xid=%u, cid=%u, record_batch=%u", + (uint32)mvcc->key.st_dev, (uint32)mvcc->key.st_ino, + (uint32)mvcc->xid, (uint32)mvcc->cid, mvcc->record_batch); + pfree(mvcc); + } + } +} + +/* + * __arrowFdwXactCallback + */ +static void +__arrowFdwXactCallback(TransactionId curr_xid, bool is_commit) +{ + arrowWriteRedoLog *redo; + dlist_mutable_iter iter; + CommandId curr_cid = InvalidCommandId; + uint32 index; + bool locked[ARROW_METADATA_HASH_NSLOTS]; + LWLock *locks[ARROW_METADATA_HASH_NSLOTS]; + uint32 lcount = 0; + + if (curr_xid == InvalidTransactionId || + dlist_is_empty(&arrow_write_redo_list)) + return; + + memset(locked, 0, sizeof(locked)); + dlist_foreach_modify(iter, &arrow_write_redo_list) + { + redo = dlist_container(arrowWriteRedoLog, chain, iter.cur); + if (redo->xid != curr_xid) + continue; + if (curr_cid != InvalidCommandId && + curr_cid < redo->cid) + elog(WARNING, "Bug? Order of REDO log is not be correct. ABORT transaction might generate wrong image restored."); + + index = redo->key.hash % ARROW_METADATA_HASH_NSLOTS; + if (!locked[index]) + { + LWLock *lock = &arrow_metadata_state->lock_slots[index]; + dlist_head *slot = &arrow_metadata_state->mvcc_slots[index]; + + LWLockAcquire(lock, LW_EXCLUSIVE); + __cleanupArrowWriteMVCCLog(curr_xid, slot); + locked[index] = true; + locks[lcount++] = lock; + } + if (redo->is_truncate) + __applyArrowTruncateRedoLog(redo, is_commit); + else + __applyArrowInsertRedoLog(redo, is_commit); + + dlist_delete(&redo->chain); + pfree(redo); + } + + for (index=0; index < lcount; index++) + LWLockRelease(locks[index]); +} + +/* + * arrowFdwXactCallback + */ +static void +arrowFdwXactCallback(XactEvent event, void *arg) +{ + TransactionId curr_xid = GetCurrentTransactionIdIfAny(); + + if (event == XACT_EVENT_COMMIT) + __arrowFdwXactCallback(curr_xid, true); + else if (event == XACT_EVENT_ABORT) + __arrowFdwXactCallback(curr_xid, false); +} + +/* + * arrowFdwSubXactCallback + */ +static void +arrowFdwSubXactCallback(SubXactEvent event, SubTransactionId mySubid, + SubTransactionId parentSubid, void *arg) +{ + TransactionId curr_xid = GetCurrentTransactionIdIfAny(); + + if (event == SUBXACT_EVENT_COMMIT_SUB) + __arrowFdwXactCallback(curr_xid, true); + else if (event == SUBXACT_EVENT_ABORT_SUB) + __arrowFdwXactCallback(curr_xid, false); +} + +/* + * pgstrom_request_arrow_fdw + */ +static void +pgstrom_request_arrow_fdw(void) +{ + if (shmem_request_next) + shmem_request_next(); + RequestAddinShmemSpace(MAXALIGN(sizeof(arrowMetadataState))); +} + +/* + * pgstrom_startup_arrow_fdw + */ +static void +pgstrom_startup_arrow_fdw(void) +{ + bool found; + int i; + + if (shmem_startup_next) + (*shmem_startup_next)(); + + arrow_metadata_state = + 
ShmemInitStruct("arrow_metadata_state", + MAXALIGN(sizeof(arrowMetadataState)), + &found); + if (!IsUnderPostmaster) + { + SpinLockInit(&arrow_metadata_state->lru_lock); + dlist_init(&arrow_metadata_state->lru_list); + pg_atomic_init_u64(&arrow_metadata_state->consumed, 0UL); + for (i=0; i < ARROW_METADATA_HASH_NSLOTS; i++) + { + LWLockInitialize(&arrow_metadata_state->lock_slots[i], -1); + dlist_init(&arrow_metadata_state->hash_slots[i]); + dlist_init(&arrow_metadata_state->mvcc_slots[i]); + } + } +} + +/* + * pgstrom_init_arrow_fdw + */ +void +pgstrom_init_arrow_fdw(void) +{ + FdwRoutine *r = &pgstrom_arrow_fdw_routine; + + memset(r, 0, sizeof(FdwRoutine)); + NodeSetTag(r, T_FdwRoutine); + /* SCAN support */ + r->GetForeignRelSize = ArrowGetForeignRelSize; + r->GetForeignPaths = ArrowGetForeignPaths; + r->GetForeignPlan = ArrowGetForeignPlan; + r->BeginForeignScan = ArrowBeginForeignScan; + r->IterateForeignScan = ArrowIterateForeignScan; + r->ReScanForeignScan = ArrowReScanForeignScan; + r->EndForeignScan = ArrowEndForeignScan; + /* EXPLAIN support */ + r->ExplainForeignScan = ArrowExplainForeignScan; + /* ANALYZE support */ + r->AnalyzeForeignTable = ArrowAnalyzeForeignTable; + /* IMPORT FOREIGN SCHEMA support */ + r->ImportForeignSchema = ArrowImportForeignSchema; +#if PG_VERSION_NUM >= 140000 + r->ExecForeignTruncate = ArrowExecForeignTruncate; +#endif + /* CPU Parallel support */ + r->IsForeignScanParallelSafe = ArrowIsForeignScanParallelSafe; + r->EstimateDSMForeignScan = ArrowEstimateDSMForeignScan; + r->InitializeDSMForeignScan = ArrowInitializeDSMForeignScan; + r->ReInitializeDSMForeignScan = ArrowReInitializeDSMForeignScan; + r->InitializeWorkerForeignScan = ArrowInitializeWorkerForeignScan; + r->ShutdownForeignScan = ArrowShutdownForeignScan; + /* INSERT/DELETE support */ + r->PlanForeignModify = ArrowPlanForeignModify; + r->BeginForeignModify = ArrowBeginForeignModify; + r->ExecForeignInsert = ArrowExecForeignInsert; + r->EndForeignModify = ArrowEndForeignModify; +#if PG_VERSION_NUM >= 110000 + r->BeginForeignInsert = ArrowBeginForeignInsert; + r->EndForeignInsert = ArrowEndForeignInsert; +#endif + r->ExplainForeignModify = ArrowExplainForeignModify; + + /* + * Turn on/off arrow_fdw + */ + DefineCustomBoolVariable("arrow_fdw.enabled", + "Enables the planner's use of Arrow_Fdw", + NULL, + &arrow_fdw_enabled, + true, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* + * Turn on/off min/max statistics hint + */ + DefineCustomBoolVariable("arrow_fdw.stats_hint_enabled", + "Enables min/max statistics hint, if any", + NULL, + &arrow_fdw_stats_hint_enabled, + true, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* + * Configurations for arrow_fdw metadata cache + */ + DefineCustomIntVariable("arrow_fdw.metadata_cache_size", + "size of shared metadata cache for arrow files", + NULL, + &arrow_metadata_cache_size_kb, + 131072, /* 128MB */ + 32768, /* 32MB */ + INT_MAX, + PGC_POSTMASTER, + GUC_NOT_IN_SAMPLE | GUC_UNIT_KB, + NULL, NULL, NULL); + arrow_metadata_cache_size = (size_t)arrow_metadata_cache_size_kb << 10; + + /* + * Limit of RecordBatch size for writing + */ + DefineCustomIntVariable("arrow_fdw.record_batch_size", + "maximum size of record batch on writing", + NULL, + &arrow_record_batch_size_kb, + 256 * 1024, /* default: 256MB */ + 4 * 1024, /* min: 4MB */ + 2048 * 1024, /* max: 2GB */ + PGC_USERSET, + GUC_NOT_IN_SAMPLE | GUC_UNIT_KB, + NULL, NULL, NULL); + + /* shared memory size */ + shmem_request_next = shmem_request_hook; + shmem_request_hook = 
pgstrom_request_arrow_fdw; + shmem_startup_next = shmem_startup_hook; + shmem_startup_hook = pgstrom_startup_arrow_fdw; + + /* transaction callback */ + RegisterXactCallback(arrowFdwXactCallback, NULL); + RegisterSubXactCallback(arrowFdwSubXactCallback, NULL); + + /* misc init */ + dlist_init(&arrow_write_redo_list); +} diff --git a/next/arrow_ipc.h b/old/arrow_ipc.h similarity index 99% rename from next/arrow_ipc.h rename to old/arrow_ipc.h index ea0d9f50d..6abc07e7a 100644 --- a/next/arrow_ipc.h +++ b/old/arrow_ipc.h @@ -3,8 +3,8 @@ * * Definitions for Apache Arrow IPC stuff. * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. diff --git a/next/arrow_nodes.c b/old/arrow_nodes.c similarity index 95% rename from next/arrow_nodes.c rename to old/arrow_nodes.c index f7cb6e820..449108012 100644 --- a/next/arrow_nodes.c +++ b/old/arrow_nodes.c @@ -4,8 +4,8 @@ * Routines to handle ArrowNode objects, intermediation of PostgreSQL types * and Apache Arrow types. * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. @@ -1947,16 +1947,14 @@ readArrowFooter(ArrowFooter *node, const char *pos) #define ARROW_FILE_TAIL_SIGNATURE "ARROW1" #define ARROW_FILE_TAIL_SIGNATURE_SZ (sizeof(ARROW_FILE_TAIL_SIGNATURE) - 1) -#ifndef __PGSTROM_MODULE__ -#define PG_TRY() \ - if (true) { \ - bool __dummy__ __attribute__((unused)) -#define PG_FINALLY() \ - } else { \ - bool __dummy__ __attribute__((unused)) -#define PG_END_TRY() \ - } -#endif +#ifdef __PGSTROM_MODULE__ +#include "pg_strom.h" +#define __mmap(a,b,c,d,e,f) __mmapFile((a),(b),(c),(d),(e),(f)) +#define __munmap(a,b) __munmapFile((a)) +#else +#define __mmap(a,b,c,d,e,f) mmap((a),(b),(c),(d),(e),(f)) +#define __munmap(a,b) munmap((a),(b)) +#endif /* __PGSTROM_MODULE__ */ void readArrowFileDesc(int fdesc, ArrowFileInfo *af_info) @@ -1977,93 +1975,85 @@ readArrowFileDesc(int fdesc, ArrowFileInfo *af_info) if (__PAGE_SIZE == 0) __PAGE_SIZE = sysconf(_SC_PAGESIZE); mmap_sz = ((file_sz + __PAGE_SIZE - 1) & ~(__PAGE_SIZE - 1)); - mmap_head = mmap(NULL, mmap_sz, PROT_READ, MAP_SHARED, fdesc, 0); + mmap_head = __mmap(NULL, mmap_sz, PROT_READ, MAP_SHARED, fdesc, 0); if (mmap_head == MAP_FAILED) Elog("failed on mmap: %m"); mmap_tail = mmap_head + file_sz - ARROW_FILE_TAIL_SIGNATURE_SZ; /* check signature */ - PG_TRY(); + if (memcmp(mmap_head, + ARROW_FILE_HEAD_SIGNATURE, + ARROW_FILE_HEAD_SIGNATURE_SZ) != 0 || + memcmp(mmap_tail, + ARROW_FILE_TAIL_SIGNATURE, + ARROW_FILE_TAIL_SIGNATURE_SZ) != 0) { - if (memcmp(mmap_head, - ARROW_FILE_HEAD_SIGNATURE, - ARROW_FILE_HEAD_SIGNATURE_SZ) != 0 || - memcmp(mmap_tail, - ARROW_FILE_TAIL_SIGNATURE, - ARROW_FILE_TAIL_SIGNATURE_SZ) != 0) - { - Elog("Signature mismatch on Apache Arrow file"); - } + Elog("Signature mismatch on Apache Arrow file"); + } - /* Read Footer chunk */ - pos = mmap_tail - sizeof(int32_t); - offset = *((int32_t *)pos); - pos -= offset; - offset = *((int32_t *)pos); - readArrowFooter(&af_info->footer, pos + offset); + /* Read Footer chunk */ + pos = mmap_tail - sizeof(int32_t); + offset = 
*((int32_t *)pos); + pos -= offset; + offset = *((int32_t *)pos); + readArrowFooter(&af_info->footer, pos + offset); - /* Read DictionaryBatch chunks */ - nitems = af_info->footer._num_dictionaries; - if (nitems > 0) + /* Read DictionaryBatch chunks */ + nitems = af_info->footer._num_dictionaries; + if (nitems > 0) + { + af_info->dictionaries = palloc0(nitems * sizeof(ArrowMessage)); + for (i=0; i < nitems; i++) { - af_info->dictionaries = palloc0(nitems * sizeof(ArrowMessage)); - for (i=0; i < nitems; i++) + ArrowBlock *b = &af_info->footer.dictionaries[i]; + ArrowMessage *m = &af_info->dictionaries[i]; + int32_t *ival = (int32_t *)(mmap_head + b->offset); + int32_t metaLength __attribute__((unused)); + int32_t *headOffset; + + if (*ival == 0xffffffff) + { + metaLength = ival[1]; + headOffset = ival + 2; + } + else { - ArrowBlock *b = &af_info->footer.dictionaries[i]; - ArrowMessage *m = &af_info->dictionaries[i]; - int32_t *ival = (int32_t *)(mmap_head + b->offset); - int32_t metaLength __attribute__((unused)); - int32_t *headOffset; - - if (*ival == 0xffffffff) - { - metaLength = ival[1]; - headOffset = ival + 2; - } - else - { - /* Older format prior to Arrow v0.15 */ - metaLength = *ival; - headOffset = ival + 1; - } - pos = (const char *)headOffset + *headOffset; - readArrowMessage(m, pos); + /* Older format prior to Arrow v0.15 */ + metaLength = *ival; + headOffset = ival + 1; } + pos = (const char *)headOffset + *headOffset; + readArrowMessage(m, pos); } + } - /* Read RecordBatch chunks */ - nitems = af_info->footer._num_recordBatches; - if (nitems > 0) + /* Read RecordBatch chunks */ + nitems = af_info->footer._num_recordBatches; + if (nitems > 0) + { + af_info->recordBatches = palloc0(nitems * sizeof(ArrowMessage)); + for (i=0; i < nitems; i++) { - af_info->recordBatches = palloc0(nitems * sizeof(ArrowMessage)); - for (i=0; i < nitems; i++) + ArrowBlock *b = &af_info->footer.recordBatches[i]; + ArrowMessage *m = &af_info->recordBatches[i]; + int32_t *ival = (int32_t *)(mmap_head + b->offset); + int32_t metaLength __attribute__((unused)); + int32_t *headOffset; + + if (*ival == 0xffffffff) { - ArrowBlock *b = &af_info->footer.recordBatches[i]; - ArrowMessage *m = &af_info->recordBatches[i]; - int32_t *ival = (int32_t *)(mmap_head + b->offset); - int32_t metaLength __attribute__((unused)); - int32_t *headOffset; - - if (*ival == 0xffffffff) - { - metaLength = ival[1]; - headOffset = ival + 2; - } - else - { - /* Older format prior to Arrow v0.15 */ - metaLength = *ival; - headOffset = ival + 1; - } - pos = (const char *)headOffset + *headOffset; - readArrowMessage(m, pos); + metaLength = ival[1]; + headOffset = ival + 2; } + else + { + /* Older format prior to Arrow v0.15 */ + metaLength = *ival; + headOffset = ival + 1; + } + pos = (const char *)headOffset + *headOffset; + readArrowMessage(m, pos); } - munmap(mmap_head, mmap_sz); - } - PG_FINALLY(); - { - munmap(mmap_head, mmap_sz); } - PG_END_TRY(); + __munmap(mmap_head, mmap_sz); } diff --git a/src/arrow_pgsql.c b/old/arrow_pgsql.c similarity index 100% rename from src/arrow_pgsql.c rename to old/arrow_pgsql.c diff --git a/src/arrow_write.c b/old/arrow_write.c similarity index 100% rename from src/arrow_write.c rename to old/arrow_write.c diff --git a/old/codegen.c b/old/codegen.c new file mode 100644 index 000000000..914d04160 --- /dev/null +++ b/old/codegen.c @@ -0,0 +1,4929 @@ +/* + * codegen.c + * + * Routines for CUDA code generator + * ---- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom 
Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. + */ +#include "pg_strom.h" +#include "cuda_numeric.h" +#include "cuda_postgis.h" + +static MemoryContext devinfo_memcxt; +static dlist_head devtype_info_slot[128]; +static dlist_head devfunc_info_slot[1024]; +static dlist_head devcast_info_slot[48]; +static dlist_head devindex_info_slot[48]; + +static cl_uint generic_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_int1_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_int2_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_int4_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_int8_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_float2_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_float4_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_float8_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_numeric_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_interval_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_bpchar_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_inet_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_jsonb_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_range_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_geometry_devtype_hashfunc(devtype_info *dtype, Datum datum); +static cl_uint pg_box2df_devtype_hashfunc(devtype_info *dtype, Datum datum); + +/* callback to handle special cases of device cast */ +static int devcast_text2numeric_callback(codegen_context *context, + StringInfo body, + devcast_info *dcast, + CoerceViaIO *node); +/* error report */ +#define __ELog(fmt, ...) 
\ + ereport(ERROR, \ + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ + errmsg((fmt), ##__VA_ARGS__))) + +/* known extension name */ +#define PGSTROM "pg_strom" +#define POSTGIS3 "postgis" + +/* + * Catalog of data types supported by device code + * + * naming convention of types: + * pg_<type_name>_t + */ + +/* + * MEMO: PG10 does not have OID definitions below + */ +#ifndef INT8RANGEOID +#define INT8RANGEOID 3926 +#endif +#ifndef TSRANGEOID +#define TSRANGEOID 3908 +#endif +#ifndef TSTZRANGEOID +#define TSTZRANGEOID 3910 +#endif +#ifndef DATERANGEOID +#define DATERANGEOID 3912 +#endif + +static struct { + const char *type_extension; + const char *type_name; + Oid type_oid_fixed; /* can be InvalidOid if not built-in */ + const char *type_oid_label; + cl_uint type_flags; /* library to declare this type */ + cl_uint extra_sz; /* required size to store internal form */ + devtype_hashfunc_type hash_func; +} devtype_catalog[] = { + /* + * Primitive datatypes + */ + { NULL, "bool", BOOLOID, "BOOLOID", + 0, 0, generic_devtype_hashfunc + }, + { PGSTROM, "int1", INT1OID, "INT1OID", + 0, 0, pg_int1_devtype_hashfunc + }, + { NULL, "int2", INT2OID, "INT2OID", + 0, 0, pg_int2_devtype_hashfunc + }, + { NULL, "int4", INT4OID, "INT4OID", + 0, 0, pg_int4_devtype_hashfunc + }, + { NULL, "int8", INT8OID, "INT8OID", + 0, 0, pg_int8_devtype_hashfunc + }, + /* XXX - float2 is not a built-in data type */ + { PGSTROM, "float2", FLOAT2OID, "FLOAT2OID", + 0, 0, pg_float2_devtype_hashfunc + }, + { NULL, "float4", FLOAT4OID, "FLOAT4OID", + 0, 0, pg_float4_devtype_hashfunc + }, + { NULL, "float8", FLOAT8OID, "FLOAT8OID", + 0, 0, pg_float8_devtype_hashfunc + }, + /* + * Misc data types + */ + { NULL, "money", CASHOID, "CASHOID", + DEVKERNEL_NEEDS_MISCLIB, 0, + generic_devtype_hashfunc + }, + { NULL, "uuid", UUIDOID, "UUIDOID", + DEVKERNEL_NEEDS_MISCLIB, UUID_LEN, + generic_devtype_hashfunc + }, + { NULL, "macaddr", MACADDROID, "MACADDROID", + DEVKERNEL_NEEDS_MISCLIB, sizeof(macaddr), + generic_devtype_hashfunc + }, + { NULL, "inet", INETOID, "INETOID", + DEVKERNEL_NEEDS_MISCLIB, sizeof(inet), + pg_inet_devtype_hashfunc + }, + { NULL, "cidr", CIDROID, "CIDROID", + DEVKERNEL_NEEDS_MISCLIB, sizeof(inet), + pg_inet_devtype_hashfunc + }, + /* + * Date and time datatypes + */ + { NULL, "date", DATEOID, "DATEOID", + DEVKERNEL_NEEDS_TIMELIB, 0, + generic_devtype_hashfunc + }, + { NULL, "time", TIMEOID, "TIMEOID", + DEVKERNEL_NEEDS_TIMELIB, 0, + generic_devtype_hashfunc + }, + { NULL, "timetz", TIMETZOID, "TIMETZOID", + DEVKERNEL_NEEDS_TIMELIB, sizeof(TimeTzADT), + generic_devtype_hashfunc + }, + { NULL, "timestamp", TIMESTAMPOID, "TIMESTAMPOID", + DEVKERNEL_NEEDS_TIMELIB, 0, + generic_devtype_hashfunc + }, + { NULL, "timestamptz", TIMESTAMPTZOID, "TIMESTAMPTZOID", + DEVKERNEL_NEEDS_TIMELIB, 0, + generic_devtype_hashfunc + }, + { NULL, "interval", INTERVALOID, "INTERVALOID", + DEVKERNEL_NEEDS_TIMELIB, sizeof(Interval), + pg_interval_devtype_hashfunc + }, + /* + * variable length datatypes + */ + { NULL, "bpchar", BPCHAROID, "BPCHAROID", + DEVKERNEL_NEEDS_TEXTLIB, 0, + pg_bpchar_devtype_hashfunc + }, + { NULL, "varchar", VARCHAROID, "VARCHAROID", + DEVKERNEL_NEEDS_TEXTLIB, 0, + generic_devtype_hashfunc + }, + { NULL, "numeric", NUMERICOID, "NUMERICOID", + 0, sizeof(struct NumericData), + pg_numeric_devtype_hashfunc + }, + { NULL, "bytea", BYTEAOID, "BYTEAOID", + 0, sizeof(pg_varlena_t), + generic_devtype_hashfunc + }, + { NULL, "text", TEXTOID, "TEXTOID", + DEVKERNEL_NEEDS_TEXTLIB, sizeof(pg_varlena_t), + generic_devtype_hashfunc + }, + {
NULL, "jsonb", JSONBOID, "JSONBOID", + DEVKERNEL_NEEDS_JSONLIB, + /* see comment at vlbuf_estimate_jsonb() */ + TOAST_TUPLE_THRESHOLD, + pg_jsonb_devtype_hashfunc + }, + /* + * range types + */ + { NULL, "int4range", INT4RANGEOID, "INT4RANGEOID", + DEVKERNEL_NEEDS_RANGETYPE, + sizeof(RangeType) + 2 * sizeof(cl_int) + 1, + pg_range_devtype_hashfunc + }, + { NULL, "int8range", INT8RANGEOID, "INT8RANGEOID", + DEVKERNEL_NEEDS_RANGETYPE, + sizeof(RangeType) + 2 * sizeof(cl_long) + 1, + pg_range_devtype_hashfunc + }, + { NULL, "tsrange", TSRANGEOID, "TSRANGEOID", + DEVKERNEL_NEEDS_TIMELIB | DEVKERNEL_NEEDS_RANGETYPE, + sizeof(RangeType) + 2 * sizeof(Timestamp) + 1, + pg_range_devtype_hashfunc + }, + { NULL, "tstzrange", TSTZRANGEOID, "TSTZRANGEOID", + DEVKERNEL_NEEDS_TIMELIB | DEVKERNEL_NEEDS_RANGETYPE, + sizeof(RangeType) + 2 * sizeof(TimestampTz) + 1, + pg_range_devtype_hashfunc + }, + { NULL, "daterange", DATERANGEOID, "DATERANGEOID", + DEVKERNEL_NEEDS_TIMELIB | DEVKERNEL_NEEDS_RANGETYPE, + sizeof(RangeType) + 2 * sizeof(DateADT) + 1, + pg_range_devtype_hashfunc + }, + /* + * PostGIS types + */ + { POSTGIS3, "geometry", InvalidOid, "GEOMETRYOID", + DEVKERNEL_NEEDS_POSTGIS, + sizeof(pg_geometry_t), + pg_geometry_devtype_hashfunc + }, + { POSTGIS3, "box2df", InvalidOid, "BOX2DFOID", + DEVKERNEL_NEEDS_POSTGIS, + sizeof(pg_box2df_t), + pg_box2df_devtype_hashfunc + } +}; + +static const char * +get_extension_name_by_object(Oid class_id, Oid object_id) +{ + Relation rel; + ScanKeyData skeys[2]; + SysScanDesc scan; + HeapTuple htup; + const char *ext_name = NULL; + + ScanKeyInit(&skeys[0], + Anum_pg_depend_classid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(class_id)); + ScanKeyInit(&skeys[1], + Anum_pg_depend_objid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(object_id)); + + rel = table_open(DependRelationId, AccessShareLock); + scan = systable_beginscan(rel, DependDependerIndexId, true, + NULL, 2, skeys); + while (HeapTupleIsValid(htup = systable_getnext(scan))) + { + Form_pg_depend dep = (Form_pg_depend) GETSTRUCT(htup); + const char *__ext_name; + + if (dep->refclassid == ExtensionRelationId && + dep->deptype == DEPENDENCY_EXTENSION) + { + __ext_name = get_extension_name(dep->refobjid); + if (__ext_name) + ext_name = quote_identifier(__ext_name); + break; + } + } + systable_endscan(scan); + table_close(rel, AccessShareLock); + + return ext_name; +} + +static void +append_string_devtype_identifier(StringInfo buf, Oid type_oid) +{ + HeapTuple htup; + Form_pg_type type_form; + char *nsp_name; + + htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type_oid)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for type %u", type_oid); + type_form = (Form_pg_type) GETSTRUCT(htup); + + nsp_name = get_namespace_name(type_form->typnamespace); + if (!nsp_name) + elog(ERROR, "cache lookup failed for namespace %u", type_form->typnamespace); + appendStringInfo(buf, "%s.%s", + quote_identifier(nsp_name), + quote_identifier(NameStr(type_form->typname))); + ReleaseSysCache(htup); +} + +/* + * build_extra_devtype_info + * + * it queries the extra device type support + */ +static devtype_info * +build_extra_devtype_info(TypeCacheEntry *tcache, const char *ext_name) +{ + StringInfoData ident; + devtype_info __dtype; + devtype_info *dtype = NULL; + int i; + + /* setup arguments */ + initStringInfo(&ident); + append_string_devtype_identifier(&ident, tcache->type_id); + + memset(&__dtype, 0, sizeof(devtype_info)); + __dtype.type_extension = ext_name; + __dtype.type_oid = 
tcache->type_id; + __dtype.type_flags = 0; + __dtype.type_length = tcache->typlen; + __dtype.type_align = typealign_get_width(tcache->typalign); + __dtype.type_byval = tcache->typbyval; + __dtype.type_name = NULL; /* callback must set the device type name */ + __dtype.extra_sz = 0; + __dtype.hash_func = NULL; + __dtype.type_eqfunc = get_opcode(tcache->eq_opr); + __dtype.type_cmpfunc = tcache->cmp_proc; + + for (i=0; i < pgstrom_num_users_extra; i++) + { + pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; + + if (extra->lookup_extra_devtype && + extra->lookup_extra_devtype(ident.data, &__dtype)) + { + MemoryContext oldcxt; + + /* must be still base type */ + Assert(__dtype.type_element == NULL && + __dtype.comp_nfields == 0); + if (!__dtype.type_name) + { + elog(DEBUG2, "Extra module didn't set device type name for '%s'", + format_type_be(tcache->type_id)); + continue; + } + oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + dtype = pmemdup(&__dtype, offsetof(devtype_info, comp_subtypes[0])); + if (__dtype.type_extension) + dtype->type_extension = pstrdup(__dtype.type_extension); + dtype->type_name = pstrdup(__dtype.type_name); + dtype->type_flags |= extra->extra_flags; + MemoryContextSwitchTo(oldcxt); + break; + } + } + pfree(ident.data); + return dtype; +} + +static devtype_info * +build_basic_devtype_info(TypeCacheEntry *tcache, const char *ext_name) +{ + HeapTuple htup; + Form_pg_type type_form; + const char *type_name; + devtype_info *entry = NULL; + int i; + + htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(tcache->type_id)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for type %u", tcache->type_id); + type_form = (Form_pg_type) GETSTRUCT(htup); + type_name = NameStr(type_form->typname); + + for (i=0; i < lengthof(devtype_catalog); i++) + { + const char *__ext_name = devtype_catalog[i].type_extension; + const char *__type_name = devtype_catalog[i].type_name; + + if (ext_name) + { + if (!__ext_name || strcmp(ext_name, __ext_name) != 0) + continue; + } + else + { + if (__ext_name || type_form->typnamespace != PG_CATALOG_NAMESPACE) + continue; + } + + if (strcmp(type_name, __type_name) == 0) + { + MemoryContext oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + + entry = palloc0(offsetof(devtype_info, comp_subtypes[0])); + if (ext_name) + entry->type_extension = pstrdup(ext_name); + entry->type_oid = tcache->type_id; + entry->type_flags = devtype_catalog[i].type_flags; + entry->type_length = tcache->typlen; + entry->type_align = typealign_get_width(tcache->typalign); + entry->type_byval = tcache->typbyval; + entry->type_name = devtype_catalog[i].type_name; /* const */ + entry->extra_sz = devtype_catalog[i].extra_sz; + entry->hash_func = devtype_catalog[i].hash_func; + /* type equality functions */ + entry->type_eqfunc = get_opcode(tcache->eq_opr); + entry->type_cmpfunc = tcache->cmp_proc; + + MemoryContextSwitchTo(oldcxt); + break; + } + } + if (!entry && pgstrom_num_users_extra > 0) + entry = build_extra_devtype_info(tcache, ext_name); + ReleaseSysCache(htup); + + return entry; +} + +static devtype_info * +build_array_devtype_info(TypeCacheEntry *tcache, const char *ext_name) +{ + devtype_info *element; + devtype_info *entry; + Oid typelem; + MemoryContext oldcxt; + + typelem = get_element_type(tcache->type_id); + Assert(OidIsValid(typelem) && tcache->typlen == -1); + element = pgstrom_devtype_lookup(typelem); + if (!element) + return NULL; + + oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + entry = palloc0(offsetof(devtype_info, 
comp_subtypes[0])); + if (ext_name) + entry->type_extension = pstrdup(ext_name); + entry->type_oid = tcache->type_id; + entry->type_flags = element->type_flags; + entry->type_length = tcache->typlen; + entry->type_align = typealign_get_width(tcache->typalign); + entry->type_byval = tcache->typbyval; + entry->type_name = "array"; + entry->extra_sz = sizeof(pg_array_t); + entry->hash_func = generic_devtype_hashfunc; + entry->type_element = element; + MemoryContextSwitchTo(oldcxt); + + return entry; +} + +static devtype_info * +build_composite_devtype_info(TypeCacheEntry *tcache, const char *ext_name) +{ + Oid type_relid = tcache->typrelid; + int j, nfields = get_relnatts(type_relid); + devtype_info **subtypes = alloca(sizeof(devtype_info *) * nfields); + devtype_info *entry; + cl_uint extra_flags = 0; + size_t extra_sz; + MemoryContext oldcxt; + + extra_sz = (MAXALIGN(sizeof(Datum) * nfields) + + MAXALIGN(sizeof(bool) * nfields)); + for (j=0; j < nfields; j++) + { + HeapTuple tup; + Oid atttypid; + devtype_info *dtype; + + tup = SearchSysCache2(ATTNUM, + ObjectIdGetDatum(type_relid), + Int16GetDatum(j+1)); + if (!HeapTupleIsValid(tup)) + return NULL; + atttypid = ((Form_pg_attribute) GETSTRUCT(tup))->atttypid; + ReleaseSysCache(tup); + + dtype = pgstrom_devtype_lookup(atttypid); + if (!dtype) + return NULL; + subtypes[j] = dtype; + + extra_flags |= dtype->type_flags; + extra_sz += MAXALIGN(dtype->extra_sz); + } + + oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + entry = palloc0(offsetof(devtype_info, comp_subtypes[nfields])); + if (ext_name) + entry->type_extension = pstrdup(ext_name); + entry->type_oid = tcache->type_id; + entry->type_flags = extra_flags; + entry->type_length = tcache->typlen; + entry->type_align = typealign_get_width(tcache->typalign); + entry->type_byval = tcache->typbyval; + entry->type_name = "composite"; + entry->extra_sz = extra_sz; + entry->comp_nfields = nfields; + memcpy(entry->comp_subtypes, subtypes, + sizeof(devtype_info *) * nfields); + MemoryContextSwitchTo(oldcxt); + + return entry; +} + +devtype_info * +pgstrom_devtype_lookup(Oid type_oid) +{ + TypeCacheEntry *tcache; + devtype_info *dtype; + uint32 hashvalue; + uint32 hindex; + size_t sz; + dlist_iter iter; + const char *ext_name; + + /* lookup dtype that is already built */ + hashvalue = GetSysCacheHashValue(TYPEOID, ObjectIdGetDatum(type_oid), 0, 0, 0); + hindex = hashvalue % lengthof(devtype_info_slot); + dlist_foreach(iter, &devtype_info_slot[hindex]) + { + dtype = dlist_container(devtype_info, chain, iter.cur); + + if (dtype->type_oid == type_oid) + { + if (dtype->type_is_negative) + return NULL; + return dtype; + } + } + /* try to build devtype_info entry */ + ext_name = get_extension_name_by_object(TypeRelationId, type_oid); + tcache = lookup_type_cache(type_oid, + TYPECACHE_EQ_OPR | + TYPECACHE_CMP_PROC); + if (OidIsValid(tcache->typrelid)) + { + /* composite type */ + dtype = build_composite_devtype_info(tcache, ext_name); + } + else if (OidIsValid(tcache->typelem) && tcache->typlen == -1) + { + /* array type */ + dtype = build_array_devtype_info(tcache, ext_name); + } + else + { + /* base or extra type */ + dtype = build_basic_devtype_info(tcache, ext_name); + } + + /* makes a negative entry, if not in the catalog */ + if (!dtype) + { + sz = offsetof(devtype_info, comp_subtypes[0]); + dtype = MemoryContextAllocZero(devinfo_memcxt, sz); + dtype->type_oid = type_oid; + dtype->type_is_negative = true; + } + dtype->hashvalue = hashvalue; + dlist_push_head(&devtype_info_slot[hindex], &dtype->chain); 
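/*
 * Note: the entry just chained above is kept across calls, so only the
 * first lookup of a type OID pays the catalog cost; later calls return
 * straight from the hash slot probed at the top of this function.  A rough
 * sketch of the behaviour (assuming "int4" is in devtype_catalog and
 * "xml" is not):
 *
 *   pgstrom_devtype_lookup(INT4OID);  // builds and caches a devtype_info
 *   pgstrom_devtype_lookup(INT4OID);  // returns the cached entry
 *   pgstrom_devtype_lookup(XMLOID);   // caches a negative entry, NULL
 */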
+ + if (dtype->type_is_negative) + return NULL; + return dtype; +} + +devtype_info * +pgstrom_devtype_lookup_and_track(Oid type_oid, codegen_context *context) +{ + devtype_info *dtype = pgstrom_devtype_lookup(type_oid); + + if (dtype) + context->extra_flags |= dtype->type_flags; + + return dtype; +} + +static devtype_info * +pgstrom_devtype_lookup_by_name(const char *type_ident) +{ + char *type_name = NULL; + char *ext_name = NULL; + const char *__ext_name; + Oid type_oid = InvalidOid; + Relation rel; + ScanKeyData skey; + SysScanDesc sscan; + HeapTuple htup; + + type_name = alloca(strlen(type_ident) + 1); + strcpy(type_name, type_ident); + ext_name = strchr(type_name, '@'); + if (ext_name) + *ext_name++ = '\0'; + + htup = SearchSysCache2(TYPENAMENSP, + CStringGetDatum(type_name), + ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); + if (HeapTupleIsValid(htup)) + { + type_oid = PgTypeTupleGetOid(htup); + __ext_name = get_extension_name_by_object(TypeRelationId, type_oid); + if (ext_name) + { + if (!__ext_name || strcmp(ext_name, __ext_name) != 0) + type_oid = InvalidOid; + } + else if (__ext_name != NULL) + type_oid = InvalidOid; + ReleaseSysCache(htup); + } + + if (!OidIsValid(type_oid)) + { + rel = table_open(TypeRelationId, AccessShareLock); + + ScanKeyInit(&skey, + Anum_pg_type_typname, + BTEqualStrategyNumber, F_NAMEEQ, + CStringGetDatum(type_name)); + sscan = systable_beginscan(rel, TypeNameNspIndexId, + true, NULL, 1, &skey); + do { + htup = systable_getnext(sscan); + if (!HeapTupleIsValid(htup)) + break; + type_oid = PgTypeTupleGetOid(htup); + __ext_name = get_extension_name_by_object(TypeRelationId, type_oid); + if (ext_name) + { + if (!__ext_name || strcmp(ext_name, __ext_name) != 0) + type_oid = InvalidOid; + } + else if (__ext_name != NULL) + type_oid = InvalidOid; + } while (!OidIsValid(type_oid)); + + systable_endscan(sscan); + table_close(rel, AccessShareLock); + } + + if (OidIsValid(type_oid)) + return pgstrom_devtype_lookup(type_oid); + return NULL; +} + +/* code for extra device types */ +size_t +pgstrom_codegen_extra_devtypes(char *buf, size_t bufsz, uint32 extra_flags) +{ + size_t off = 0; + int i; + + /* only extra device types */ + extra_flags &= DEVKERNEL_USERS_EXTRA_MASK; + + for (i=0; i < pgstrom_num_users_extra; i++) + { + pgstromUsersExtraDescriptor *ex_desc = &pgstrom_users_extra_desc[i]; + + if ((ex_desc->extra_flags & extra_flags) == ex_desc->extra_flags) + { + off += snprintf(buf + off, bufsz - off, + "#include \"%s.h\"\n", + ex_desc->extra_name); + } + } + /* array type support */ + off += snprintf( + buf + off, bufsz - off, + "\n" + "DEVICE_FUNCTION(cl_uint)\n" + "pg_extras_array_from_arrow(kern_context *kcxt,\n" + " char *dest,\n" + " kern_colmeta *smeta,\n" + " char *base,\n" + " cl_uint start,\n" + " cl_uint end)\n" + "{\n"); + if (pgstrom_num_users_extra > 0) + { + off += snprintf( + buf + off, bufsz - off, + " switch (smeta->atttypid)\n" + " {\n"); + for (i=0; i < lengthof(devtype_info_slot); i++) + { + dlist_iter iter; + devtype_info *dtype; + + dlist_foreach(iter, &devtype_info_slot[i]) + { + dtype = dlist_container(devtype_info, chain, iter.cur); + if ((dtype->type_flags & extra_flags) == 0) + continue; + off += snprintf( + buf + off, bufsz - off, + " case %u:\n" + " return pg_%s_array_from_arrow(kcxt, dest,\n" + " smeta, base,\n" + " start, end);\n", + dtype->type_oid, + dtype->type_name); + } + } + off += snprintf( + buf + off, bufsz - off, + " default:\n" + " break;\n" + " }\n"); + } + off += snprintf( + buf + off, bufsz - off, + " return 0;\n" + "}\n"); 
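/*
 * For illustration, with a single extra module that registered a device
 * type "geometry" under type OID 16400 (a hypothetical OID, assigned at
 * CREATE EXTENSION time), the array-support code emitted above would
 * look like:
 *
 *   DEVICE_FUNCTION(cl_uint)
 *   pg_extras_array_from_arrow(kern_context *kcxt, char *dest,
 *                              kern_colmeta *smeta, char *base,
 *                              cl_uint start, cl_uint end)
 *   {
 *       switch (smeta->atttypid)
 *       {
 *           case 16400:
 *               return pg_geometry_array_from_arrow(kcxt, dest,
 *                                                   smeta, base,
 *                                                   start, end);
 *           default:
 *               break;
 *       }
 *       return 0;
 *   }
 */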
+ + /* composite type support */ + off += snprintf( + buf + off, bufsz - off, + "\n" + "DEVICE_FUNCTION(cl_bool)\n" + "pg_extras_composite_from_arrow(kern_context *kcxt,\n" + " kern_colmeta *smeta,\n" + " char *base,\n" + " cl_uint rowidx,\n" + " cl_char *p_dclass,\n" + " Datum *p_datum)\n" + "{\n"); + + if (pgstrom_num_users_extra > 0) + { + off += snprintf( + buf + off, bufsz - off, + " switch (smeta->atttypid)\n" + " {\n"); + for (i=0; i < lengthof(devtype_info_slot); i++) + { + dlist_iter iter; + devtype_info *dtype; + + dlist_foreach(iter, &devtype_info_slot[i]) + { + dtype = dlist_container(devtype_info, chain, iter.cur); + if ((dtype->type_flags & extra_flags) == 0) + continue; + + off += snprintf( + buf + off, bufsz - off, + " case %u: {\n" + " pg_%s_t temp;\n" + " pg_datum_fetch_arrow(kcxt, temp, smeta, base, rowidx);\n" + " pg_datum_store(kcxt, temp, p_dclass, p_datum);\n" + " return true;\n" + " }\n", + dtype->type_oid, + dtype->type_name); + } + } + off += snprintf( + buf + off, bufsz - off, + " default:\n" + " break;\n" + " }\n"); + } + off += snprintf( + buf + off, bufsz - off, + " return false;\n" + "}\n"); + return off; +} + +/* + * Device type specific hash-functions + * + * Some device types have an internal representation, like numeric, which + * shall be used by GpuHashJoin for join-key hashing. + */ +static cl_uint +generic_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + if (dtype->type_byval) + return hash_any((unsigned char *)&datum, dtype->type_length); + if (dtype->type_length > 0) + return hash_any((unsigned char *)DatumGetPointer(datum), + dtype->type_length); + Assert(dtype->type_length == -1); + return hash_any((cl_uchar *)VARDATA_ANY(datum), + VARSIZE_ANY_EXHDR(datum)); +} + +static cl_uint +pg_int1_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + cl_int ival = DatumGetChar(datum); + + return hash_any((cl_uchar *)&ival, sizeof(cl_char)); +} + +static cl_uint +pg_int2_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + cl_int ival = DatumGetInt16(datum); + + return hash_any((cl_uchar *)&ival, sizeof(cl_short)); +} + +static cl_uint +pg_int4_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + cl_int ival = DatumGetInt32(datum); + + return hash_any((cl_uchar *)&ival, sizeof(cl_int)); +} + +static cl_uint +pg_int8_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + cl_long ival = DatumGetInt64(datum); + cl_uint lo = (ival & 0xffffffffL); + cl_uint hi = (ival >> 32); + + lo ^= (ival >= 0 ? 
hi : ~hi); + + return hash_any((cl_uchar *)&lo, sizeof(cl_int)); +} + +extern Datum pgstrom_float2_to_float8(PG_FUNCTION_ARGS); +static cl_uint +pg_float2_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + Datum v = DirectFunctionCall1(pgstrom_float2_to_float8, datum); + cl_double fval = DatumGetFloat8(v); + + if (fval == 0.0) + return 0; + return hash_any((cl_uchar *)&fval, sizeof(cl_double)); +} + +static cl_uint +pg_float4_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + cl_double fval = DatumGetFloat4(datum); + + if (fval == 0.0) + return 0; + return hash_any((cl_uchar *)&fval, sizeof(cl_double)); +} + +static cl_uint +pg_float8_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + cl_double fval = DatumGetFloat8(datum); + + if (fval == 0.0) + return 0; + return hash_any((cl_uchar *)&fval, sizeof(cl_double)); +} + +static cl_uint +pg_numeric_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + kern_context dummy; + pg_numeric_t temp; + + memset(&dummy, 0, sizeof(dummy)); + /* + * MEMO: If NUMERIC value is out of range, we may not be able to + * execute GpuJoin in the kernel space for all the outer chunks. + * Is it still valuable to run on GPU kernel? + */ + temp = pg_numeric_from_varlena(&dummy, (struct varlena *) + DatumGetPointer(datum)); + if (dummy.errcode != ERRCODE_STROM_SUCCESS) + elog(ERROR, "failed on hash calculation of device numeric: %s", + DatumGetCString(DirectFunctionCall1(numeric_out, datum))); + + return hash_any((cl_uchar *)&temp.value, + offsetof(pg_numeric_t, weight) + sizeof(cl_short)); +} + +static cl_uint +pg_interval_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + Interval *interval = DatumGetIntervalP(datum); + cl_long frac; + cl_long days; + + frac = interval->time % USECS_PER_DAY; + days = (interval->time / USECS_PER_DAY + + interval->month * 30L + + interval->day); + days ^= frac; + + return hash_any((cl_uchar *)&days, sizeof(cl_long)); +} + +static cl_uint +pg_bpchar_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + char *s = VARDATA_ANY(datum); + int i, len = VARSIZE_ANY_EXHDR(datum); + + Assert(dtype->type_oid == BPCHAROID); + /* + * whitespace at the tail end of CHAR(n) data shall be ignored + * when we calculate the hash value, so that texts differing only + * in trailing spaces match exactly. 
+ */ + for (i = len - 1; i >= 0 && s[i] == ' '; i--) + ; + return hash_any((unsigned char *)VARDATA_ANY(datum), i+1); +} + +static cl_uint +pg_inet_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + inet_struct *is = (inet_struct *) VARDATA_ANY(datum); + + Assert(dtype->type_oid == INETOID || + dtype->type_oid == CIDROID); + if (is->family == PGSQL_AF_INET) + return hash_any((cl_uchar *)is, offsetof(inet_struct, ipaddr[4])); + else if (is->family == PGSQL_AF_INET6) + return hash_any((cl_uchar *)is, offsetof(inet_struct, ipaddr[16])); + + elog(ERROR, "unexpected address family: %d", is->family); + return ~0U; +} + +static cl_uint +__jsonb_devtype_hashfunc(devtype_info *dtype, JsonbContainer *jc) +{ + cl_uint hash = 0; + cl_uint j, nitems = JsonContainerSize(jc); + char *base = NULL; + char *data; + cl_uint datalen; + + if (!JsonContainerIsScalar(jc)) + { + if (JsonContainerIsObject(jc)) + { + base = (char *)(jc->children + 2 * nitems); + hash ^= JB_FOBJECT; + } + else + { + base = (char *)(jc->children + nitems); + hash ^= JB_FARRAY; + } + } + + for (j=0; j < nitems; j++) + { + cl_uint index = j; + cl_uint temp; + JEntry entry; + + /* hash value for key */ + if (JsonContainerIsObject(jc)) + { + entry = jc->children[index]; + if (!JBE_ISSTRING(entry)) + elog(ERROR, "jsonb key value is not STRING"); + data = base + getJsonbOffset(jc, index); + datalen = getJsonbLength(jc, index); + temp = hash_any((cl_uchar *)data, datalen); + hash = ((hash << 1) | (hash >> 31)) ^ temp; + + index += nitems; + } + /* hash value for element */ + entry = jc->children[index]; + if (JBE_ISNULL(entry)) + temp = 0x01; + else if (JBE_ISSTRING(entry)) + { + data = base + getJsonbOffset(jc, index); + datalen = getJsonbLength(jc, index); + temp = hash_any((cl_uchar *)data, datalen); + } + else if (JBE_ISNUMERIC(entry)) + { + data = base + INTALIGN(getJsonbOffset(jc, index)); + temp = pg_numeric_devtype_hashfunc(NULL, PointerGetDatum(data)); + } + else if (JBE_ISBOOL_TRUE(entry)) + temp = 0x02; + else if (JBE_ISBOOL_FALSE(entry)) + temp = 0x04; + else if (JBE_ISCONTAINER(entry)) + { + data = base + INTALIGN(getJsonbOffset(jc, index)); + temp = __jsonb_devtype_hashfunc(dtype, (JsonbContainer *)data); + } + else + elog(ERROR, "Unexpected jsonb entry (%08x)", entry); + hash = ((hash << 1) | (hash >> 31)) ^ temp; + } + return hash; +} + +static cl_uint +pg_jsonb_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + JsonbContainer *jc = (JsonbContainer *) VARDATA_ANY(datum); + + return __jsonb_devtype_hashfunc(dtype, jc); +} + +static cl_uint +pg_range_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + RangeType *r = DatumGetRangeTypeP(datum); + cl_uchar flags = *((char *)r + VARSIZE_ANY(r) - 1); + cl_uchar *pos = (cl_uchar *)(r + 1); + struct { + Datum l_val; + Datum u_val; + cl_uchar flags; + } temp; + int32 ival32; + + memset(&temp, 0, sizeof(temp)); /* keep unset bounds deterministic */ + + if (RANGE_HAS_LBOUND(flags)) + { + switch (RangeTypeGetOid(r)) + { + case INT4RANGEOID: + case DATERANGEOID: + memcpy(&ival32, pos, sizeof(cl_int)); + temp.l_val = (cl_long)ival32; + pos += sizeof(cl_int); + break; + case INT8RANGEOID: + case TSRANGEOID: + case TSTZRANGEOID: + memcpy(&temp.l_val, pos, sizeof(cl_long)); + pos += sizeof(cl_long); + break; + default: + elog(ERROR, "unexpected range type: %s", + format_type_be(RangeTypeGetOid(r))); + } + } + if (RANGE_HAS_UBOUND(flags)) + { + switch (RangeTypeGetOid(r)) + { + case INT4RANGEOID: + case DATERANGEOID: + memcpy(&ival32, pos, sizeof(cl_int)); + temp.u_val = (cl_long)ival32; + pos += sizeof(cl_int); + break; + case INT8RANGEOID: + case 
TSRANGEOID: + case TSTZRANGEOID: + memcpy(&temp.u_val, pos, sizeof(cl_long)); + pos += sizeof(cl_long); + break; + default: + elog(ERROR, "unexpected range type: %s", + format_type_be(RangeTypeGetOid(r))); + } + } + temp.flags = flags; + + return hash_any((unsigned char *)&temp, + 2*sizeof(Datum)+sizeof(cl_uchar)); +} + +static cl_uint +pg_geometry_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + return 0; //TODO +} + +static cl_uint +pg_box2df_devtype_hashfunc(devtype_info *dtype, Datum datum) +{ + return 0; //TODO +} + +/* + * varlena buffer estimation handler + */ +static int +vlbuf_estimate_textcat(codegen_context *context, + devfunc_info *dfunc, + Expr **args, int *vl_width) +{ + int i, nargs = list_length(dfunc->func_args); + int maxlen = 0; + + for (i=0; i < nargs; i++) + { + if (vl_width[i] < 0) + __ELog("unable to estimate result size of textcat"); + maxlen += vl_width[i]; + } + /* it consumes the varlena buffer at run-time */ + context->extra_bufsz += MAXALIGN(maxlen + VARHDRSZ); + + return maxlen; +} + +static int +vlbuf_estimate_substring(codegen_context *context, + devfunc_info *dfunc, + Expr **args, int *vl_width) +{ + if (list_length(dfunc->func_args) > 2 && + IsA(args[2], Const)) + { + Const *con = (Const *)args[2]; + + Assert(con->consttype == INT4OID); + if (con->constisnull) + return 0; + return Max(DatumGetInt32(con->constvalue), 0); + } + return vl_width[0]; +} + +static int +vlbuf_estimate_jsonb(codegen_context *context, + devfunc_info *dfunc, + Expr **args, int *vl_width) +{ + context->extra_bufsz += MAXALIGN(TOAST_TUPLE_THRESHOLD); + /* + * We usually have no information about the length of a jsonb object + * beforehand; however, a plain varlena must be shorter than the + * toasting threshold. If the user altered the storage option of the + * jsonb column to 'main', it may grow up to BLCKSZ, but that is unusual. + */ + return TOAST_TUPLE_THRESHOLD; +} + +static int +vlbuf_estimate__st_makepoint(codegen_context *context, + devfunc_info *dfunc, + Expr **args, int *vl_width) +{ + int nargs = list_length(dfunc->func_args); + + context->extra_bufsz += MAXALIGN(sizeof(double) * 2 * nargs); + + return -1; +} + +static int +vlbuf_estimate__st_relate(codegen_context *context, + devfunc_info *dfunc, + Expr **args, int *vl_width) +{ + context->extra_bufsz += MAXALIGN(VARHDRSZ + 9); + + return VARHDRSZ + 9; +} + +static int +vlbuf_estimate__st_expand(codegen_context *context, + devfunc_info *dfunc, + Expr **args, int *vl_width) +{ + context->extra_bufsz += MAXALIGN(4 * sizeof(cl_float) + /* bounding-box */ + 2 * sizeof(cl_uint) + /* nitems + padding */ + 10 * sizeof(double)); /* polygon rawdata */ + return -1; /* not a normal varlena */ +} + +/* + * Catalog of functions supported by device code + * + * naming convention of functions: + * pgfn_<func_name>(...) + * + * func_template is a set of characters based on the rules below: + * + * [<attributes>/]f:<extra> + * + * attributes: + * 'L' : this function is locale aware, thus, available only with a simple + * collation configuration (none, or C-locale). + * 'C' : this function uses its special callback to estimate the result + * width of varlena-buffer. 
+ * 'p' : this function needs cuda_primitive.h + * 's' : this function needs cuda_textlib.h + * 't' : this function needs cuda_timelib.h + * 'j' : this function needs cuda_jsonlib.h + * 'm' : this function needs cuda_misclib.h + * 'r' : this function needs cuda_rangetype.h + * 'g' : this function needs cuda_postgis.h + * + * class character: + * 'r' : right operator that takes an argument (deprecated) + * 'l' : left operator that takes an argument (deprecated) + * 'b' : both operator that takes two arguments (deprecated) + * 'f' : this function is implemented as device function. + * ==> extra is the function name being declared somewhere + */ +#define DEVFUNC_MAX_NARGS 4 + +typedef struct devfunc_catalog_t { + const char *func_extension; /* NULL, if built-in functions */ + const char *func_signature; + int func_devcost; /* relative cost to run on device */ + const char *func_template; /* a template string if simple function */ + devfunc_result_sz_type devfunc_result_sz; +} devfunc_catalog_t; + +static devfunc_catalog_t devfunc_common_catalog[] = { + /* Type cast functions */ + { NULL, "bool bool(int4)", 1, "f:to_bool" }, + + { PGSTROM, "int1 int1(int2)", 1, "f:to_int1" }, + { PGSTROM, "int1 int1(int4)", 1, "f:to_int1" }, + { PGSTROM, "int1 int1(int8)", 1, "f:to_int1" }, + { PGSTROM, "int1 int1(float2)", 1, "f:to_int1" }, + { PGSTROM, "int1 int1(float4)", 1, "f:to_int1" }, + { PGSTROM, "int1 int1(float8)", 1, "f:to_int1" }, + + { PGSTROM, "int2 int2(int1)", 1, "f:to_int2" }, + { NULL, "int2 int2(int4)", 1, "f:to_int2" }, + { NULL, "int2 int2(int8)", 1, "f:to_int2" }, + { PGSTROM, "int2 int2(float2)", 1, "f:to_int2" }, + { NULL, "int2 int2(float4)", 1, "f:to_int2" }, + { NULL, "int2 int2(float8)", 1, "f:to_int2" }, + + { NULL, "int4 int4(bool)", 1, "f:to_int4" }, + { PGSTROM, "int4 int4(int1)", 1, "f:to_int4" }, + { NULL, "int4 int4(int2)", 1, "f:to_int4" }, + { NULL, "int4 int4(int8)", 1, "f:to_int4" }, + { PGSTROM, "int4 int4(float2)", 1, "f:to_int4" }, + { NULL, "int4 int4(float4)", 1, "f:to_int4" }, + { NULL, "int4 int4(float8)", 1, "f:to_int4" }, + + { PGSTROM, "int8 int8(int1)", 1, "f:to_int8" }, + { NULL, "int8 int8(int2)", 1, "f:to_int8" }, + { NULL, "int8 int8(int4)", 1, "f:to_int8" }, + { PGSTROM, "int8 int8(float2)", 1, "f:to_int8" }, + { NULL, "int8 int8(float4)", 1, "f:to_int8" }, + { NULL, "int8 int8(float8)", 1, "f:to_int8" }, + + { PGSTROM, "float2 float2(int1)", 1, "f:to_float2" }, + { PGSTROM, "float2 float2(int2)", 1, "f:to_float2" }, + { PGSTROM, "float2 float2(int4)", 1, "f:to_float2" }, + { PGSTROM, "float2 float2(int8)", 1, "f:to_float2" }, + { PGSTROM, "float2 float2(float4)", 1, "f:to_float2" }, + { PGSTROM, "float2 float2(float8)", 1, "f:to_float2" }, + + { PGSTROM, "float4 float4(int1)", 1, "f:to_float4" }, + { NULL, "float4 float4(int2)", 1, "f:to_float4" }, + { NULL, "float4 float4(int4)", 1, "f:to_float4" }, + { NULL, "float4 float4(int8)", 1, "f:to_float4" }, + { PGSTROM, "float4 float4(float2)", 1, "f:to_float4" }, + { NULL, "float4 float4(float8)", 1, "f:to_float4" }, + + { PGSTROM, "float8 float8(int1)", 1, "f:to_float8" }, + { NULL, "float8 float8(int2)", 1, "f:to_float8" }, + { NULL, "float8 float8(int4)", 1, "f:to_float8" }, + { NULL, "float8 float8(int8)", 1, "f:to_float8" }, + { PGSTROM, "float8 float8(float2)", 1, "f:to_float8" }, + { NULL, "float8 float8(float4)", 1, "f:to_float8" }, + + /* '+' : add operators */ + { PGSTROM, "int1 int1pl(int1,int1)", 1, "p/f:int1pl" }, + { PGSTROM, "int2 int12pl(int1,int2)", 1, "p/f:int12pl" }, + { PGSTROM, "int4 
int14pl(int1,int4)", 1, "p/f:int14pl" }, + { PGSTROM, "int8 int18pl(int1,int8)", 1, "p/f:int18pl" }, + { PGSTROM, "int2 int21pl(int2,int1)", 1, "p/f:int21pl" }, + { NULL, "int2 int2pl(int2,int2)", 1, "p/f:int2pl" }, + { NULL, "int4 int24pl(int2,int4)", 1, "p/f:int24pl" }, + { NULL, "int8 int28pl(int2,int8)", 1, "p/f:int28pl" }, + { PGSTROM, "int4 int41pl(int4,int1)", 1, "p/f:int41pl" }, + { NULL, "int4 int42pl(int4,int2)", 1, "p/f:int42pl" }, + { NULL, "int4 int4pl(int4,int4)", 1, "p/f:int4pl" }, + { NULL, "int8 int48pl(int4,int8)", 1, "p/f:int48pl" }, + { PGSTROM, "int8 int81pl(int8,int1)", 1, "p/f:int81pl" }, + { NULL, "int8 int82pl(int8,int2)", 1, "p/f:int82pl" }, + { NULL, "int8 int84pl(int8,int4)", 1, "p/f:int84pl" }, + { NULL, "int8 int8pl(int8,int8)", 1, "p/f:int8pl" }, + { PGSTROM, "float4 float2_pl(float2,float2)", 1, "p/f:float2pl" }, + { PGSTROM, "float4 float24_pl(float2,float4)", 1, "p/f:float24pl" }, + { PGSTROM, "float8 float28_pl(float2,float8)", 1, "p/f:float28pl" }, + { PGSTROM, "float4 float42_pl(float4,float2)", 1, "p/f:float42pl" }, + { NULL, "float4 float4pl(float4,float4)", 1, "p/f:float4pl" }, + { NULL, "float8 float48pl(float4,float8)", 1, "p/f:float48pl" }, + { PGSTROM, "float8 float82_pl(float8,float2)", 1, "p/f:float82pl" }, + { NULL, "float8 float84pl(float8,float4)", 1, "p/f:float84pl" }, + { NULL, "float8 float8pl(float8,float8)", 1, "p/f:float8pl" }, + + /* '-' : subtract operators */ + { PGSTROM, "int1 int1mi(int1,int1)", 1, "p/f:int1mi" }, + { PGSTROM, "int2 int12mi(int1,int2)", 1, "p/f:int12mi" }, + { PGSTROM, "int4 int14mi(int1,int4)", 1, "p/f:int14mi" }, + { PGSTROM, "int8 int18mi(int1,int8)", 1, "p/f:int18mi" }, + { PGSTROM, "int2 int21mi(int2,int1)", 1, "p/f:int21mi" }, + { NULL, "int2 int2mi(int2,int2)", 1, "p/f:int2mi" }, + { NULL, "int4 int24mi(int2,int4)", 1, "p/f:int24mi" }, + { NULL, "int8 int28mi(int2,int8)", 1, "p/f:int28mi" }, + { PGSTROM, "int4 int41mi(int4,int1)", 1, "p/f:int41mi" }, + { NULL, "int4 int42mi(int4,int2)", 1, "p/f:int42mi" }, + { NULL, "int4 int4mi(int4,int4)", 1, "p/f:int4mi" }, + { NULL, "int8 int48mi(int4,int8)", 1, "p/f:int48mi" }, + { PGSTROM, "int8 int81mi(int8,int1)", 1, "p/f:int81mi" }, + { NULL, "int8 int82mi(int8,int2)", 1, "p/f:int82mi" }, + { NULL, "int8 int84mi(int8,int4)", 1, "p/f:int84mi" }, + { NULL, "int8 int8mi(int8,int8)", 1, "p/f:int8mi" }, + { PGSTROM, "float4 float2_mi(float2,float2)", 1, "p/f:float2mi" }, + { PGSTROM, "float4 float24_mi(float2,float4)", 1, "p/f:float24mi" }, + { PGSTROM, "float8 float28_mi(float2,float8)", 1, "p/f:float28mi" }, + { PGSTROM, "float4 float42_mi(float4,float2)", 1, "p/f:float42mi" }, + { NULL, "float4 float4mi(float4,float4)", 1, "p/f:float4mi" }, + { NULL, "float8 float48mi(float4,float8)", 1, "p/f:float48mi" }, + { PGSTROM, "float8 float82_mi(float8,float2)", 1, "p/f:float82mi" }, + { NULL, "float8 float84mi(float8,float4)", 1, "p/f:float84mi" }, + { NULL, "float8 float8mi(float8,float8)", 1, "p/f:float8mi" }, + + /* '*' : multiply operators */ + { PGSTROM, "int1 int1mul(int1,int1)", 2, "p/f:int1mul" }, + { PGSTROM, "int2 int12mul(int1,int2)", 2, "p/f:int12mul" }, + { PGSTROM, "int4 int14mul(int1,int4)", 2, "p/f:int14mul" }, + { PGSTROM, "int8 int18mul(int1,int8)", 2, "p/f:int18mul" }, + { PGSTROM, "int2 int21mul(int2,int1)", 2, "p/f:int21mul" }, + { NULL, "int2 int2mul(int2,int2)", 2, "p/f:int2mul" }, + { NULL, "int4 int24mul(int2,int4)", 2, "p/f:int24mul" }, + { NULL, "int8 int28mul(int2,int8)", 2, "p/f:int28mul" }, + { PGSTROM, "int4 int41mul(int4,int1)", 2, 
"p/f:int41mul" }, + { NULL, "int4 int42mul(int4,int2)", 2, "p/f:int42mul" }, + { NULL, "int4 int4mul(int4,int4)", 2, "p/f:int4mul" }, + { NULL, "int8 int48mul(int4,int8)", 2, "p/f:int48mul" }, + { PGSTROM, "int8 int81mul(int8,int1)", 2, "p/f:int81mul" }, + { NULL, "int8 int82mul(int8,int2)", 2, "p/f:int82mul" }, + { NULL, "int8 int84mul(int8,int4)", 2, "p/f:int84mul" }, + { NULL, "int8 int8mul(int8,int8)", 2, "p/f:int8mul" }, + { PGSTROM, "float4 float2_mul(float2,float2)", 2, "p/f:float2mul" }, + { PGSTROM, "float4 float24_mul(float2,float4)", 2, "p/f:float24mul" }, + { PGSTROM, "float8 float28_mul(float2,float8)", 2, "p/f:float28mul" }, + { PGSTROM, "float4 float42_mul(float4,float2)", 2, "p/f:float42mul" }, + { NULL, "float4 float4mul(float4,float4)", 2, "p/f:float4mul" }, + { NULL, "float8 float48mul(float4,float8)", 2, "p/f:float48mul" }, + { PGSTROM, "float8 float82_mul(float8,float2)", 2, "p/f:float82mul" }, + { NULL, "float8 float84mul(float8,float4)", 2, "p/f:float84mul" }, + { NULL, "float8 float8mul(float8,float8)", 2, "p/f:float8mul" }, + + /* '/' : divide operators */ + { PGSTROM, "int1 int1div(int1,int1)", 2, "p/f:int1div" }, + { PGSTROM, "int2 int12div(int1,int2)", 2, "p/f:int12div" }, + { PGSTROM, "int4 int14div(int1,int4)", 2, "p/f:int14div" }, + { PGSTROM, "int8 int18div(int1,int8)", 2, "p/f:int18div" }, + { PGSTROM, "int2 int21div(int2,int1)", 2, "p/f:int21div" }, + { NULL, "int2 int2div(int2,int2)", 2, "p/f:int2div" }, + { NULL, "int4 int24div(int2,int4)", 2, "p/f:int24div" }, + { NULL, "int8 int28div(int2,int8)", 2, "p/f:int28div" }, + { PGSTROM, "int4 int41div(int4,int1)", 2, "p/f:int41div" }, + { NULL, "int4 int42div(int4,int2)", 2, "p/f:int42div" }, + { NULL, "int4 int4div(int4,int4)", 2, "p/f:int4div" }, + { NULL, "int8 int48div(int4,int8)", 2, "p/f:int48div" }, + { PGSTROM, "int2 int81div(int8,int1)", 2, "p/f:int81div" }, + { NULL, "int8 int82div(int8,int2)", 2, "p/f:int82div" }, + { NULL, "int8 int84div(int8,int4)", 2, "p/f:int84div" }, + { NULL, "int8 int8div(int8,int8)", 2, "p/f:int8div" }, + { PGSTROM, "float4 float2_div(float2,float2)", 2, "p/f:float2div" }, + { PGSTROM, "float4 float24_div(float2,float4)", 2, "p/f:float24div" }, + { PGSTROM, "float8 float28_div(float2,float8)", 2, "p/f:float28div" }, + { PGSTROM, "float4 float42_div(float4,float2)", 2, "p/f:float42div" }, + { NULL, "float4 float4div(float4,float4)", 2, "p/f:float4div" }, + { NULL, "float8 float48div(float4,float8)", 2, "p/f:float48div" }, + { PGSTROM, "float8 float82_div(float8,float2)", 2, "p/f:float82div" }, + { NULL, "float8 float84div(float8,float4)", 2, "p/f:float84div" }, + { NULL, "float8 float8div(float8,float8)", 2, "p/f:float8div" }, + + /* '%' : reminder operators */ + { PGSTROM, "int1 int1mod(int1,int1)", 2, "p/f:int1mod" }, + { NULL, "int2 int2mod(int2,int2)", 2, "p/f:int2mod" }, + { NULL, "int4 int4mod(int4,int4)", 2, "p/f:int4mod" }, + { NULL, "int8 int8mod(int8,int8)", 2, "p/f:int8mod" }, + + /* '+' : unary plus operators */ + { PGSTROM, "int1 int1up(int1)", 1, "p/f:int1up" }, + { NULL, "int2 int2up(int2)", 1, "p/f:int2up" }, + { NULL, "int4 int4up(int4)", 1, "p/f:int4up" }, + { NULL, "int8 int8up(int8)", 1, "p/f:int8up" }, + { PGSTROM, "float2 float2_up(float2)",1, "p/f:float2up" }, + { NULL, "float4 float4up(float4)", 1, "p/f:float4up" }, + { NULL, "float8 float8up(float8)", 1, "p/f:float8up" }, + + /* '-' : unary minus operators */ + { PGSTROM, "int1 int1um(int1)", 1, "p/f:int1um" }, + { NULL, "int2 int2um(int2)", 1, "p/f:int2um" }, + { NULL, "int4 int4um(int4)", 1, 
"p/f:int4um" }, + { NULL, "int8 int8um(int8)", 1, "p/f:int8um" }, + { PGSTROM, "float2 float2_um(float2)",1, "p/f:float2um" }, + { NULL, "float4 float4um(float4)", 1, "p/f:float4um" }, + { NULL, "float8 float8um(float8)", 1, "p/f:float8um" }, + + /* '@' : absolute value operators */ + { PGSTROM, "int1 int1abs(int1)", 1, "p/f:int1abs" }, + { NULL, "int2 int2abs(int2)", 1, "p/f:int2abs" }, + { NULL, "int4 int4abs(int4)", 1, "p/f:int4abs" }, + { NULL, "int8 int8abs(int8)", 1, "p/f:int8abs" }, + { PGSTROM, "float2 float2abs(float2)", 1, "p/f:float2abs" }, + { NULL, "float4 float4abs(float4)", 1, "p/f:float4abs" }, + { NULL, "float8 float8abs(float8)", 1, "p/f:float8abs" }, + + /* '=' : equal operators */ + { NULL, "bool booleq(bool,bool)", 1, "f:booleq" }, + { PGSTROM, "bool int1eq(int1,int1)", 1, "f:int1eq" }, + { PGSTROM, "bool int12eq(int1,int2)", 1, "f:int12eq" }, + { PGSTROM, "bool int14eq(int1,int4)", 1, "f:int14eq" }, + { PGSTROM, "bool int18eq(int1,int8)", 1, "f:int18eq" }, + { PGSTROM, "bool int21eq(int2,int1)", 1, "f:int21eq" }, + { NULL, "bool int2eq(int2,int2)", 1, "f:int2eq" }, + { NULL, "bool int24eq(int2,int4)", 1, "f:int24eq" }, + { NULL, "bool int28eq(int2,int8)", 1, "f:int28eq" }, + { PGSTROM, "bool int41eq(int4,int1)", 1, "f:int41eq" }, + { NULL, "bool int42eq(int4,int2)", 1, "f:int42eq" }, + { NULL, "bool int4eq(int4,int4)", 1, "f:int4eq" }, + { NULL, "bool int48eq(int4,int8)", 1, "f:int48eq" }, + { PGSTROM, "bool int81eq(int8,int1)", 1, "f:int81eq" }, + { NULL, "bool int82eq(int8,int2)", 1, "f:int82eq" }, + { NULL, "bool int84eq(int8,int4)", 1, "f:int84eq" }, + { NULL, "bool int8eq(int8,int8)", 1, "f:int8eq" }, + { PGSTROM, "bool float2_eq(float2,float2)", 1, "f:float2eq" }, + { PGSTROM, "bool float24_eq(float2,float4)", 1, "f:float24eq" }, + { PGSTROM, "bool float28_eq(float2,float8)", 1, "f:float28eq" }, + { PGSTROM, "bool float42_eq(float4,float2)", 1, "f:float42eq" }, + { NULL, "bool float4eq(float4,float4)", 1, "f:float4eq" }, + { NULL, "bool float48eq(float4,float8)", 1, "f:float48eq" }, + { PGSTROM, "bool float82_eq(float8,float2)", 1, "f:float82eq" }, + { NULL, "bool float84eq(float8,float4)", 1, "f:float84eq" }, + { NULL, "bool float8eq(float8,float8)", 1, "f:float8eq" }, + + /* '<>' : not equal operators */ + { PGSTROM, "bool int1ne(int1,int1)", 1, "f:int1ne" }, + { PGSTROM, "bool int12ne(int1,int2)", 1, "f:int12ne" }, + { PGSTROM, "bool int14ne(int1,int4)", 1, "f:int14ne" }, + { PGSTROM, "bool int18ne(int1,int8)", 1, "f:int18ne" }, + { PGSTROM, "bool int21ne(int2,int1)", 1, "f:int21ne" }, + { NULL, "bool int2ne(int2,int2)", 1, "f:int2ne" }, + { NULL, "bool int24ne(int2,int4)", 1, "f:int24ne" }, + { NULL, "bool int28ne(int2,int8)", 1, "f:int28ne" }, + { PGSTROM, "bool int41ne(int4,int1)", 1, "f:int41ne" }, + { NULL, "bool int42ne(int4,int2)", 1, "f:int42ne" }, + { NULL, "bool int4ne(int4,int4)", 1, "f:int4ne" }, + { NULL, "bool int48ne(int4,int8)", 1, "f:int48ne" }, + { PGSTROM, "bool int81ne(int8,int1)", 1, "f:int81ne" }, + { NULL, "bool int82ne(int8,int2)", 1, "f:int82ne" }, + { NULL, "bool int84ne(int8,int4)", 1, "f:int84ne" }, + { NULL, "bool int8ne(int8,int8)", 1, "f:int8ne" }, + { PGSTROM, "bool float2_ne(float2,float2)", 1, "f:float2ne" }, + { PGSTROM, "bool float24_ne(float2,float4)", 1, "f:float24ne" }, + { PGSTROM, "bool float28_ne(float2,float8)", 1, "f:float28ne" }, + { PGSTROM, "bool float42_ne(float4,float2)", 1, "f:float42ne" }, + { NULL, "bool float4ne(float4,float4)", 1, "f:float4ne" }, + { NULL, "bool float48ne(float4,float8)", 1, "f:float48ne" 
}, + { PGSTROM, "bool float82_ne(float8,float2)", 1, "f:float82ne" }, + { NULL, "bool float84ne(float8,float4)", 1, "f:float84ne" }, + { NULL, "bool float8ne(float8,float8)", 1, "f:float8ne" }, + + /* '>' : greater than operators */ + { PGSTROM, "bool int1gt(int1,int1)", 1, "f:int1gt" }, + { PGSTROM, "bool int12gt(int1,int2)", 1, "f:int12gt" }, + { PGSTROM, "bool int14gt(int1,int4)", 1, "f:int14gt" }, + { PGSTROM, "bool int18gt(int1,int8)", 1, "f:int18gt" }, + { PGSTROM, "bool int21gt(int2,int1)", 1, "f:int21gt" }, + { NULL, "bool int2gt(int2,int2)", 1, "f:int2gt" }, + { NULL, "bool int24gt(int2,int4)", 1, "f:int24gt" }, + { NULL, "bool int28gt(int2,int8)", 1, "f:int28gt" }, + { PGSTROM, "bool int41gt(int4,int1)", 1, "f:int41gt" }, + { NULL, "bool int42gt(int4,int2)", 1, "f:int42gt" }, + { NULL, "bool int4gt(int4,int4)", 1, "f:int4gt" }, + { NULL, "bool int48gt(int4,int8)", 1, "f:int48gt" }, + { PGSTROM, "bool int81gt(int8,int1)", 1, "f:int81gt" }, + { NULL, "bool int82gt(int8,int2)", 1, "f:int82gt" }, + { NULL, "bool int84gt(int8,int4)", 1, "f:int84gt" }, + { NULL, "bool int8gt(int8,int8)", 1, "f:int8gt" }, + { PGSTROM, "bool float2_gt(float2,float2)", 1, "f:float2gt" }, + { PGSTROM, "bool float24_gt(float2,float4)", 1, "f:float24gt" }, + { PGSTROM, "bool float28_gt(float2,float8)", 1, "f:float28gt" }, + { PGSTROM, "bool float42_gt(float4,float2)", 1, "f:float42gt" }, + { NULL, "bool float4gt(float4,float4)", 1, "f:float4gt" }, + { NULL, "bool float48gt(float4,float8)", 1, "f:float48gt" }, + { PGSTROM, "bool float82_gt(float8,float2)", 1, "f:float82gt" }, + { NULL, "bool float84gt(float8,float4)", 1, "f:float84gt" }, + { NULL, "bool float8gt(float8,float8)", 1, "f:float8gt" }, + + /* '<' : less than operators */ + { PGSTROM, "bool int1lt(int1,int1)", 1, "f:int1lt" }, + { PGSTROM, "bool int12lt(int1,int2)", 1, "f:int12lt" }, + { PGSTROM, "bool int14lt(int1,int4)", 1, "f:int14lt" }, + { PGSTROM, "bool int18lt(int1,int8)", 1, "f:int18lt" }, + { PGSTROM, "bool int21lt(int2,int1)", 1, "f:int21lt" }, + { NULL, "bool int2lt(int2,int2)", 1, "f:int2lt" }, + { NULL, "bool int24lt(int2,int4)", 1, "f:int24lt" }, + { NULL, "bool int28lt(int2,int8)", 1, "f:int28lt" }, + { PGSTROM, "bool int41lt(int4,int1)", 1, "f:int41lt" }, + { NULL, "bool int42lt(int4,int2)", 1, "f:int42lt" }, + { NULL, "bool int4lt(int4,int4)", 1, "f:int4lt" }, + { NULL, "bool int48lt(int4,int8)", 1, "f:int48lt" }, + { PGSTROM, "bool int81lt(int8,int1)", 1, "f:int81lt" }, + { NULL, "bool int82lt(int8,int2)", 1, "f:int82lt" }, + { NULL, "bool int84lt(int8,int4)", 1, "f:int84lt" }, + { NULL, "bool int8lt(int8,int8)", 1, "f:int8lt" }, + { PGSTROM, "bool float2_lt(float2,float2)", 1, "f:float2lt" }, + { PGSTROM, "bool float24_lt(float2,float4)", 1, "f:float24lt" }, + { PGSTROM, "bool float28_lt(float2,float8)", 1, "f:float28lt" }, + { PGSTROM, "bool float42_lt(float4,float2)", 1, "f:float42lt" }, + { NULL, "bool float4lt(float4,float4)", 1, "f:float4lt" }, + { NULL, "bool float48lt(float4,float8)", 1, "f:float48lt" }, + { PGSTROM, "bool float82_lt(float8,float2)", 1, "f:float82lt" }, + { NULL, "bool float84lt(float8,float4)", 1, "f:float84lt" }, + { NULL, "bool float8lt(float8,float8)", 1, "f:float8lt" }, + + /* '>=' : relational greater-than or equal-to */ + { PGSTROM, "bool int1ge(int1,int1)", 1, "f:int1ge" }, + { PGSTROM, "bool int12ge(int1,int2)", 1, "f:int12ge" }, + { PGSTROM, "bool int14ge(int1,int4)", 1, "f:int14ge" }, + { PGSTROM, "bool int18ge(int1,int8)", 1, "f:int18ge" }, + { PGSTROM, "bool int21ge(int2,int1)", 1, "f:int21ge" 
}, + { NULL, "bool int2ge(int2,int2)", 1, "f:int2ge" }, + { NULL, "bool int24ge(int2,int4)", 1, "f:int24ge" }, + { NULL, "bool int28ge(int2,int8)", 1, "f:int28ge" }, + { PGSTROM, "bool int41ge(int4,int1)", 1, "f:int41ge" }, + { NULL, "bool int42ge(int4,int2)", 1, "f:int42ge" }, + { NULL, "bool int4ge(int4,int4)", 1, "f:int4ge" }, + { NULL, "bool int48ge(int4,int8)", 1, "f:int48ge" }, + { PGSTROM, "bool int81ge(int8,int1)", 1, "f:int81ge" }, + { NULL, "bool int82ge(int8,int2)", 1, "f:int82ge" }, + { NULL, "bool int84ge(int8,int4)", 1, "f:int84ge" }, + { NULL, "bool int8ge(int8,int8)", 1, "f:int8ge" }, + { PGSTROM, "bool float2_ge(float2,float2)", 1, "f:float2ge" }, + { PGSTROM, "bool float24_ge(float2,float4)", 1, "f:float24ge" }, + { PGSTROM, "bool float28_ge(float2,float8)", 1, "f:float28ge" }, + { PGSTROM, "bool float42_ge(float4,float2)", 1, "f:float42ge" }, + { NULL, "bool float4ge(float4,float4)", 1, "f:float4ge" }, + { NULL, "bool float48ge(float4,float8)", 1, "f:float48ge" }, + { PGSTROM, "bool float82_ge(float8,float2)", 1, "f:float82ge" }, + { NULL, "bool float84ge(float8,float4)", 1, "f:float84ge" }, + { NULL, "bool float8ge(float8,float8)", 1, "f:float8ge" }, + + /* '<=' : relational less-than or equal-to */ + { PGSTROM, "bool int1le(int1,int1)", 1, "f:int1le" }, + { PGSTROM, "bool int12le(int1,int2)", 1, "f:int12le" }, + { PGSTROM, "bool int14le(int1,int4)", 1, "f:int14le" }, + { PGSTROM, "bool int18le(int1,int8)", 1, "f:int18le" }, + { PGSTROM, "bool int21le(int2,int1)", 1, "f:int21le" }, + { NULL, "bool int2le(int2,int2)", 1, "f:int2le" }, + { NULL, "bool int24le(int2,int4)", 1, "f:int24le" }, + { NULL, "bool int28le(int2,int8)", 1, "f:int28le" }, + { PGSTROM, "bool int41le(int4,int1)", 1, "f:int41le" }, + { NULL, "bool int42le(int4,int2)", 1, "f:int42le" }, + { NULL, "bool int4le(int4,int4)", 1, "f:int4le" }, + { NULL, "bool int48le(int4,int8)", 1, "f:int48le" }, + { PGSTROM, "bool int81le(int8,int1)", 1, "f:int81le" }, + { NULL, "bool int82le(int8,int2)", 1, "f:int82le" }, + { NULL, "bool int84le(int8,int4)", 1, "f:int84le" }, + { NULL, "bool int8le(int8,int8)", 1, "f:int8le" }, + { PGSTROM, "bool float2_le(float2,float2)", 1, "f:float2le" }, + { PGSTROM, "bool float24_le(float2,float4)", 1, "f:float24le" }, + { PGSTROM, "bool float28_le(float2,float8)", 1, "f:float28le" }, + { PGSTROM, "bool float42_le(float4,float2)", 1, "f:float42le" }, + { NULL, "bool float4le(float4,float4)", 1, "f:float4le" }, + { NULL, "bool float48le(float4,float8)", 1, "f:float48le" }, + { PGSTROM, "bool float82_le(float8,float2)", 1, "f:float82le" }, + { NULL, "bool float84le(float8,float4)", 1, "f:float84le" }, + { NULL, "bool float8le(float8,float8)", 1, "f:float8le" }, + + /* '&' : bitwise and */ + { PGSTROM, "int1 int1and(int1,int1)", 1, "p/f:int1and" }, + { NULL, "int2 int2and(int2,int2)", 1, "p/f:int2and" }, + { NULL, "int4 int4and(int4,int4)", 1, "p/f:int4and" }, + { NULL, "int8 int8and(int8,int8)", 1, "p/f:int8and" }, + + /* '|' : bitwise or */ + { PGSTROM, "int1 int1or(int1,int1)", 1, "p/f:int1or" }, + { NULL, "int2 int2or(int2,int2)", 1, "p/f:int2or" }, + { NULL, "int4 int4or(int4,int4)", 1, "p/f:int4or" }, + { NULL, "int8 int8or(int8,int8)", 1, "p/f:int8or" }, + + /* '#' : bitwise xor */ + { PGSTROM, "int1 int1xor(int1,int1)", 1, "p/f:int1xor" }, + { NULL, "int2 int2xor(int2,int2)", 1, "p/f:int2xor" }, + { NULL, "int4 int4xor(int4,int4)", 1, "p/f:int4xor" }, + { NULL, "int8 int8xor(int8,int8)", 1, "p/f:int8xor" }, + + /* '~' : bitwise not operators */ + { PGSTROM, "int1 
int1not(int1)", 1, "p/f:int1not" }, + { NULL, "int2 int2not(int2)", 1, "p/f:int2not" }, + { NULL, "int4 int4not(int4)", 1, "p/f:int4not" }, + { NULL, "int8 int8not(int8)", 1, "p/f:int8not" }, + + /* '>>' : right shift */ + { PGSTROM, "int1 int1shr(int1,int4)", 1, "p/f:int1shr" }, + { NULL, "int2 int2shr(int2,int4)", 1, "p/f:int2shr" }, + { NULL, "int4 int4shr(int4,int4)", 1, "p/f:int4shr" }, + { NULL, "int8 int8shr(int8,int4)", 1, "p/f:int8shr" }, + + /* '<<' : left shift */ + { PGSTROM, "int1 int1shl(int1,int4)", 1, "p/f:int1shl" }, + { NULL, "int2 int2shl(int2,int4)", 1, "p/f:int2shl" }, + { NULL, "int4 int4shl(int4,int4)", 1, "p/f:int4shl" }, + { NULL, "int8 int8shl(int8,int4)", 1, "p/f:int8shl" }, + + /* comparison functions */ + { NULL, "int4 btboolcmp(bool,bool)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 btint1cmp(int1,int1)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 btint12cmp(int1,int2)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 btint14cmp(int1,int4)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 btint18cmp(int1,int8)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 btint21cmp(int2,int1)", 1, "p/f:type_compare" }, + { NULL, "int4 btint2cmp(int2,int2)", 1, "p/f:type_compare" }, + { NULL, "int4 btint24cmp(int2,int4)", 1, "p/f:type_compare" }, + { NULL, "int4 btint28cmp(int2,int8)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 btint41cmp(int4,int1)", 1, "p/f:type_compare" }, + { NULL, "int4 btint42cmp(int4,int2)", 1, "p/f:type_compare" }, + { NULL, "int4 btint4cmp(int4,int4)", 1, "p/f:type_compare" }, + { NULL, "int4 btint48cmp(int4,int8)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 btint81cmp(int8,int1)", 1, "p/f:type_compare" }, + { NULL, "int4 btint82cmp(int8,int2)", 1, "p/f:type_compare" }, + { NULL, "int4 btint84cmp(int8,int4)", 1, "p/f:type_compare" }, + { NULL, "int4 btint8cmp(int8,int8)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 float2_cmp(float2,float2)", 1, "f:type_compare" }, + { PGSTROM, "int4 float24_cmp(float2,float4)", 1, "f:type_compare" }, + { PGSTROM, "int4 float28_cmp(float2,float8)", 1, "f:type_compare" }, + { PGSTROM, "int4 float42_cmp(float4,float2)", 1, "f:type_compare" }, + { NULL, "int4 btfloat4cmp(float4,float4)", 1, "p/f:type_compare" }, + { NULL, "int4 btfloat48cmp(float4,float8)", 1, "p/f:type_compare" }, + { NULL, "int4 btfloat84cmp(float8,float4)", 1, "p/f:type_compare" }, + { NULL, "int4 btfloat8cmp(float8,float8)", 1, "p/f:type_compare" }, + { PGSTROM, "int4 float82_cmp(float8,float2)", 1, "f:type_compare" }, + + /* currency cast */ + { NULL, "money money(numeric)", 1, "m/f:numeric_cash" }, + { NULL, "money money(int4)", 1, "m/f:int4_cash" }, + { NULL, "money money(int8)", 1, "m/f:int8_cash" }, + /* currency operators */ + { NULL, "money cash_pl(money,money)", 1, "m/f:cash_pl" }, + { NULL, "money cash_mi(money,money)", 1, "m/f:cash_mi" }, + { NULL, "float8 cash_div_cash(money,money)", 2, "m/f:cash_div_cash" }, + { PGSTROM, "money cash_mul_int1(money,int1)", 2, "m/f:cash_mul_int1" }, + { NULL, "money cash_mul_int2(money,int2)", 2, "m/f:cash_mul_int2" }, + { NULL, "money cash_mul_int4(money,int4)", 2, "m/f:cash_mul_int4" }, + { PGSTROM, "money cash_mul_flt2(money,float2)", 2, "m/f:cash_mul_flt2" }, + { NULL, "money cash_mul_flt4(money,float4)", 2, "m/f:cash_mul_flt4" }, + { NULL, "money cash_mul_flt8(money,float8)", 2, "m/f:cash_mul_flt8" }, + { PGSTROM, "money cash_div_int1(money,int1)", 2, "m/f:cash_div_int1" }, + { NULL, "money cash_div_int2(money,int2)", 2, "m/f:cash_div_int2" }, + { NULL, "money cash_div_int4(money,int4)", 2, 
"m/f:cash_div_int4" }, + { PGSTROM, "money cash_div_flt2(money,float2)", 2, "m/f:cash_div_flt2" }, + { NULL, "money cash_div_flt4(money,float4)", 2, "m/f:cash_div_flt4" }, + { NULL, "money cash_div_flt8(money,float8)", 2, "m/f:cash_div_flt8" }, + { PGSTROM, "money int1_mul_cash(int1,money)", 2, "m/f:int1_mul_cash" }, + { NULL, "money int2_mul_cash(int2,money)", 2, "m/f:int2_mul_cash" }, + { NULL, "money int4_mul_cash(int4,money)", 2, "m/f:int4_mul_cash" }, + { PGSTROM, "money flt2_mul_cash(float2,money)", 2, "m/f:flt2_mul_cash" }, + { NULL, "money flt4_mul_cash(float4,money)", 2, "m/f:flt4_mul_cash" }, + { NULL, "money flt8_mul_cash(float8,money)", 2, "m/f:flt8_mul_cash" }, + /* currency comparison */ + { NULL, "int4 cash_cmp(money,money)", 1, "m/f:type_compare" }, + { NULL, "bool cash_eq(money,money)", 1, "m/f:cash_eq" }, + { NULL, "bool cash_ne(money,money)", 1, "m/f:cash_ne" }, + { NULL, "bool cash_lt(money,money)", 1, "m/f:cash_lt" }, + { NULL, "bool cash_le(money,money)", 1, "m/f:cash_le" }, + { NULL, "bool cash_gt(money,money)", 1, "m/f:cash_gt" }, + { NULL, "bool cash_ge(money,money)", 1, "m/f:cash_ge" }, + /* uuid comparison */ + { NULL, "int4 uuid_cmp(uuid,uuid)", 5, "m/f:type_compare" }, + { NULL, "bool uuid_eq(uuid,uuid)", 5, "m/f:uuid_eq" }, + { NULL, "bool uuid_ne(uuid,uuid)", 5, "m/f:uuid_ne" }, + { NULL, "bool uuid_lt(uuid,uuid)", 5, "m/f:uuid_lt" }, + { NULL, "bool uuid_le(uuid,uuid)", 5, "m/f:uuid_le" }, + { NULL, "bool uuid_gt(uuid,uuid)", 5, "m/f:uuid_gt" }, + { NULL, "bool uuid_ge(uuid,uuid)", 5, "m/f:uuid_ge" }, + /* macaddr comparison */ + { NULL, "int4 macaddr_cmp(macaddr,macaddr)", 5, "m/f:type_compare" }, + { NULL, "bool macaddr_eq(macaddr,macaddr)", 5, "m/f:macaddr_eq" }, + { NULL, "bool macaddr_ne(macaddr,macaddr)", 5, "m/f:macaddr_ne" }, + { NULL, "bool macaddr_lt(macaddr,macaddr)", 5, "m/f:macaddr_lt" }, + { NULL, "bool macaddr_le(macaddr,macaddr)", 5, "m/f:macaddr_le" }, + { NULL, "bool macaddr_gt(macaddr,macaddr)", 5, "m/f:macaddr_gt" }, + { NULL, "bool macaddr_ge(macaddr,macaddr)", 5, "m/f:macaddr_ge" }, + /* inet comparison */ + { NULL, "int4 network_cmp(inet,inet)", 8, "m/f:type_compare" }, + { NULL, "bool network_eq(inet,inet)", 8, "m/f:network_eq" }, + { NULL, "bool network_ne(inet,inet)", 8, "m/f:network_ne" }, + { NULL, "bool network_lt(inet,inet)", 8, "m/f:network_lt" }, + { NULL, "bool network_le(inet,inet)", 8, "m/f:network_le" }, + { NULL, "bool network_gt(inet,inet)", 8, "m/f:network_gt" }, + { NULL, "bool network_ge(inet,inet)", 8, "m/f:network_ge" }, + { NULL, "inet network_larger(inet,inet)", 8, "m/f:network_larger" }, + { NULL, "inet network_smaller(inet,inet)",8, "m/f:network_smaller" }, + { NULL, "bool network_sub(inet,inet)", 8, "m/f:network_sub" }, + { NULL, "bool network_subeq(inet,inet)", 8, "m/f:network_subeq" }, + { NULL, "bool network_sup(inet,inet)", 8, "m/f:network_sup" }, + { NULL, "bool network_supeq(inet,inet)", 8, "m/f:network_supeq" }, + { NULL, "bool network_overlap(inet,inet)",8, "m/f:network_overlap" }, + + /* + * Mathmatical functions + */ + { PGSTROM, "int1 abs(int1)", 1, "p/f:int1abs" }, + { NULL, "int2 abs(int2)", 1, "p/f:int2abs" }, + { NULL, "int4 abs(int4)", 1, "p/f:int4abs" }, + { NULL, "int8 abs(int8)", 1, "p/f:int8abs" }, + { PGSTROM, "float2 abs(float2)", 1, "p/f:float2abs" }, + { NULL, "float4 abs(float4)", 1, "p/f:float4abs" }, + { NULL, "float8 abs(float8)", 1, "p/f:float8abs" }, + { NULL, "float8 cbrt(float8)", 1, "m/f:cbrt" }, + { NULL, "float8 dcbrt(float8)", 1, "m/f:cbrt" }, + { NULL, "float8 ceil(float8)", 
1, "m/f:ceil" }, + { NULL, "float8 ceiling(float8)", 1, "m/f:ceil" }, + { NULL, "float8 exp(float8)", 5, "m/f:exp" }, + { NULL, "float8 dexp(float8)", 5, "m/f:exp" }, + { NULL, "float8 floor(float8)", 1, "m/f:floor" }, + { NULL, "float8 ln(float8)", 5, "m/f:ln" }, + { NULL, "float8 dlog1(float8)", 5, "m/f:ln" }, + { NULL, "float8 log(float8)", 5, "m/f:log10" }, + { NULL, "float8 dlog10(float8)", 5, "m/f:log10" }, + { NULL, "float8 pi()", 0, "m/f:dpi" }, + { NULL, "float8 power(float8,float8)", 5, "m/f:dpow" }, + { NULL, "float8 pow(float8,float8)", 5, "m/f:dpow" }, + { NULL, "float8 dpow(float8,float8)", 5, "m/f:dpow" }, + { NULL, "float8 round(float8)", 5, "m/f:round" }, + { NULL, "float8 dround(float8)", 5, "m/f:round" }, + { NULL, "float8 sign(float8)", 1, "m/f:sign" }, + { NULL, "float8 sqrt(float8)", 5, "m/f:dsqrt" }, + { NULL, "float8 dsqrt(float8)", 5, "m/f:dsqrt" }, + { NULL, "float8 trunc(float8)", 1, "m/f:trunc" }, + { NULL, "float8 dtrunc(float8)", 1, "m/f:trunc" }, + + /* + * Trigonometric function + */ + { NULL, "float8 degrees(float8)", 5, "m/f:degrees" }, + { NULL, "float8 radians(float8)", 5, "m/f:radians" }, + { NULL, "float8 acos(float8)", 5, "m/f:acos" }, + { NULL, "float8 asin(float8)", 5, "m/f:asin" }, + { NULL, "float8 atan(float8)", 5, "m/f:atan" }, + { NULL, "float8 atan2(float8,float8)", 5, "m/f:atan2" }, + { NULL, "float8 cos(float8)", 5, "m/f:cos" }, + { NULL, "float8 cot(float8)", 5, "m/f:cot" }, + { NULL, "float8 sin(float8)", 5, "m/f:sin" }, + { NULL, "float8 tan(float8)", 5, "m/f:tan" }, + + /* + * Numeric functions + * ------------------------- */ + /* Numeric type cast functions */ + { PGSTROM, "int1 int1(numeric)", 8, "f:numeric_int1" }, + { NULL, "int2 int2(numeric)", 8, "f:numeric_int2" }, + { NULL, "int4 int4(numeric)", 8, "f:numeric_int4" }, + { NULL, "int8 int8(numeric)", 8, "f:numeric_int8" }, + { PGSTROM, "float2 float2(numeric)", 8, "f:numeric_float2" }, + { NULL, "float4 float4(numeric)", 8, "f:numeric_float4" }, + { NULL, "float8 float8(numeric)", 8, "f:numeric_float8" }, + { PGSTROM, "numeric numeric(int1)", 5, "f:int1_numeric" }, + { NULL, "numeric numeric(int2)", 5, "f:int2_numeric" }, + { NULL, "numeric numeric(int4)", 5, "f:int4_numeric" }, + { NULL, "numeric numeric(int8)", 5, "f:int8_numeric" }, + { PGSTROM, "numeric numeric(float2)", 5, "f:float2_numeric" }, + { NULL, "numeric numeric(float4)", 5, "f:float4_numeric" }, + { NULL, "numeric numeric(float8)", 5, "f:float8_numeric" }, + /* Numeric operators */ + { NULL, "numeric numeric_add(numeric,numeric)", 10, "f:numeric_add" }, + { NULL, "numeric numeric_sub(numeric,numeric)", 10, "f:numeric_sub" }, + { NULL, "numeric numeric_mul(numeric,numeric)", 10, "f:numeric_mul" }, + { NULL, "numeric numeric_uplus(numeric)", 10, "f:numeric_uplus" }, + { NULL, "numeric numeric_uminus(numeric)", 10, "f:numeric_uminus" }, + { NULL, "numeric numeric_abs(numeric)", 10, "f:numeric_abs" }, + { NULL, "numeric abs(numeric)", 10, "f:numeric_abs" }, + /* Numeric comparison */ + { NULL, "bool numeric_eq(numeric,numeric)", 8, "f:numeric_eq" }, + { NULL, "bool numeric_ne(numeric,numeric)", 8, "f:numeric_ne" }, + { NULL, "bool numeric_lt(numeric,numeric)", 8, "f:numeric_lt" }, + { NULL, "bool numeric_le(numeric,numeric)", 8, "f:numeric_le" }, + { NULL, "bool numeric_gt(numeric,numeric)", 8, "f:numeric_gt" }, + { NULL, "bool numeric_ge(numeric,numeric)", 8, "f:numeric_ge" }, + { NULL, "int4 numeric_cmp(numeric,numeric)", 8, "f:type_compare" }, + + /* + * Date and time functions + * ------------------------------- */ 
+ /* Type cast functions */ + { NULL, "date date(timestamp)", 1, "t/f:timestamp_date" }, + { NULL, "date date(timestamptz)", 1, "t/f:timestamptz_date" }, + { NULL, "time time(timetz)", 1, "t/f:timetz_time" }, + { NULL, "time time(timestamp)", 1, "t/f:timestamp_time" }, + { NULL, "time time(timestamptz)", 1, "t/f:timestamptz_time" }, + { NULL, "timetz timetz(time)", 1, "t/f:time_timetz" }, + { NULL, "timetz timetz(timestamptz)", 1, "t/f:timestamptz_timetz" }, +#ifdef NOT_USED + { NULL, "timetz timetz(timetz,int4)", 1, "t/f:timetz_scale" }, +#endif + { NULL, "timestamp timestamp(date)", + 1, "t/f:date_timestamp" }, + { NULL, "timestamp timestamp(timestamptz)", + 1, "t/f:timestamptz_timestamp" }, + { NULL, "timestamptz timestamptz(date)", + 1, "t/f:date_timestamptz" }, + { NULL, "timestamptz timestamptz(timestamp)", + 1, "t/f:timestamp_timestamptz" }, + /* timedata operators */ + { NULL, "date date_pli(date,int4)", 1, "t/f:date_pli" }, + { NULL, "date date_mii(date,int4)", 1, "t/f:date_mii" }, + { NULL, "int4 date_mi(date,date)", 1, "t/f:date_mi" }, + { NULL, "timestamp datetime_pl(date,time)", 2, "t/f:datetime_pl" }, + { NULL, "date integer_pl_date(int4,date)", 2, "t/f:integer_pl_date" }, + { NULL, "timestamp timedate_pl(time,date)", 2, "t/f:timedate_pl" }, + /* time - time => interval */ + { NULL, "interval time_mi_time(time,time)", + 2, "t/f:time_mi_time" }, + /* timestamp - timestamp => interval */ + { NULL, "interval timestamp_mi(timestamp,timestamp)", + 4, "t/f:timestamp_mi" }, + /* timetz +/- interval => timetz */ + { NULL, "timetz timetz_pl_interval(timetz,interval)", + 4, "t/f:timetz_pl_interval" }, + { NULL, "timetz timetz_mi_interval(timetz,interval)", + 4, "t/f:timetz_mi_interval" }, + /* timestamptz +/- interval => timestamptz */ + { NULL, "timestamptz timestamptz_pl_interval(timestamptz,interval)", + 4, "t/f:timestamptz_pl_interval" }, + { NULL, "timestamptz timestamptz_mi_interval(timestamptz,interval)", + 4, "t/f:timestamptz_mi_interval" }, + /* interval operators */ + { NULL, "interval interval_um(interval)", 4, "t/f:interval_um" }, + { NULL, "interval interval_pl(interval,interval)", 4, "t/f:interval_pl" }, + { NULL, "interval interval_mi(interval,interval)", 4, "t/f:interval_mi" }, + /* date + timetz => timestamptz */ + { NULL, "timestamptz datetimetz_pl(date,timetz)", + 4, "t/f:datetimetz_timestamptz" }, + { NULL, "timestamptz timestamptz(date,timetz)", + 4, "t/f:datetimetz_timestamptz" }, + /* comparison between date */ + { NULL, "bool date_eq(date,date)", 2, "t/f:date_eq" }, + { NULL, "bool date_ne(date,date)", 2, "t/f:date_ne" }, + { NULL, "bool date_lt(date,date)", 2, "t/f:date_lt" }, + { NULL, "bool date_le(date,date)", 2, "t/f:date_le" }, + { NULL, "bool date_gt(date,date)", 2, "t/f:date_gt" }, + { NULL, "bool date_ge(date,date)", 2, "t/f:date_ge" }, + { NULL, "int4 date_cmp(date,date)", 2, "t/f:type_compare" }, + /* comparison of date and timestamp */ + { NULL, "bool date_eq_timestamp(date,timestamp)", + 2, "t/f:date_eq_timestamp" }, + { NULL, "bool date_ne_timestamp(date,timestamp)", + 2, "t/f:date_ne_timestamp" }, + { NULL, "bool date_lt_timestamp(date,timestamp)", + 2, "t/f:date_lt_timestamp" }, + { NULL, "bool date_le_timestamp(date,timestamp)", + 2, "t/f:date_le_timestamp" }, + { NULL, "bool date_gt_timestamp(date,timestamp)", + 2, "t/f:date_gt_timestamp" }, + { NULL, "bool date_ge_timestamp(date,timestamp)", + 2, "t/f:date_ge_timestamp" }, + { NULL, "int4 date_cmp_timestamp(date,timestamp)", + 2, "t/f:date_cmp_timestamp" }, + /* comparison between time */ + { 
NULL, "bool time_eq(time,time)", 2, "t/f:time_eq" }, + { NULL, "bool time_ne(time,time)", 2, "t/f:time_ne" }, + { NULL, "bool time_lt(time,time)", 2, "t/f:time_lt" }, + { NULL, "bool time_le(time,time)", 2, "t/f:time_le" }, + { NULL, "bool time_gt(time,time)", 2, "t/f:time_gt" }, + { NULL, "bool time_ge(time,time)", 2, "t/f:time_ge" }, + { NULL, "int4 time_cmp(time,time)",2, "t/f:type_compare" }, + /* comparison between timetz */ + { NULL, "bool timetz_eq(timetz,timetz)", 1, "t/f:timetz_eq" }, + { NULL, "bool timetz_ne(timetz,timetz)", 1, "t/f:timetz_ne" }, + { NULL, "bool timetz_lt(timetz,timetz)", 1, "t/f:timetz_lt" }, + { NULL, "bool timetz_le(timetz,timetz)", 1, "t/f:timetz_le" }, + { NULL, "bool timetz_ge(timetz,timetz)", 1, "t/f:timetz_ge" }, + { NULL, "bool timetz_gt(timetz,timetz)", 1, "t/f:timetz_gt" }, + { NULL, "int4 timetz_cmp(timetz,timetz)",1, "t/f:timetz_cmp" }, + /* comparison between timestamp */ + { NULL, "bool timestamp_eq(timestamp,timestamp)", 1, "t/f:timestamp_eq" }, + { NULL, "bool timestamp_ne(timestamp,timestamp)", 1, "t/f:timestamp_ne" }, + { NULL, "bool timestamp_lt(timestamp,timestamp)", 1, "t/f:timestamp_lt" }, + { NULL, "bool timestamp_le(timestamp,timestamp)", 1, "t/f:timestamp_le" }, + { NULL, "bool timestamp_gt(timestamp,timestamp)", 1, "t/f:timestamp_gt" }, + { NULL, "bool timestamp_ge(timestamp,timestamp)", 1, "t/f:timestamp_ge" }, + { NULL, "int4 timestamp_cmp(timestamp,timestamp)",1, "t/f:timestamp_cmp"}, + /* comparison of timestamp and date */ + { NULL, "bool timestamp_eq_date(timestamp,date)", + 3, "t/f:timestamp_eq_date" }, + { NULL, "bool timestamp_ne_date(timestamp,date)", + 3, "t/f:timestamp_ne_date" }, + { NULL, "bool timestamp_lt_date(timestamp,date)", + 3, "t/f:timestamp_lt_date" }, + { NULL, "bool timestamp_le_date(timestamp,date)", + 3, "t/f:timestamp_le_date" }, + { NULL, "bool timestamp_gt_date(timestamp,date)", + 3, "t/f:timestamp_gt_date" }, + { NULL, "bool timestamp_ge_date(timestamp,date)", + 3, "t/f:timestamp_ge_date" }, + { NULL, "int4 timestamp_cmp_date(timestamp,date)", + 3, "t/f:timestamp_cmp_date"}, + /* comparison between timestamptz */ + { NULL, "bool timestamptz_eq(timestamptz,timestamptz)", + 1, "t/f:timestamptz_eq" }, + { NULL, "bool timestamptz_ne(timestamptz,timestamptz)", + 1, "t/f:timestamptz_ne" }, + { NULL, "bool timestamptz_lt(timestamptz,timestamptz)", + 1, "t/f:timestamptz_lt" }, + { NULL, "bool timestamptz_le(timestamptz,timestamptz)", + 1, "t/f:timestamptz_le" }, + { NULL, "bool timestamptz_gt(timestamptz,timestamptz)", + 1, "t/f:timestamptz_gt" }, + { NULL, "bool timestamptz_ge(timestamptz,timestamptz)", + 1, "t/f:timestamptz_ge" }, + { NULL, "int4 timestamptz_cmp(timestamptz,timestamptz)", + 1, "t/f:type_compare" }, + /* comparison between date and timestamptz */ + { NULL, "bool date_lt_timestamptz(date,timestamptz)", + 3, "t/f:date_lt_timestamptz" }, + { NULL, "bool date_le_timestamptz(date,timestamptz)", + 3, "t/f:date_le_timestamptz" }, + { NULL, "bool date_eq_timestamptz(date,timestamptz)", + 3, "t/f:date_eq_timestamptz" }, + { NULL, "bool date_ge_timestamptz(date,timestamptz)", + 3, "t/f:date_ge_timestamptz" }, + { NULL, "bool date_gt_timestamptz(date,timestamptz)", + 3, "t/f:date_gt_timestamptz" }, + { NULL, "bool date_ne_timestamptz(date,timestamptz)", + 3, "t/f:date_ne_timestamptz" }, + /* comparison between timestamptz and date */ + { NULL, "bool timestamptz_lt_date(timestamptz,date)", + 3, "t/f:timestamptz_lt_date" }, + { NULL, "bool timestamptz_le_date(timestamptz,date)", + 3, 
"t/f:timestamptz_le_date" }, + { NULL, "bool timestamptz_eq_date(timestamptz,date)", + 3, "t/f:timestamptz_eq_date" }, + { NULL, "bool timestamptz_ge_date(timestamptz,date)", + 3, "t/f:timestamptz_ge_date" }, + { NULL, "bool timestamptz_gt_date(timestamptz,date)", + 3, "t/f:timestamptz_gt_date" }, + { NULL, "bool timestamptz_ne_date(timestamptz,date)", + 3, "t/f:timestamptz_ne_date" }, + /* comparison between timestamp and timestamptz */ + { NULL, "bool timestamp_lt_timestamptz(timestamp,timestamptz)", + 2, "t/f:timestamp_lt_timestamptz" }, + { NULL, "bool timestamp_le_timestamptz(timestamp,timestamptz)", + 2, "t/f:timestamp_le_timestamptz" }, + { NULL, "bool timestamp_eq_timestamptz(timestamp,timestamptz)", + 2, "t/f:timestamp_eq_timestamptz" }, + { NULL, "bool timestamp_ge_timestamptz(timestamp,timestamptz)", + 2, "t/f:timestamp_ge_timestamptz" }, + { NULL, "bool timestamp_gt_timestamptz(timestamp,timestamptz)", + 2, "t/f:timestamp_gt_timestamptz" }, + { NULL, "bool timestamp_ne_timestamptz(timestamp,timestamptz)", + 2, "t/f:timestamp_ne_timestamptz" }, + /* comparison between timestamptz and timestamp */ + { NULL, "bool timestamptz_lt_timestamp(timestamptz,timestamp)", + 2, "t/f:timestamptz_lt_timestamp" }, + { NULL, "bool timestamptz_le_timestamp(timestamptz,timestamp)", + 2, "t/f:timestamptz_le_timestamp" }, + { NULL, "bool timestamptz_eq_timestamp(timestamptz,timestamp)", + 2, "t/f:timestamptz_eq_timestamp" }, + { NULL, "bool timestamptz_ge_timestamp(timestamptz,timestamp)", + 2, "t/f:timestamptz_ge_timestamp" }, + { NULL, "bool timestamptz_gt_timestamp(timestamptz,timestamp)", + 2, "t/f:timestamptz_gt_timestamp" }, + { NULL, "bool timestamptz_ne_timestamp(timestamptz,timestamp)", + 2, "t/f:timestamptz_ne_timestamp" }, + /* comparison between intervals */ + { NULL, "bool interval_eq(interval,interval)", 2, "t/f:interval_eq" }, + { NULL, "bool interval_ne(interval,interval)", 2, "t/f:interval_ne" }, + { NULL, "bool interval_lt(interval,interval)", 2, "t/f:interval_lt" }, + { NULL, "bool interval_le(interval,interval)", 2, "t/f:interval_le" }, + { NULL, "bool interval_ge(interval,interval)", 2, "t/f:interval_ge" }, + { NULL, "bool interval_gt(interval,interval)", 2, "t/f:interval_gt" }, + { NULL, "int4 interval_cmp(interval,interval)",2, "t/f:interval_cmp"}, + /* overlaps() */ + { NULL, "bool overlaps(time,time,time,time)", + 20, "t/f:overlaps_time" }, + { NULL, "bool overlaps(timetz,timetz,timetz,timetz)", + 20, "t/f:overlaps_timetz" }, + { NULL, "bool overlaps(timestamp,timestamp,timestamp,timestamp)", + 20, "t/f:overlaps_timestamp" }, + { NULL, "bool overlaps(timestamptz,timestamptz,timestamptz,timestamptz)", + 20, "t/f:overlaps_timestamptz" }, + /* extract() - PG14 changed to return numeric, not float8 */ + { NULL, "float8 date_part(text,timestamp)", + 100, "t/f:date_part_timestamp"}, + { NULL, "float8 date_part(text,timestamptz)", + 100, "t/f:date_part_timestamptz"}, + { NULL, "float8 date_part(text,interval)", + 100, "t/f:date_part_interval"}, + { NULL, "float8 date_part(text,timetz)", + 100, "t/f:date_part_timetz"}, + { NULL, "float8 date_part(text,time)", + 100, "t/f:date_part_time"}, + + { NULL, "numeric extract(text,timestamp)", + 100, "t/f:extract_timestamp"}, + { NULL, "numeric extract(text,timestamptz)", + 100, "t/f:extract_timestamptz"}, + { NULL, "numeric extract(text,time)", + 100, "t/f:extract_time"}, + { NULL, "numeric extract(text,timetz)", + 100, "t/f:extract_timetz"}, + { NULL, "numeric extract(text,interval)", + 100, "t/f:extract_interval"}, + + /* other time and 
date functions */ + { NULL, "timestamptz now()", 1, "t/f:now" }, + + /* macaddr functions */ + { NULL, "macaddr trunc(macaddr)", 8, "m/f:macaddr_trunc" }, + { NULL, "macaddr macaddr_not(macaddr)", 8, "m/f:macaddr_not" }, + { NULL, "macaddr macaddr_and(macaddr,macaddr)", 8, "m/f:macaddr_and" }, + { NULL, "macaddr macaddr_or(macaddr,macaddr)", 8, "m/f:macaddr_or" }, + + /* inet/cidr functions */ + { NULL, "inet set_masklen(inet,int4)", 8, "m/f:inet_set_masklen" }, + { NULL, "cidr set_masklen(cidr,int4)", 8, "m/f:cidr_set_masklen" }, + { NULL, "int4 family(inet)", 8, "m/f:inet_family" }, + { NULL, "cidr network(inet)", 8, "m/f:network_network" }, + { NULL, "inet netmask(inet)", 8, "m/f:inet_netmask" }, + { NULL, "int4 masklen(inet)", 8, "m/f:inet_masklen" }, + { NULL, "inet broadcast(inet)", 8, "m/f:inet_broadcast" }, + { NULL, "inet hostmask(inet)", 8, "m/f:inet_hostmask" }, + { NULL, "cidr cidr(inet)", 8, "m/f:inet_to_cidr" }, + { NULL, "inet inetnot(inet)", 8, "m/f:inet_not" }, + { NULL, "inet inetand(inet,inet)", 8, "m/f:inet_and" }, + { NULL, "inet inetor(inet,inet)", 8, "m/f:inet_or" }, + { NULL, "inet inetpl(inet,int8)", 8, "m/f:inetpl_int8" }, + { NULL, "inet inetmi_int8(inet,int8)", 8, "m/f:inetmi_int8" }, + { NULL, "int8 inetmi(inet,inet)", 8, "m/f:inetmi" }, + { NULL, "bool inet_same_family(inet,inet)", 8, "m/f:inet_same_family" }, +// { NULL, "inet inet_merge(inet,inet)", 8, "m/f:inet_merge" }, + + /* + * Text functions + */ + { NULL, "bool bpchareq(bpchar,bpchar)", 200, "s/f:bpchareq" }, + { NULL, "bool bpcharne(bpchar,bpchar)", 200, "s/f:bpcharne" }, + { NULL, "bool bpcharlt(bpchar,bpchar)", 200, "sL/f:bpcharlt" }, + { NULL, "bool bpcharle(bpchar,bpchar)", 200, "sL/f:bpcharle" }, + { NULL, "bool bpchargt(bpchar,bpchar)", 200, "sL/f:bpchargt" }, + { NULL, "bool bpcharge(bpchar,bpchar)", 200, "sL/f:bpcharge" }, + { NULL, "int4 bpcharcmp(bpchar,bpchar)",200, "sL/f:type_compare"}, + { NULL, "int4 length(bpchar)", 2, "sL/f:bpcharlen"}, + { NULL, "bool texteq(text,text)", 200, "s/f:texteq" }, + { NULL, "bool textne(text,text)", 200, "s/f:textne" }, + { NULL, "bool text_lt(text,text)", 200, "sL/f:text_lt" }, + { NULL, "bool text_le(text,text)", 200, "sL/f:text_le" }, + { NULL, "bool text_gt(text,text)", 200, "sL/f:text_gt" }, + { NULL, "bool text_ge(text,text)", 200, "sL/f:text_ge" }, + { NULL, "int4 bttextcmp(text,text)", 200, "sL/f:type_compare" }, + /* LIKE operators */ + { NULL, "bool like(text,text)", 9999, "s/f:textlike" }, + { NULL, "bool textlike(text,text)", 9999, "s/f:textlike" }, + { NULL, "bool bpcharlike(bpchar,text)", 9999, "s/f:bpcharlike" }, + { NULL, "bool notlike(text,text)", 9999, "s/f:textnlike" }, + { NULL, "bool textnlike(text,text)", 9999, "s/f:textnlike" }, + { NULL, "bool bpcharnlike(bpchar,text)", 9999, "s/f:bpcharnlike" }, + /* ILIKE operators */ + { NULL, "bool texticlike(text,text)", 9999, "Ls/f:texticlike" }, + { NULL, "bool bpchariclike(bpchar,text)", 9999, "Ls/f:bpchariclike" }, + { NULL, "bool texticnlike(text,text)", 9999, "Ls/f:texticnlike" }, + { NULL, "bool bpcharicnlike(bpchar,text)",9999, "Ls/f:bpcharicnlike" }, + /* string operations */ + { NULL, "int4 length(text)", 2, "s/f:textlen" }, + { NULL, "text textcat(text,text)", + 999, "Cs/f:textcat", + vlbuf_estimate_textcat + }, + { NULL, "text concat(text,text)", + 999, "Cs/f:text_concat2", + vlbuf_estimate_textcat + }, + { NULL, "text concat(text,text,text)", + 999, "Cs/f:text_concat3", + vlbuf_estimate_textcat + }, + { NULL, "text concat(text,text,text,text)", + 999, "Cs/f:text_concat4", + 
vlbuf_estimate_textcat + }, + { NULL, "text substr(text,int4,int4)", + 10, "Cs/f:text_substring", + vlbuf_estimate_substring + }, + { NULL, "text substring(text,int4,int4)", + 10, "Cs/f:text_substring", + vlbuf_estimate_substring + }, + { NULL, "text substr(text,int4)", + 10, "Cs/f:text_substring_nolen", + vlbuf_estimate_substring + }, + { NULL, "text substring(text,int4)", + 10, "Cs/f:text_substring_nolen", + vlbuf_estimate_substring + }, + /* jsonb operators */ + { NULL, "jsonb jsonb_object_field(jsonb,text)", + 1000, "jC/f:jsonb_object_field", + vlbuf_estimate_jsonb + }, + { NULL, "text jsonb_object_field_text(jsonb,text)", + 1000, "jC/f:jsonb_object_field_text", + vlbuf_estimate_jsonb + }, + { NULL, "jsonb jsonb_array_element(jsonb,int4)", + 1000, "jC/f:jsonb_array_element", + vlbuf_estimate_jsonb + }, + { NULL, "text jsonb_array_element_text(jsonb,int4)", + 1000, "jC/f:jsonb_array_element_text", + vlbuf_estimate_jsonb + }, + { NULL, "bool jsonb_exists(jsonb,text)", + 100, "j/f:jsonb_exists" + }, + /* + * int4range operators + */ + { NULL, "int4 lower(int4range)", 2, "r/f:int4range_lower" }, + { NULL, "int4 upper(int4range)", 2, "r/f:int4range_upper" }, + { NULL, "bool isempty(int4range)", 1, "r/f:int4range_isempty" }, + { NULL, "bool lower_inc(int4range)", 1, "r/f:int4range_lower_inc" }, + { NULL, "bool upper_inc(int4range)", 1, "r/f:int4range_upper_inc" }, + { NULL, "bool lower_inf(int4range)", 1, "r/f:int4range_lower_inf" }, + { NULL, "bool upper_inf(int4range)", 1, "r/f:int4range_upper_inf" }, + { NULL, "bool range_eq(int4range,int4range)", 2, "r/f:int4range_eq" }, + { NULL, "bool range_ne(int4range,int4range)", 2, "r/f:int4range_ne" }, + { NULL, "bool range_lt(int4range,int4range)", 2, "r/f:int4range_lt" }, + { NULL, "bool range_le(int4range,int4range)", 2, "r/f:int4range_le" }, + { NULL, "bool range_gt(int4range,int4range)", 2, "r/f:int4range_gt" }, + { NULL, "bool range_ge(int4range,int4range)", 2, "r/f:int4range_ge" }, + { NULL, "int4 range_cmp(int4range,int4range)",2, "r/f:int4range_cmp"}, + { NULL, "bool range_overlaps(int4range,int4range)", + 4, "r/f:int4range_overlaps" }, + { NULL, "bool range_contains_elem(int4range,int4)", + 4, "r/f:int4range_contains_elem" }, + { NULL, "bool range_contains(int4range,int4range)", + 4, "r/f:int4range_contains" }, + { NULL, "bool elem_contained_by_range(int4,int4range)", + 4, "r/f:elem_contained_by_int4range" }, + { NULL, "bool range_contained_by(int4range,int4range)", + 4, "r/f:int4range_contained_by" }, + { NULL, "bool range_adjacent(int4range,int4range)", + 4, "r/f:int4range_adjacent" }, + { NULL, "bool range_before(int4range,int4range)", + 4, "r/f:int4range_before" }, + { NULL, "bool range_after(int4range,int4range)", + 4, "r/f:int4range_after" }, + { NULL, "bool range_overleft(int4range,int4range)", + 4, "r/f:int4range_overleft" }, + { NULL, "bool range_overright(int4range,int4range)", + 4, "r/f:int4range_overright" }, + { NULL, "int4range range_union(int4range,int4range)", + 4, "r/f:int4range_union" }, + { NULL, "int4range range_merge(int4range,int4range)", + 4, "r/f:int4range_merge" }, + { NULL, "int4range range_intersect(int4range,int4range)", + 4, "r/f:int4range_intersect" }, + { NULL, "int4range range_minus(int4range,int4range)", + 4, "r/f:int4range_minus" }, + /* + * int8range operators + */ + { NULL, "int8 lower(int8range)", 2, "r/f:int8range_lower" }, + { NULL, "int8 upper(int8range)", 2, "r/f:int8range_upper" }, + { NULL, "bool isempty(int8range)", 1, "r/f:int8range_isempty" }, + { NULL, "bool lower_inc(int8range)", 1, 
"r/f:int8range_lower_inc" }, + { NULL, "bool upper_inc(int8range)", 1, "r/f:int8range_upper_inc" }, + { NULL, "bool lower_inf(int8range)", 1, "r/f:int8range_lower_inf" }, + { NULL, "bool upper_inf(int8range)", 1, "r/f:int8range_upper_inf" }, + { NULL, "bool range_eq(int8range,int8range)", 2, "r/f:int8range_eq" }, + { NULL, "bool range_ne(int8range,int8range)", 2, "r/f:int8range_ne" }, + { NULL, "bool range_lt(int8range,int8range)", 2, "r/f:int8range_lt" }, + { NULL, "bool range_le(int8range,int8range)", 2, "r/f:int8range_le" }, + { NULL, "bool range_gt(int8range,int8range)", 2, "r/f:int8range_gt" }, + { NULL, "bool range_ge(int8range,int8range)", 2, "r/f:int8range_ge" }, + { NULL, "int4 range_cmp(int8range,int8range)",2, "r/f:int8range_cmp"}, + { NULL, "bool range_overlaps(int8range,int8range)", + 4, "r/f:int8range_overlaps" }, + { NULL, "bool range_contains_elem(int8range,int8)", + 4, "r/f:int8range_contains_elem" }, + { NULL, "bool range_contains(int8range,int8range)", + 4, "r/f:int8range_contains" }, + { NULL, "bool elem_contained_by_range(int8,int8range)", + 4, "r/f:elem_contained_by_int8range" }, + { NULL, "bool range_contained_by(int8range,int8range)", + 4, "r/f:int8range_contained_by" }, + { NULL, "bool range_adjacent(int8range,int8range)", + 4, "r/f:int8range_adjacent" }, + { NULL, "bool range_before(int8range,int8range)", + 4, "r/f:int8range_before" }, + { NULL, "bool range_after(int8range,int8range)", + 4, "r/f:int8range_after" }, + { NULL, "bool range_overleft(int8range,int8range)", + 4, "r/f:int8range_overleft" }, + { NULL, "bool range_overright(int8range,int8range)", + 4, "r/f:int8range_overright" }, + { NULL, "int8range range_union(int8range,int8range)", + 4, "r/f:int8range_union" }, + { NULL, "int8range range_merge(int8range,int8range)", + 4, "r/f:int8range_merge" }, + { NULL, "int8range range_intersect(int8range,int8range)", + 4, "r/f:int8range_intersect" }, + { NULL, "int8range range_minus(int8range,int8range)", + 4, "r/f:int8range_minus" }, + /* + * tsrange operators + */ + { NULL, "timestamp lower(tsrange)", 2, "r/f:tsrange_lower" }, + { NULL, "timestamp upper(tsrange)", 2, "r/f:tsrange_upper" }, + { NULL, "bool isempty(tsrange)", 1, "r/f:tsrange_isempty" }, + { NULL, "bool lower_inc(tsrange)", 1, "r/f:tsrange_lower_inc" }, + { NULL, "bool upper_inc(tsrange)", 1, "r/f:tsrange_upper_inc" }, + { NULL, "bool lower_inf(tsrange)", 1, "r/f:tsrange_lower_inf" }, + { NULL, "bool upper_inf(tsrange)", 1, "r/f:tsrange_upper_inf" }, + { NULL, "bool range_eq(tsrange,tsrange)", 2, "r/f:tsrange_eq" }, + { NULL, "bool range_ne(tsrange,tsrange)", 2, "r/f:tsrange_ne" }, + { NULL, "bool range_lt(tsrange,tsrange)", 2, "r/f:tsrange_lt" }, + { NULL, "bool range_le(tsrange,tsrange)", 2, "r/f:tsrange_le" }, + { NULL, "bool range_gt(tsrange,tsrange)", 2, "r/f:tsrange_gt" }, + { NULL, "bool range_ge(tsrange,tsrange)", 2, "r/f:tsrange_ge" }, + { NULL, "int4 range_cmp(tsrange,tsrange)",2, "r/f:tsrange_cmp"}, + { NULL, "bool range_overlaps(tsrange,tsrange)", + 4, "r/f:tsrange_overlaps" }, + { NULL, "bool range_contains_elem(tsrange,timestamp)", + 4, "r/f:tsrange_contains_elem" }, + { NULL, "bool range_contains(tsrange,tsrange)", + 4, "r/f:tsrange_contains" }, + { NULL, "bool elem_contained_by_range(timestamp,tsrange)", + 4, "r/f:elem_contained_by_tsrange" }, + { NULL, "bool range_contained_by(tsrange,tsrange)", + 4, "r/f:tsrange_contained_by" }, + { NULL, "bool range_adjacent(tsrange,tsrange)", + 4, "r/f:tsrange_adjacent" }, + { NULL, "bool range_before(tsrange,tsrange)", + 4, "r/f:tsrange_before" 
}, + { NULL, "bool range_after(tsrange,tsrange)", + 4, "r/f:tsrange_after" }, + { NULL, "bool range_overleft(tsrange,tsrange)", + 4, "r/f:tsrange_overleft" }, + { NULL, "bool range_overright(tsrange,tsrange)", + 4, "r/f:tsrange_overright" }, + { NULL, "tsrange range_union(tsrange,tsrange)", + 4, "r/f:tsrange_union" }, + { NULL, "tsrange range_merge(tsrange,tsrange)", + 4, "r/f:tsrange_merge" }, + { NULL, "tsrange range_intersect(tsrange,tsrange)", + 4, "r/f:tsrange_intersect" }, + { NULL, "tsrange range_minus(tsrange,tsrange)", + 4, "r/f:tsrange_minus" }, + /* + * tstzrange operators + */ + { NULL, "timestamptz lower(tstzrange)", 2, "r/f:tstzrange_lower" }, + { NULL, "timestamptz upper(tstzrange)", 2, "r/f:tstzrange_upper" }, + { NULL, "bool isempty(tstzrange)", 1, "r/f:tstzrange_isempty" }, + { NULL, "bool lower_inc(tstzrange)", 1, "r/f:tstzrange_lower_inc" }, + { NULL, "bool upper_inc(tstzrange)", 1, "r/f:tstzrange_upper_inc" }, + { NULL, "bool lower_inf(tstzrange)", 1, "r/f:tstzrange_lower_inf" }, + { NULL, "bool upper_inf(tstzrange)", 1, "r/f:tstzrange_upper_inf" }, + { NULL, "bool range_eq(tstzrange,tstzrange)", 2, "r/f:tstzrange_eq" }, + { NULL, "bool range_ne(tstzrange,tstzrange)", 2, "r/f:tstzrange_ne" }, + { NULL, "bool range_lt(tstzrange,tstzrange)", 2, "r/f:tstzrange_lt" }, + { NULL, "bool range_le(tstzrange,tstzrange)", 2, "r/f:tstzrange_le" }, + { NULL, "bool range_gt(tstzrange,tstzrange)", 2, "r/f:tstzrange_gt" }, + { NULL, "bool range_ge(tstzrange,tstzrange)", 2, "r/f:tstzrange_ge" }, + { NULL, "int4 range_cmp(tstzrange,tstzrange)",2, "r/f:tstzrange_cmp"}, + { NULL, "bool range_overlaps(tstzrange,tstzrange)", + 4, "r/f:tstzrange_overlaps" }, + { NULL, "bool range_contains_elem(tstzrange,timestamptz)", + 4, "r/f:tstzrange_contains_elem" }, + { NULL, "bool range_contains(tstzrange,tstzrange)", + 4, "r/f:tstzrange_contains" }, + { NULL, "bool elem_contained_by_range(timestamptz,tstzrange)", + 4, "r/f:elem_contained_by_tstzrange" }, + { NULL, "bool range_contained_by(tstzrange,tstzrange)", + 4, "r/f:tstzrange_contained_by" }, + { NULL, "bool range_adjacent(tstzrange,tstzrange)", + 4, "r/f:tstzrange_adjacent" }, + { NULL, "bool range_before(tstzrange,tstzrange)", + 4, "r/f:tstzrange_before" }, + { NULL, "bool range_after(tstzrange,tstzrange)", + 4, "r/f:tstzrange_after" }, + { NULL, "bool range_overleft(tstzrange,tstzrange)", + 4, "r/f:tstzrange_overleft" }, + { NULL, "bool range_overright(tstzrange,tstzrange)", + 4, "r/f:tstzrange_overright" }, + { NULL, "tstzrange range_union(tstzrange,tstzrange)", + 4, "r/f:tstzrange_union" }, + { NULL, "tstzrange range_merge(tstzrange,tstzrange)", + 4, "r/f:tstzrange_merge" }, + { NULL, "tstzrange range_intersect(tstzrange,tstzrange)", + 4, "r/f:tstzrange_intersect" }, + { NULL, "tstzrange range_minus(tstzrange,tstzrange)", + 4, "r/f:tstzrange_minus" }, + /* + * daterange operators + */ + { NULL, "date lower(daterange)", 2, "r/f:daterange_lower" }, + { NULL, "date upper(daterange)", 2, "r/f:daterange_upper" }, + { NULL, "bool isempty(daterange)", 1, "r/f:daterange_isempty" }, + { NULL, "bool lower_inc(daterange)", 1, "r/f:daterange_lower_inc" }, + { NULL, "bool upper_inc(daterange)", 1, "r/f:daterange_upper_inc" }, + { NULL, "bool lower_inf(daterange)", 1, "r/f:daterange_lower_inf" }, + { NULL, "bool upper_inf(daterange)", 1, "r/f:daterange_upper_inf" }, + { NULL, "bool range_eq(daterange,daterange)", 2, "r/f:daterange_eq" }, + { NULL, "bool range_ne(daterange,daterange)", 2, "r/f:daterange_ne" }, + { NULL, "bool 
range_lt(daterange,daterange)", 2, "r/f:daterange_lt" }, + { NULL, "bool range_le(daterange,daterange)", 2, "r/f:daterange_le" }, + { NULL, "bool range_gt(daterange,daterange)", 2, "r/f:daterange_gt" }, + { NULL, "bool range_ge(daterange,daterange)", 2, "r/f:daterange_ge" }, + { NULL, "int4 range_cmp(daterange,daterange)",2, "r/f:daterange_cmp"}, + { NULL, "bool range_overlaps(daterange,daterange)", + 4, "r/f:daterange_overlaps" }, + { NULL, "bool range_contains_elem(daterange,date)", + 4, "r/f:daterange_contains_elem" }, + { NULL, "bool range_contains(daterange,daterange)", + 4, "r/f:daterange_contains" }, + { NULL, "bool elem_contained_by_range(date,daterange)", + 4, "r/f:elem_contained_by_daterange" }, + { NULL, "bool range_contained_by(daterange,daterange)", + 4, "r/f:daterange_contained_by" }, + { NULL, "bool range_adjacent(daterange,daterange)", + 4, "r/f:daterange_adjacent" }, + { NULL, "bool range_before(daterange,daterange)", + 4, "r/f:daterange_before" }, + { NULL, "bool range_after(daterange,daterange)", + 4, "r/f:daterange_after" }, + { NULL, "bool range_overleft(daterange,daterange)", + 4, "r/f:daterange_overleft" }, + { NULL, "bool range_overright(daterange,daterange)", + 4, "r/f:daterange_overright" }, + { NULL, "daterange range_union(daterange,daterange)", + 4, "r/f:daterange_union" }, + { NULL, "daterange range_merge(daterange,daterange)", + 4, "r/f:daterange_merge" }, + { NULL, "daterange range_intersect(daterange,daterange)", + 4, "r/f:daterange_intersect" }, + { NULL, "daterange range_minus(daterange,daterange)", + 4, "r/f:daterange_minus" }, + + /* + * PostGIS functions + */ + { POSTGIS3, "geometry st_setsrid(geometry,int4)", + 1, "g/f:st_setsrid" }, + { POSTGIS3, "geometry st_point(float8,float8)", + 10, "gC/f:st_makepoint2", + vlbuf_estimate__st_makepoint }, + { POSTGIS3, "geometry st_makepoint(float8,float8)", + 10, "gC/f:st_makepoint2", + vlbuf_estimate__st_makepoint }, + { POSTGIS3, "geometry st_makepoint(float8,float8,float8)", + 10, "gC/f:st_makepoint3", + vlbuf_estimate__st_makepoint }, + { POSTGIS3, "geometry st_makepoint(float8,float8,float8,float8)", + 10, "gC/f:st_makepoint4", + vlbuf_estimate__st_makepoint }, + { POSTGIS3, "float8 st_distance(geometry,geometry)", + 50, "g/f:st_distance" }, + { POSTGIS3, "bool st_dwithin(geometry,geometry,float8)", + 50, "g/f:st_dwithin" }, + { POSTGIS3, "int4 st_linecrossingdirection(geometry,geometry)", + 50, "g/f:st_linecrossingdirection" }, + { POSTGIS3, "text st_relate(geometry,geometry)", + 999, "g/f:st_relate", + vlbuf_estimate__st_relate }, + { POSTGIS3, "bool st_contains(geometry,geometry)", + 999, "g/f:st_contains" }, + { POSTGIS3, "bool st_crosses(geometry,geometry)", + 999, "g/f:st_crosses" }, + { POSTGIS3, "bool geometry_overlaps(geometry,geometry)", + 10, "g/f:geometry_overlaps" }, + { POSTGIS3, "bool overlaps_2d(box2df,geometry)", + 10, "g/f:box2df_geometry_overlaps" }, + { POSTGIS3, "bool geometry_contains(geometry,geometry)", + 10, "g/f:geometry_contains" }, + { POSTGIS3, "bool contains_2d(box2df,geometry)", + 10, "g/f:box2df_geometry_contains" }, + { POSTGIS3, "bool geometry_within(geometry,geometry)", + 10, "g/f:geometry_within" }, + { POSTGIS3, "bool is_contained_2d(box2df,geometry)", + 10, "g/f:box2df_geometry_within" }, + { POSTGIS3, "geometry st_expand(geometry,float8)", + 20, "gC/f:st_expand", + vlbuf_estimate__st_expand }, + /* + * GpuPreAgg COUNT(distinct KEY) support + */ + { PGSTROM, "int8 hll_hash(int1)", 1, "f:hll_hash_int1" }, + { PGSTROM, "int8 hll_hash(int2)", 1, "f:hll_hash_int2" }, + { 
PGSTROM, "int8 hll_hash(int4)", 1, "f:hll_hash_int4" }, + { PGSTROM, "int8 hll_hash(int8)", 1, "f:hll_hash_int8" }, + { PGSTROM, "int8 hll_hash(numeric)", 1, "f:hll_hash_numeric" }, + { PGSTROM, "int8 hll_hash(date)", 1, "t/f:hll_hash_date" }, + { PGSTROM, "int8 hll_hash(time)", 1, "t/f:hll_hash_time" }, + { PGSTROM, "int8 hll_hash(timetz)", 1, "t/f:hll_hash_timetz" }, + { PGSTROM, "int8 hll_hash(timestamp)", 1, "t/f:hll_hash_timestamp" }, + { PGSTROM, "int8 hll_hash(timestamptz)", 1, "t/f:hll_hash_timestamptz" }, + { PGSTROM, "int8 hll_hash(bpchar)", 1, "s/f:hll_hash_bpchar" }, + { PGSTROM, "int8 hll_hash(text)", 1, "s/f:hll_hash_text" }, + { PGSTROM, "int8 hll_hash(uuid)", 1, "m/f:hll_hash_uuid"} +}; + +/* default of dfunc->devfunc_result_sz if not specified */ +static int +devfunc_generic_result_sz(codegen_context *context, + devfunc_info *dfunc, + Expr **args, int *vl_width) +{ + devtype_info *rtype = dfunc->func_rettype; + + if (rtype->type_length > 0) + return rtype->type_length; + else if (rtype->type_length == -1) + return type_maximum_size(rtype->type_oid, -1); + elog(ERROR, "unexpected type length: %d", rtype->type_length); +} + +static devfunc_info * +__construct_devfunc_info(const char *func_extension, + HeapTuple protup, + devtype_info *dfunc_rettype, + int dfunc_nargs, + devtype_info **dfunc_argtypes, + Oid dfunc_collid, + int func_devcost, + const char *func_template, + devfunc_result_sz_type devfunc_result_sz) +{ + Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(protup); + MemoryContext oldcxt; + devfunc_info *dfunc = NULL; + List *dfunc_args = NIL; + const char *pos; + const char *end; + int32 flags = 0; + int j; + bool has_collation = false; + bool has_callbacks = false; + + /* fetch the attribute flags from the template prefix (a standalone decoding sketch appears below) */ + end = strchr(func_template, '/'); + if (end) + { + for (pos = func_template; pos < end; pos++) + { + switch (*pos) + { + case 'L': + has_collation = true; + break; + case 'C': + has_callbacks = true; + break; + case 'p': + flags |= DEVKERNEL_NEEDS_PRIMITIVE; + break; + case 's': + flags |= DEVKERNEL_NEEDS_TEXTLIB; + break; + case 't': + flags |= DEVKERNEL_NEEDS_TIMELIB; + break; + case 'j': + flags |= DEVKERNEL_NEEDS_JSONLIB; + break; + case 'm': + flags |= DEVKERNEL_NEEDS_MISCLIB; + break; + case 'r': + flags |= DEVKERNEL_NEEDS_RANGETYPE; + break; + case 'g': + flags |= DEVKERNEL_NEEDS_POSTGIS; + break; + default: + elog(NOTICE, + "Bug? unknown devfunc property: %c", + *pos); + break; + } + } + func_template = end + 1; + } + if (strncmp(func_template, "f:", 2) != 0) + { + elog(NOTICE, "Bug? unknown device function template: '%s'", + func_template); + return NULL; + } + oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + for (j=0; j < dfunc_nargs; j++) + dfunc_args = lappend(dfunc_args, dfunc_argtypes[j]); + + dfunc = palloc0(sizeof(devfunc_info)); + if (func_extension) + dfunc->func_extension = pstrdup(func_extension); + dfunc->func_oid = PgProcTupleGetOid(protup); + if (has_collation) + { + if (OidIsValid(dfunc_collid) && !lc_collate_is_c(dfunc_collid)) + dfunc->func_is_negative = true; + dfunc->func_collid = dfunc_collid; + } + dfunc->func_is_strict = proc->proisstrict; + dfunc->func_flags = flags; + dfunc->func_args = dfunc_args; + dfunc->func_rettype = dfunc_rettype; + dfunc->func_sqlname = pstrdup(NameStr(proc->proname)); + dfunc->func_devname = func_template + 2; /* const cstring */ + dfunc->func_devcost = func_devcost; + dfunc->devfunc_result_sz = (has_callbacks + ? 
devfunc_result_sz + : devfunc_generic_result_sz); + /* other fields shall be assigned on the caller side */ + MemoryContextSwitchTo(oldcxt); + + return dfunc; +} + +static devfunc_info * +pgstrom_devfunc_construct_fuzzy(const char *func_extension, + HeapTuple protup, + devtype_info *dfunc_rettype, + int dfunc_nargs, + devtype_info **dfunc_argtypes, + Oid dfunc_collid, + int fuzzy_index_head, + int fuzzy_index_tail) +{ + Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(protup); + char buffer[512]; + int i, j; + + Assert(fuzzy_index_head >= 0 && + fuzzy_index_head <= fuzzy_index_tail && + fuzzy_index_tail < lengthof(devfunc_common_catalog)); + for (i = fuzzy_index_head; i <= fuzzy_index_tail; i++) + { + devfunc_catalog_t *procat = devfunc_common_catalog + i; + devtype_info *dtype; + char *tok; + char *pos; + + if (func_extension) + { + if (!procat->func_extension || + strcmp(procat->func_extension, func_extension) != 0) + continue; + } + else + { + if (procat->func_extension) + continue; + } + strncpy(buffer, procat->func_signature, sizeof(buffer)); + pos = strchr(buffer, ' '); + if (!pos) + continue; + *pos++ = '\0'; + + /* check the function name */ + tok = pos; + pos = strchr(pos, '('); + if (!pos) + continue; + *pos++ = '\0'; + if (strcmp(tok, NameStr(proc->proname)) != 0) + continue; + + /* check the argument types */ + for (j=0; j < dfunc_nargs; j++) + { + tok = pos; + pos = strchr(pos, (j < dfunc_nargs - 1 ? ',' : ')')); + if (!pos) + break; /* not match */ + *pos++ = '\0'; + + dtype = pgstrom_devtype_lookup_by_name(tok); + if (!dtype) + break; /* not match */ + if (dtype->type_oid != dfunc_argtypes[j]->type_oid && + !pgstrom_devtype_can_relabel(dfunc_argtypes[j]->type_oid, + dtype->type_oid)) + break; /* not match */ + } + if (j < dfunc_nargs) + continue; + /* check the result type */ + dtype = pgstrom_devtype_lookup_by_name(buffer); + if (!dtype) + continue; + if (dtype->type_oid != dfunc_rettype->type_oid && + !pgstrom_devtype_can_relabel(dtype->type_oid, + dfunc_rettype->type_oid)) + continue; + + /* Ok, found the fuzzy entry */ + return __construct_devfunc_info(func_extension, + protup, + dfunc_rettype, + dfunc_nargs, + dfunc_argtypes, + dfunc_collid, + procat->func_devcost, + procat->func_template, + procat->devfunc_result_sz); + } + /* not found */ + return NULL; +} + +static devfunc_info * +build_extra_devfunc_info(const char *func_extension, + HeapTuple protup, + devtype_info *dfunc_rettype, + int dfunc_nargs, + devtype_info **dfunc_argtypes, + Oid dfunc_collid) +{ + Form_pg_proc proc_form = (Form_pg_proc) GETSTRUCT(protup); + StringInfoData ident; + devfunc_info __dfunc; + devfunc_info *dfunc = NULL; + List *dfunc_args = NIL; + const char *nsp_name; + int i; + + /* setup devfunc identifier */ + initStringInfo(&ident); + append_string_devtype_identifier(&ident, dfunc_rettype->type_oid); + nsp_name = get_namespace_name(proc_form->pronamespace); + appendStringInfo(&ident, " %s.%s(", + quote_identifier(nsp_name), + quote_identifier(NameStr(proc_form->proname))); + for (i=0; i < dfunc_nargs; i++) + { + devtype_info *dtype = dfunc_argtypes[i]; + + if (i > 0) + appendStringInfoChar(&ident, ','); + append_string_devtype_identifier(&ident, dtype->type_oid); + dfunc_args = lappend(dfunc_args, dtype); + } + appendStringInfoChar(&ident, ')'); + + memset(&__dfunc, 0, sizeof(devfunc_info)); + __dfunc.func_extension = func_extension; + __dfunc.func_oid = PgProcTupleGetOid(protup); + __dfunc.hashvalue = GetSysCacheHashValue(PROCOID, __dfunc.func_oid, 0, 0, 0); + __dfunc.func_collid = 
dfunc_collid; + __dfunc.func_is_strict = proc_form->proisstrict; + __dfunc.func_args = dfunc_args; + __dfunc.func_rettype = dfunc_rettype; + __dfunc.func_sqlname = NameStr(proc_form->proname); + __dfunc.func_devname = NULL; /* callback must set */ + __dfunc.func_devcost = 0; /* callback must set */ + __dfunc.devfunc_result_sz = NULL; /* callback must set, if any */ + + for (i=0; i < pgstrom_num_users_extra; i++) + { + pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; + + if (extra->lookup_extra_devfunc && + extra->lookup_extra_devfunc(ident.data, &__dfunc)) + { + MemoryContext oldcxt; + + /* must be */ + if (!__dfunc.func_devname) + { + elog(DEBUG2, "Extra module didn't set device function name for %s", + format_procedure(__dfunc.func_oid)); + continue; + } + oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + dfunc = palloc0(sizeof(devfunc_info)); + dfunc->func_extension = pstrdup(__dfunc.func_extension); + dfunc->func_oid = __dfunc.func_oid; + dfunc->func_collid = __dfunc.func_collid; + dfunc->func_is_negative = __dfunc.func_is_negative; + dfunc->func_is_strict = __dfunc.func_is_strict; + dfunc->func_flags = __dfunc.func_flags; + dfunc->func_args = list_copy(__dfunc.func_args); + dfunc->func_rettype = __dfunc.func_rettype; + dfunc->func_sqlname = pstrdup(__dfunc.func_sqlname); + dfunc->func_devname = pstrdup(__dfunc.func_devname); + if (__dfunc.devfunc_result_sz) + dfunc->devfunc_result_sz = __dfunc.devfunc_result_sz; + else + dfunc->devfunc_result_sz = devfunc_generic_result_sz; + MemoryContextSwitchTo(oldcxt); + break; + } + } + pfree(ident.data); + return dfunc; +} + +static devfunc_info * +pgstrom_devfunc_construct(HeapTuple protup, + Oid func_rettype, + oidvector *func_argtypes, + Oid func_collid) +{ + Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(protup); + const char *func_extension; + StringInfoData sig; + devtype_info *dtype; + devtype_info *dfunc_rettype; + devtype_info **dfunc_argtypes; + devfunc_info *dfunc = NULL; + int fuzzy_index_head = -1; + int fuzzy_index_tail = -1; + int i; + + /* extension name */ + func_extension = get_extension_name_by_object(ProcedureRelationId, + PgProcTupleGetOid(protup)); + /* make a signature string */ + initStringInfo(&sig); + dfunc_rettype = pgstrom_devtype_lookup(func_rettype); + if (!dfunc_rettype) + goto not_found; + appendStringInfo(&sig, "%s %s(", + dfunc_rettype->type_name, + NameStr(proc->proname)); + + dfunc_argtypes = alloca(sizeof(devtype_info *) * func_argtypes->dim1); + for (i=0; i < func_argtypes->dim1; i++) + { + dtype = pgstrom_devtype_lookup(func_argtypes->values[i]); + if (!dtype) + goto not_found; + if (i > 0) + appendStringInfoChar(&sig, ','); + appendStringInfo(&sig, "%s", dtype->type_name); + dfunc_argtypes[i] = dtype; + } + appendStringInfoChar(&sig, ')'); + + for (i=0; i < lengthof(devfunc_common_catalog); i++) + { + devfunc_catalog_t *procat = &devfunc_common_catalog[i]; + + if (func_extension) + { + if (!procat->func_extension || + strcmp(procat->func_extension, func_extension) != 0) + continue; + } + else if (procat->func_extension) + continue; + + if (strcmp(procat->func_signature, sig.data) == 0) + { + dfunc = __construct_devfunc_info(func_extension, + protup, + dfunc_rettype, + func_argtypes->dim1, + dfunc_argtypes, + func_collid, + procat->func_devcost, + procat->func_template, + procat->devfunc_result_sz); + break; + } + else + { + /* + * In case when function name is identical, but argument list + * does not match exactly. 
(Remember this catalog index range; pgstrom_devfunc_construct_fuzzy() below retries it, allowing binary-compatible type relabeling.) + */ + const char *sname = strchr(procat->func_signature, ' '); + const char *pname = NameStr(proc->proname); + + if (sname) + { + sname++; + while (*sname != '\0' && + *pname != '\0' && + *sname == *pname) + { + sname++; + pname++; + } + if (*sname == '(' && *pname == '\0') + { + if (fuzzy_index_head < 0) + fuzzy_index_head = i; + fuzzy_index_tail = i; + } + } + } + } + /* try invocation with implicit type relabel */ + if (!dfunc && fuzzy_index_head >= 0) + { + dfunc = pgstrom_devfunc_construct_fuzzy(func_extension, + protup, + dfunc_rettype, + func_argtypes->dim1, + dfunc_argtypes, + func_collid, + fuzzy_index_head, + fuzzy_index_tail); + } + /* extra device function, if any */ + if (!dfunc && pgstrom_num_users_extra > 0) + { + dfunc = build_extra_devfunc_info(func_extension, + protup, + dfunc_rettype, + func_argtypes->dim1, + dfunc_argtypes, + func_collid); + } +not_found: + pfree(sig.data); + return dfunc; +} + +static devfunc_info * +__pgstrom_devfunc_lookup(HeapTuple protup, + Oid func_rettype, + oidvector *func_argtypes, + Oid func_collid) +{ + Oid func_oid = PgProcTupleGetOid(protup); + Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(protup); + devfunc_info *dfunc; + devtype_info *dtype; + ListCell *lc; + cl_uint hashvalue; + int j, hindex; + dlist_iter iter; + bool consider_relabel = false; + + hashvalue = GetSysCacheHashValue(PROCOID, func_oid, 0, 0, 0); + hindex = hashvalue % lengthof(devfunc_info_slot); +retry: + dlist_foreach (iter, &devfunc_info_slot[hindex]) + { + dfunc = dlist_container(devfunc_info, chain, iter.cur); + if (dfunc->func_oid != func_oid) + continue; + if (OidIsValid(dfunc->func_collid) && + dfunc->func_collid != func_collid) + continue; + + dtype = dfunc->func_rettype; + if (dtype->type_oid != func_rettype && + (!consider_relabel || + !pgstrom_devtype_can_relabel(dtype->type_oid, func_rettype))) + continue; + + if (list_length(dfunc->func_args) == func_argtypes->dim1) + { + j = 0; + foreach (lc, dfunc->func_args) + { + dtype = lfirst(lc); + if (dtype->type_oid != func_argtypes->values[j] && + (!consider_relabel || + !pgstrom_devtype_can_relabel(func_argtypes->values[j], + dtype->type_oid))) + break; /* not match */ + j++; + } + if (!lc) + { + if (dfunc->func_is_negative) + return NULL; + return dfunc; + } + } + } + if (!consider_relabel) + { + consider_relabel = true; + goto retry; + } + + /* Not cached, construct a new entry of the device function */ + dfunc = pgstrom_devfunc_construct(protup, + func_rettype, + func_argtypes, + func_collid); + /* Not found, so this function should be a negative entry */ + if (!dfunc) + { + MemoryContext oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + + /* dummy devtype_info just for oid checks */ + dfunc = palloc0(sizeof(devfunc_info)); + dfunc->func_oid = func_oid; + dfunc->func_is_negative = true; + for (j=0; j < func_argtypes->dim1; j++) + { + dtype = palloc0(sizeof(devtype_info)); + dtype->type_oid = func_argtypes->values[j]; + dfunc->func_args = lappend(dfunc->func_args, dtype); + } + dtype = palloc0(sizeof(devtype_info)); + dtype->type_oid = func_rettype; + dfunc->func_rettype = dtype; + dfunc->func_sqlname = pstrdup(NameStr(proc->proname)); + + MemoryContextSwitchTo(oldcxt); + } + dfunc->hashvalue = hashvalue; + dlist_push_head(&devfunc_info_slot[hindex], &dfunc->chain); + if (dfunc->func_is_negative) + return NULL; + return dfunc; +} + +devfunc_info * +pgstrom_devfunc_lookup(Oid func_oid, + Oid func_rettype, + List *func_args, /* list of expressions */ + Oid func_collid) +{ + devfunc_info *result = NULL; + HeapTuple 
tup; + + tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(func_oid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for function %u", func_oid); + PG_TRY(); + { + int func_nargs = list_length(func_args); + oidvector *func_argtypes; + int i = 0; + ListCell *lc; + + func_argtypes = alloca(offsetof(oidvector, values[func_nargs])); + func_argtypes->ndim = 1; + func_argtypes->dataoffset = 0; + func_argtypes->elemtype = OIDOID; + func_argtypes->dim1 = func_nargs; + func_argtypes->lbound1 = 0; + foreach (lc, func_args) + { + Oid type_oid = exprType((Node *)lfirst(lc)); + + func_argtypes->values[i++] = type_oid; + } + SET_VARSIZE(func_argtypes, offsetof(oidvector, values[func_nargs])); + + result = __pgstrom_devfunc_lookup(tup, + func_rettype, + func_argtypes, + func_collid); + } + PG_CATCH(); + { + ReleaseSysCache(tup); + PG_RE_THROW(); + } + PG_END_TRY(); + ReleaseSysCache(tup); + + return result; +} + +devfunc_info * +pgstrom_devfunc_lookup_type_equal(devtype_info *dtype, Oid type_collid) +{ + devfunc_info *result = NULL; + char buffer[offsetof(oidvector, values[2])]; + oidvector *func_argtypes = (oidvector *)buffer; + HeapTuple tup; + Form_pg_proc proc __attribute__((unused)); + + if (!OidIsValid(dtype->type_eqfunc)) + return NULL; + tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(dtype->type_eqfunc)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for function %u", + dtype->type_eqfunc); + PG_TRY(); + { + proc = (Form_pg_proc) GETSTRUCT(tup); + Assert(proc->pronargs == 2); + Assert(proc->prorettype == BOOLOID); + + memset(func_argtypes, 0, offsetof(oidvector, values[2])); + func_argtypes->ndim = 1; + func_argtypes->dataoffset = 0; + func_argtypes->elemtype = OIDOID; + func_argtypes->dim1 = 2; + func_argtypes->lbound1 = 0; + func_argtypes->values[0] = dtype->type_oid; + func_argtypes->values[1] = dtype->type_oid; + SET_VARSIZE(func_argtypes, offsetof(oidvector, values[2])); + + result = __pgstrom_devfunc_lookup(tup, + BOOLOID, + func_argtypes, + type_collid); + } + PG_CATCH(); + { + ReleaseSysCache(tup); + PG_RE_THROW(); + } + PG_END_TRY(); + ReleaseSysCache(tup); + + return result; +} + +devfunc_info * +pgstrom_devfunc_lookup_type_compare(devtype_info *dtype, Oid type_collid) +{ + devfunc_info *result = NULL; + char buffer[offsetof(oidvector, values[2])]; + oidvector *func_argtypes = (oidvector *)buffer; + HeapTuple tup; + Form_pg_proc proc __attribute__((unused)); + + if (!OidIsValid(dtype->type_cmpfunc)) + return NULL; + tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(dtype->type_cmpfunc)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for function %u", + dtype->type_cmpfunc); + PG_TRY(); + { + proc = (Form_pg_proc) GETSTRUCT(tup); + Assert(proc->pronargs == 2); + Assert(proc->prorettype == INT4OID); + + memset(func_argtypes, 0, offsetof(oidvector, values[2])); + func_argtypes->ndim = 1; + func_argtypes->dataoffset = 0; + func_argtypes->elemtype = OIDOID; + func_argtypes->dim1 = 2; + func_argtypes->lbound1 = 0; + func_argtypes->values[0] = dtype->type_oid; + func_argtypes->values[1] = dtype->type_oid; + SET_VARSIZE(func_argtypes, offsetof(oidvector, values[2])); + + result = __pgstrom_devfunc_lookup(tup, + INT4OID, + func_argtypes, + type_collid); + } + PG_CATCH(); + { + ReleaseSysCache(tup); + PG_RE_THROW(); + } + PG_END_TRY(); + ReleaseSysCache(tup); + + return result; +} + +void +pgstrom_devfunc_track(codegen_context *context, devfunc_info *dfunc) +{ + devtype_info *dtype = dfunc->func_rettype; + ListCell *lc; + + /* track device 
function */ + context->extra_flags |= (dfunc->func_flags | dtype->type_flags); + foreach (lc, dfunc->func_args) + { + dtype = (devtype_info *) lfirst(lc); + context->extra_flags |= dtype->type_flags; + } +} + +/* + * Device cast support + * + * In some cases, a function can be called with argument types or a result + * type different from its declaration, as long as these types are binary + * compatible. PostgreSQL has no infrastructure to check such invocations; + * it relies on the caller to supply correct data types, and binary-compatible + * types work without any problem. + * On the other hand, CUDA C++ has strict type checks for function invocation, + * so we need to inject a thin type cast device function even if the types are + * binary compatible. + * The thin device function has the following naming convention: + * + * STATIC_INLINE(DESTTYPE) to_DESTTYPE(kcxt, SOURCETYPE) + * + * (A minimal sketch of such a thin cast function appears below.) + * + * We have no SQL function on the host side, because the device function above + * only reflects a binary-compatible type cast. If the cast is + * COERCION_METHOD_FUNCTION, the SQL function shall be used explicitly. + * + * In case of COERCION_METHOD_INOUT, the expression tree has a CoerceViaIO + * node, which involves a pair of heavy operations (cstring output/input). + * Usually this is not supported in the device code, except for a small number + * of exceptions. dcast_coerceviaio_callback allows injecting special-case + * handling that performs the job of CoerceViaIO. + */ +static struct { + Oid src_type_oid; + Oid dst_type_oid; + bool has_domain_checks; + devcast_coerceviaio_callback_f dcast_coerceviaio_callback; +} devcast_catalog[] = { + /* text, varchar, bpchar */ + { TEXTOID, BPCHAROID, false, NULL }, + { TEXTOID, VARCHAROID, false, NULL }, + { VARCHAROID, TEXTOID, false, NULL }, + { VARCHAROID, BPCHAROID, false, NULL }, + /* cidr -> inet, but no reverse type cast */ + { CIDROID, INETOID, false, NULL }, + /* text -> (intX/floatX/numeric), including (jsonb->>'key') reference */ + { TEXTOID, BOOLOID, false, devcast_text2numeric_callback }, + { TEXTOID, INT2OID, false, devcast_text2numeric_callback }, + { TEXTOID, INT4OID, false, devcast_text2numeric_callback }, + { TEXTOID, INT8OID, false, devcast_text2numeric_callback }, + { TEXTOID, FLOAT4OID, false, devcast_text2numeric_callback }, + { TEXTOID, FLOAT8OID, false, devcast_text2numeric_callback }, + { TEXTOID, NUMERICOID, false, devcast_text2numeric_callback }, +}; + +static devcast_info * +build_devcast_info(Oid src_type_oid, Oid dst_type_oid) +{ + devcast_info *dcast = NULL; + devtype_info *dtype_s = NULL; + devtype_info *dtype_d = NULL; + int i; + + dtype_s = pgstrom_devtype_lookup(src_type_oid); + if (!dtype_s) + goto not_found; + dtype_d = pgstrom_devtype_lookup(dst_type_oid); + if (!dtype_d) + goto not_found; + + for (i=0; i < lengthof(devcast_catalog); i++) + { + if (dtype_s->type_oid == devcast_catalog[i].src_type_oid && + dtype_d->type_oid == devcast_catalog[i].dst_type_oid) + { + dcast = MemoryContextAllocZero(devinfo_memcxt, + sizeof(devcast_info)); + dcast->src_type = dtype_s; + dcast->dst_type = dtype_d; + dcast->has_domain_checks = devcast_catalog[i].has_domain_checks; + dcast->dcast_coerceviaio_callback + = devcast_catalog[i].dcast_coerceviaio_callback; + break; + } + } + /* extra type cast */ + if (!dcast) + { + StringInfoData src_ident; + StringInfoData dst_ident; + devcast_info __dcast; + + initStringInfo(&src_ident); + initStringInfo(&dst_ident); + append_string_devtype_identifier(&src_ident, dtype_s->type_oid); + append_string_devtype_identifier(&dst_ident, 
dtype_d->type_oid); + + memset(&__dcast, 0, sizeof(devcast_info)); + __dcast.src_type = dtype_s; + __dcast.dst_type = dtype_d; + __dcast.has_domain_checks = false; /* extra module must set, if any */ + + for (i=0; i < pgstrom_num_users_extra; i++) + { + pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; + + if (extra->lookup_extra_devcast && + extra->lookup_extra_devcast(src_ident.data, + dst_ident.data, + &__dcast)) + { + MemoryContext oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + + dcast = pmemdup(&__dcast, sizeof(devcast_info)); + + MemoryContextSwitchTo(oldcxt); + break; + } + } + pfree(src_ident.data); + pfree(dst_ident.data); + } +not_found: + /* negative entry */ + if (!dcast) + { + MemoryContext oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + + if (!dtype_s) + { + dtype_s = palloc0(sizeof(devtype_info)); + dtype_s->type_oid = src_type_oid; + } + if (!dtype_d) + { + dtype_d = palloc0(sizeof(devtype_info)); + dtype_d->type_oid = dst_type_oid; + } + dcast = palloc0(sizeof(devcast_info)); + dcast->src_type = dtype_s; + dcast->dst_type = dtype_d; + dcast->cast_is_negative = true; + MemoryContextSwitchTo(oldcxt); + } + /* sanity checks */ + if (dcast->has_domain_checks && + dcast->dcast_coerceviaio_callback != NULL) + __ELog("Bug? type cast %s -> %s with domain checks must be binary compatible", + format_type_be(dcast->src_type->type_oid), + format_type_be(dcast->dst_type->type_oid)); + return dcast; +} + +devcast_info * +pgstrom_devcast_lookup(Oid src_type_oid, Oid dst_type_oid) +{ + uint32 hashvalue; + int hindex; + devcast_info *dcast; + dlist_iter iter; + + hashvalue = GetSysCacheHashValue(CASTSOURCETARGET, + src_type_oid, + dst_type_oid, + 0, 0); + hindex = hashvalue % lengthof(devcast_info_slot); + dlist_foreach (iter, &devcast_info_slot[hindex]) + { + dcast = dlist_container(devcast_info, chain, iter.cur); + if (dcast->src_type->type_oid == src_type_oid && + dcast->dst_type->type_oid == dst_type_oid) + { + if (dcast->cast_is_negative) + return NULL; + return dcast; + } + } + /* create a new one */ + dcast = build_devcast_info(src_type_oid, dst_type_oid); + dcast->hashvalue = hashvalue; + dlist_push_head(&devcast_info_slot[hindex], &dcast->chain); + if (dcast->cast_is_negative) + return NULL; + return dcast; +} + +bool +pgstrom_devtype_can_relabel(Oid src_type_oid, + Oid dst_type_oid) +{ + devcast_info *dcast; + + dcast = pgstrom_devcast_lookup(src_type_oid, dst_type_oid); + if (dcast && dcast->dcast_coerceviaio_callback == NULL) + return true; + + return false; +} + +/* + * Device index support + * + * device index handler must be declared as: + * + * DEVICE_FUNCTION(cl_bool) + * pgindex_<INDEX_FNAME>(kern_context *cxt, + * PageHeaderData *i_page, + * <IVAR_TYPE> arg1, + * <IARG_TYPE> arg2); + */ +static struct { + const char *extname; + const char *signature; + const char *index_kind; + int opstrategy; + const char *index_fname; + const char *ivar_typname; + const char *iarg_typname; +} devindex_catalog[] = { + /* geometry overlap operator */ + { POSTGIS3, "geometry && geometry", + "gist", RTOverlapStrategyNumber, + "gist_geometry_overlap", + "box2df@postgis", + "geometry@postgis", + }, + { POSTGIS3, "box2df && geometry", + "gist", RTOverlapStrategyNumber, + "gist_geometry_overlap", + "box2df@postgis", + "geometry@postgis", + }, + { POSTGIS3, "geometry && box2df", + "gist", RTOverlapStrategyNumber, + "gist_box2df_overlap", + "box2df@postgis", + "box2df@postgis", + }, + { POSTGIS3, "box2df && box2df", + "gist", RTOverlapStrategyNumber, + "gist_box2df_overlap", + "box2df@postgis", + 
"box2df@postgis", + }, + /* geometry contains operator */ + { POSTGIS3, "geometry ~ geometry", + "gist", RTContainsStrategyNumber, + "gist_geometry_contains", + "box2df@postgis", + "geometry@postgis", + }, + { POSTGIS3, "box2df ~ geometry", + "gist", RTContainsStrategyNumber, + "gist_geometry_contains", + "box2df@postgis", + "geometry@postgis", + }, + { POSTGIS3, "geometry ~ box2df", + "gist", RTContainsStrategyNumber, + "gist_box2df_contains", + "box2df@postgis", + "box2df@postgis", + }, + { POSTGIS3, "box2df ~ box2df", + "gist", RTContainsStrategyNumber, + "gist_box2df_contains", + "box2df@postgis", + "box2df@postgis", + }, + /* geometry contained operator */ + { POSTGIS3, "geometry @ geometry", + "gist", RTContainedByStrategyNumber, + "gist_geometry_contained", + "box2df@postgis", + "geometry@postgis", + }, + { POSTGIS3, "box2df @ geometry", + "gist", RTContainedByStrategyNumber, + "gist_geometry_contained", + "box2df@postgis", + "geometry@postgis", + }, + { POSTGIS3, "geometry @ box2df", + "gist", RTContainedByStrategyNumber, + "gist_box2df_contained", + "box2df@postgis", + "box2df@postgis", + }, + { POSTGIS3, "box2df @ box2df", + "gist", RTContainedByStrategyNumber, + "gist_box2df_contained", + "box2df@postgis", + "box2df@postgis", + }, +}; + +devindex_info * +pgstrom_devindex_lookup(Oid opcode, Oid opfamily) +{ + devindex_info *dindex = NULL; + uint32 hashvalue; + uint32 hindex; + HeapTuple htup; + Form_pg_amop amop; + dlist_iter iter; + const char *extname; + char signature[3*NAMEDATALEN + 100]; + int i; + + hashvalue = GetSysCacheHashValue(AMOPOPID, + ObjectIdGetDatum(opcode), + CharGetDatum(AMOP_SEARCH), + ObjectIdGetDatum(opfamily), 0); + hindex = hashvalue % lengthof(devindex_info_slot); + dlist_foreach(iter, &devindex_info_slot[hindex]) + { + dindex = dlist_container(devindex_info, chain, iter.cur); + if (dindex->opcode == opcode && + dindex->opfamily == opfamily) + goto found; + } + + extname = get_extension_name_by_object(OperatorRelationId, opcode); + htup = SearchSysCache3(AMOPOPID, + ObjectIdGetDatum(opcode), + CharGetDatum(AMOP_SEARCH), + ObjectIdGetDatum(opfamily)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "operator %u is not a member of opfamily %u", + opcode, opfamily); + amop = (Form_pg_amop) GETSTRUCT(htup); + snprintf(signature, sizeof(signature), "%s %s %s", + get_type_name(amop->amoplefttype, false), + get_opname(opcode), + get_type_name(amop->amoprighttype, false)); + + dindex = NULL; + for (i=0; i < lengthof(devindex_catalog); i++) + { + const char *__extname = devindex_catalog[i].extname; + const char *__signature = devindex_catalog[i].signature; + const char *__ivar_typname = devindex_catalog[i].ivar_typname; + const char *__iarg_typname = devindex_catalog[i].iarg_typname; + devtype_info *ivar_dtype; + devtype_info *iarg_dtype; + + if (__extname) + { + if (!extname || strcmp(__extname, extname) != 0) + continue; + } + else if (extname != NULL) + continue; + + if (strcmp(__signature, signature) != 0) + continue; + + ivar_dtype = pgstrom_devtype_lookup_by_name(__ivar_typname); + if (!ivar_dtype) + continue; + iarg_dtype = pgstrom_devtype_lookup_by_name(__iarg_typname); + if (!iarg_dtype) + continue; + + dindex = MemoryContextAllocZero(devinfo_memcxt, sizeof(devindex_info)); + dindex->oper_extension = extname; + dindex->opcode = opcode; + dindex->opfamily = opfamily; + dindex->opstrategy = amop->amopstrategy; + dindex->index_kind = devindex_catalog[i].index_kind; + dindex->index_fname = devindex_catalog[i].index_fname; + dindex->ivar_dtype = ivar_dtype; + 
dindex->iarg_dtype = iarg_dtype; + break; + } + + //TODO: call extra module + + /* not supported, add negative entry */ + if (!dindex) + { + dindex = MemoryContextAllocZero(devinfo_memcxt, sizeof(devindex_info)); + dindex->oper_extension = extname; + dindex->opcode = opcode; + dindex->opfamily = opfamily; + dindex->opstrategy = amop->amopstrategy; + dindex->index_is_negative = true; + } + ReleaseSysCache(htup); + + dindex->hashvalue = hashvalue; + dlist_push_head(&devindex_info_slot[hindex], &dindex->chain); +found: + if (dindex->index_is_negative) + return NULL; + return dindex; +} + +/* + * codegen_expression_walker - main logic of run-time code generator + */ +static void codegen_expression_walker(codegen_context *context, + StringInfo body, + Node *node, int *p_varlena_sz); + +static Node *__codegen_current_node = NULL; +static void +__appendStringInfo(StringInfo str, const char *fmt,...) + pg_attribute_printf(2, 3); + +static void +__appendStringInfo(StringInfo str, const char *fmt,...) +{ + int save_errno = errno; + + if (!str) + return; + for (;;) + { + va_list va_args; + int needed; + + errno = save_errno; + va_start(va_args, fmt); + needed = appendStringInfoVA(str, fmt, va_args); + va_end(va_args); + + if (needed == 0) + break; + enlargeStringInfo(str, needed); + } +} + +static inline void +__appendStringInfoChar(StringInfo str, char c) +{ + if (str) + appendStringInfoChar(str, c); +} + +static int +codegen_const_expression(codegen_context *context, + StringInfo body, + Const *con) +{ + devtype_info *dtype; + cl_int index; + cl_int width; + + dtype = pgstrom_devtype_lookup_and_track(con->consttype, context); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(con->consttype)); + context->used_params = lappend(context->used_params, + copyObject(con)); + index = list_length(context->used_params) - 1; + + __appendStringInfo(body, + "pg_%s_param(kcxt,%d)", + dtype->type_name, index); + if (con->constisnull) + width = 0; + else if (con->constlen > 0) + width = con->constlen; + else if (con->constlen == -1) + width = VARSIZE_ANY_EXHDR(con->constvalue); + else + elog(ERROR, "unexpected type length: %d", con->constlen); + return width; +} + +static int +codegen_param_expression(codegen_context *context, + StringInfo body, + Param *param) +{ + devtype_info *dtype; + ListCell *lc; + int index = 0; + int width; + + if (param->paramkind != PARAM_EXTERN) + __ELog("ParamKind is not PARAM_EXTERN: %d", + (int)param->paramkind); + + dtype = pgstrom_devtype_lookup_and_track(param->paramtype, context); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(param->paramtype)); + + foreach (lc, context->used_params) + { + if (equal(param, lfirst(lc))) + goto found; + index++; + } + context->used_params = lappend(context->used_params, + copyObject(param)); + index = list_length(context->used_params) - 1; + +found: + __appendStringInfo(body, + "pg_%s_param(kcxt,%d)", + dtype->type_name, index); + if (dtype->type_length > 0) + width = dtype->type_length; + else if (dtype->type_length == -1) + width = type_maximum_size(param->paramtype, + param->paramtypmod) - VARHDRSZ; + else + elog(ERROR, "unexpected type length: %d", dtype->type_length); + + return width; +} + +static int +codegen_varnode_expression(codegen_context *context, + StringInfo body, Var *var) +{ + AttrNumber varattno = var->varattno; + devtype_info *dtype; + ListCell *lc; + int width; + + dtype = pgstrom_devtype_lookup_and_track(var->vartype, context); + if (!dtype) + __ELog("type %s is not device 
supported", + format_type_be(var->vartype)); + /* + * NOTE: Expression tree at the path-construction time can contain + * references to other tables; which can be eventually replaced by + * replace_nestloop_params(). So, this Var-node shall not be visible + * when we generate the device code. + * We may be able to handle the check well, however, we simply + * prohibit the Var-node which references out of the current scope + * of the relations. + * + * If var->varno == INDEX_VAR, it is obvious that caller is + * responsible to build custom_scan_tlist with adequate source. + */ + if (context->baserel && !IS_SPECIAL_VARNO(var->varno)) + { + RelOptInfo *baserel = context->baserel; + + if (!bms_is_member(var->varno, baserel->relids)) + elog(ERROR, "Var (varno=%d) referred out of expected range %s", + var->varno, bms_to_cstring(baserel->relids)); + } + + /* + * Fixup varattno when pseudo-scan tlist exists, because varattno + * shall be adjusted on setrefs.c, so we have to adjust variable + * name according to the expected attribute number is kernel- + * source shall be constructed prior to setrefs.c / subselect.c + */ + if (context->pseudo_tlist != NIL) + { + foreach (lc, context->pseudo_tlist) + { + TargetEntry *tle = lfirst(lc); + Var *ptv = (Var *) tle->expr; + + if (!IsA(tle->expr, Var) || + ptv->varno != var->varno || + ptv->varattno != var->varattno || + ptv->varlevelsup != var->varlevelsup) + continue; + + varattno = tle->resno; + break; + } + if (!lc) + elog(ERROR, "failed on map Var (%s) on ps_tlist: %s", + nodeToString(var), + nodeToString(context->pseudo_tlist)); + } + if (varattno < 0) + __appendStringInfo(body, "KVAR_S%u", -varattno); + else + __appendStringInfo(body, "KVAR_%u", varattno); + if (!list_member(context->used_vars, var)) + context->used_vars = lappend(context->used_vars, + copyObject(var)); + if (dtype->type_length >= 0) + width = dtype->type_length; + else + width = type_maximum_size(var->vartype, + var->vartypmod) - VARHDRSZ; + return width; +} + +static int +codegen_function_expression(codegen_context *context, + StringInfo body, + devfunc_info *dfunc, List *args) +{ + ListCell *lc1, *lc2; + Expr **fn_args = alloca(sizeof(Expr *) * list_length(args)); + int *vl_width = alloca(sizeof(int) * list_length(args)); + int index = 0; + + __appendStringInfo(body, + "pgfn_%s(kcxt", + dfunc->func_devname); + forboth (lc1, dfunc->func_args, + lc2, args) + { + devtype_info *dtype = lfirst(lc1); + Node *expr = lfirst(lc2); + Oid expr_type_oid = exprType(expr); + + __appendStringInfo(body, ", "); + + if (dtype->type_oid == expr_type_oid) + codegen_expression_walker(context, body, expr, &vl_width[index]); + else if (pgstrom_devtype_can_relabel(expr_type_oid, + dtype->type_oid)) + { + /* + * NOTE: PostgreSQL may pass binary compatible arguments + * without explicit RelabelType, like varchar(N) values + * onto text arguments. + * It is quite right implementation from the PostgreSQL + * function invocation API, however, unable to describe + * the relevant device code, because CUDA C++ has strict + * type checks. So, we have to inject an explicit type + * relabel in this case. + */ + __appendStringInfo(body, "to_%s(", dtype->type_name); + codegen_expression_walker(context, body, expr, &vl_width[index]); + __appendStringInfoChar(body, ')'); + } + else + { + __ELog("Bug? 
unsupported implicit type cast (%s)->(%s)", + format_type_be(expr_type_oid), + format_type_be(dtype->type_oid)); + } + fn_args[index++] = (Expr *)expr; + } + __appendStringInfoChar(body, ')'); + /* estimation of function result width */ + return dfunc->devfunc_result_sz(context, dfunc, fn_args, vl_width); +} + +static int +codegen_nulltest_expression(codegen_context *context, + StringInfo body, + NullTest *nulltest) +{ + devtype_info *dtype; + Oid typeoid = exprType((Node *)nulltest->arg); + + if (nulltest->argisrow) + __ELog("NullTest towards RECORD data"); + + dtype = pgstrom_devtype_lookup_and_track(typeoid, context); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(typeoid)); + switch (nulltest->nulltesttype) + { + case IS_NULL: + __appendStringInfo(body, "PG_ISNULL"); + break; + case IS_NOT_NULL: + __appendStringInfo(body, "PG_ISNOTNULL"); + break; + default: + elog(ERROR, "unknown NullTestType: %d", + (int)nulltest->nulltesttype); + } + __appendStringInfo(body, "(kcxt, "); + codegen_expression_walker(context, body, (Node *) nulltest->arg, NULL); + __appendStringInfoChar(body, ')'); + context->devcost += 1; + + return sizeof(cl_bool); +} + +static int +codegen_booleantest_expression(codegen_context *context, + StringInfo body, + BooleanTest *booltest) +{ + const char *func_name; + + if (exprType((Node *)booltest->arg) != BOOLOID) + elog(ERROR, "argument type of BooleanTest is not bool"); + + /* choose one of built-in functions */ + switch (booltest->booltesttype) + { + case IS_TRUE: + func_name = "bool_is_true"; + break; + case IS_NOT_TRUE: + func_name = "bool_is_not_true"; + break; + case IS_FALSE: + func_name = "bool_is_false"; + break; + case IS_NOT_FALSE: + func_name = "bool_is_not_false"; + break; + case IS_UNKNOWN: + func_name = "bool_is_unknown"; + break; + case IS_NOT_UNKNOWN: + func_name = "bool_is_not_unknown"; + break; + default: + elog(ERROR, "unknown BoolTestType: %d", + (int)booltest->booltesttype); + break; + } + __appendStringInfo(body, "pgfn_%s(kcxt, ", func_name); + codegen_expression_walker(context, body, + (Node *) booltest->arg, NULL); + __appendStringInfoChar(body, ')'); + context->devcost += 1; + + return sizeof(cl_bool); +} + +static int +codegen_bool_expression(codegen_context *context, + StringInfo body, BoolExpr *b) +{ + Node *node; + + if (b->boolop == NOT_EXPR) + { + Assert(list_length(b->args) == 1); + node = linitial(b->args); + + __appendStringInfo(body, "NOT("); + codegen_expression_walker(context, body, node, NULL); + __appendStringInfoChar(body, ')'); + } + else if (b->boolop == AND_EXPR || + b->boolop == OR_EXPR) + { + StringInfoData temp; + List *used_vars_saved; + ListCell *lc; + + initStringInfo(&temp); + + used_vars_saved = context->used_vars; + context->used_vars = NIL; + foreach (lc, b->args) + { + Node *node = lfirst(lc); + + if (lc != list_head(b->args)) + __appendStringInfo(&temp, " has_null |= status.isnull;\n"); + __appendStringInfo(&temp, + " status = "); + codegen_expression_walker(context, &temp, node, NULL); + __appendStringInfo(&temp, ";\n" + " if (PG_BOOL_%s(status))\n" + " return status;\n", + (b->boolop == AND_EXPR ? 
"ISFALSE" : "ISTRUE")); + } + context->decl_count++; + __appendStringInfo( + &context->decl, + "DEVICE_INLINE(pg_bool_t)\n" + "__exprBoolOp_%u(kern_context *kcxt", + context->decl_count); + __appendStringInfo(body, + "__exprBoolOp_%u(kcxt", context->decl_count); + foreach (lc, context->used_vars) + { + devtype_info *dtype; + Var *var = lfirst(lc); + + dtype = pgstrom_devtype_lookup(var->vartype); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(var->vartype)); + __appendStringInfo( + &context->decl, + ", pg_%s_t &", + dtype->type_name); + codegen_expression_walker(context, + &context->decl, + (Node *)var, NULL); + __appendStringInfo(body, ", "); + codegen_expression_walker(context, body, (Node *)var, NULL); + + if (!list_member(used_vars_saved, var)) + used_vars_saved = lappend(used_vars_saved, var); + } + __appendStringInfo( + &context->decl, + ")\n" + "{\n" + " pg_bool_t status __attribute__((unused));\n" + " cl_bool has_null = false;\n" + "\n" + "%s" + " status.isnull |= has_null;\n" + " return status;\n" + "}\n\n", + temp.data); + __appendStringInfo(body, ")"); + context->used_vars = used_vars_saved; + + pfree(temp.data); + } + else + { + elog(ERROR, "unknown BoolExprType: %d", (int) b->boolop); + } + context->devcost += list_length(b->args); + return sizeof(cl_bool); +} + +static int +codegen_coalesce_expression(codegen_context *context, + StringInfo body, + CoalesceExpr *coalesce) +{ + devtype_info *dtype; + StringInfoData temp; + List *used_vars_saved; + ListCell *lc; + int maxlen = 0; + + initStringInfo(&temp); + dtype = pgstrom_devtype_lookup(coalesce->coalescetype); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(coalesce->coalescetype)); + + used_vars_saved = context->used_vars; + context->used_vars = NIL; + foreach (lc, coalesce->args) + { + Node *expr = lfirst(lc); + Oid type_oid = exprType(expr); + int width; + + if (dtype->type_oid != type_oid) + __ELog("device type mismatch in COALESCE: %s / %s", + format_type_be(dtype->type_oid), + format_type_be(type_oid)); + __appendStringInfo(&temp, + " retval = "); + codegen_expression_walker(context, &temp, expr, &width); + __appendStringInfo(&temp, + ";\n" + " if (!retval.isnull)\n" + " return retval;\n"); + if (width < 0) + maxlen = -1; + else if (maxlen >= 0) + maxlen = Max(maxlen, width); + context->devcost += 1; + } + + context->decl_count++; + __appendStringInfo( + &context->decl, + "DEVICE_INLINE(pg_%s_t)\n" + "__exprCoalesce_%u(kern_context *kcxt", + dtype->type_name, + context->decl_count); + __appendStringInfo( + body, + "__exprCoalesce_%u(kcxt", + context->decl_count); + + foreach (lc, context->used_vars) + { + devtype_info *__dtype; + Var *var = lfirst(lc); + + __dtype = pgstrom_devtype_lookup(var->vartype); + if (!__dtype) + __ELog("type %s is not device supported", + format_type_be(var->vartype)); + __appendStringInfo( + &context->decl, + ", pg_%s_t &", + __dtype->type_name); + codegen_expression_walker(context, + &context->decl, + (Node *)var, NULL); + __appendStringInfo(body, ", "); + codegen_expression_walker(context, body, (Node *)var, NULL); + + if (!list_member(used_vars_saved, var)) + used_vars_saved = lappend(used_vars_saved, var); + } + __appendStringInfo( + &context->decl, + ")\n" + "{\n" + " pg_%s_t retval __attribute__((unused));\n" + "\n" + " retval.isnull = true;\n" + "%s" + " return retval;\n" + "}\n\n", + dtype->type_name, + temp.data); + __appendStringInfo(body, ")"); + context->used_vars = used_vars_saved; + + pfree(temp.data); + + return maxlen; +} + 
+static int +codegen_minmax_expression(codegen_context *context, + StringInfo body, + MinMaxExpr *minmax) +{ + devtype_info *dtype; + devfunc_info *dfunc; + List *used_vars_saved; + ListCell *lc; + StringInfoData temp; + int maxlen = 0; + + dtype = pgstrom_devtype_lookup(minmax->minmaxtype); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(minmax->minmaxtype)); + context->extra_flags |= dtype->type_flags; + + dfunc = pgstrom_devfunc_lookup_type_compare(dtype, minmax->inputcollid); + if (!dfunc) + __ELog("device type %s has no comparison operator", + format_type_be(minmax->minmaxtype)); + context->extra_flags |= dfunc->func_flags; + + initStringInfo(&temp); + used_vars_saved = context->used_vars; + context->used_vars = NIL; + foreach (lc, minmax->args) + { + Node *expr = lfirst(lc); + Oid type_oid = exprType(expr); + int width; + + if (dtype->type_oid != type_oid) + __ELog("device type mismatch in LEAST/GREATEST: %s / %s", + format_type_be(dtype->type_oid), + format_type_be(exprType(expr))); + if (lc == list_head(minmax->args)) + __appendStringInfo(&temp, " r = "); + else + __appendStringInfo(&temp, " x = "); + codegen_expression_walker(context, &temp, expr, &width); + __appendStringInfo(&temp, ";\n"); + + if (lc != list_head(minmax->args)) + { + __appendStringInfo( + &temp, + " if (r.isnull)\n" + " r = x;\n" + " else if (!x.isnull && PG_%s_THAN(pgfn_%s(kcxt, x, r)))\n" + " r = x;\n", + minmax->op == IS_GREATEST ? "GREATER" : "LESS", + dfunc->func_devname); + } + if (width < 0) + maxlen = -1; + else if (maxlen >= 0) + maxlen = Max(maxlen, width); + context->devcost += 1; + } + + context->decl_count++; + __appendStringInfo( + &context->decl, + "DEVICE_INLINE(pg_%s_t)\n" + "__exprMinMax_%u(kern_context *kcxt", + dtype->type_name, + context->decl_count); + __appendStringInfo( + body, + "__exprMinMax_%u(kcxt", + context->decl_count); + + foreach (lc, context->used_vars) + { + devtype_info *__dtype; + Var *var = lfirst(lc); + + __dtype = pgstrom_devtype_lookup(var->vartype); + if (!__dtype) + __ELog("type %s is not device supported", + format_type_be(var->vartype)); + __appendStringInfo( + &context->decl, + ", pg_%s_t &", + __dtype->type_name); + codegen_expression_walker(context, + &context->decl, + (Node *)var, NULL); + __appendStringInfo(body, ", "); + codegen_expression_walker(context, body, (Node *)var, NULL); + + if (!list_member(used_vars_saved, var)) + used_vars_saved = lappend(used_vars_saved, var); + } + __appendStringInfo( + &context->decl, + ")\n" + "{\n" + " pg_%s_t r, x __attribute__((unused));\n" + " pg_int4_t cmp __attribute__((unused));\n" + "\n" + "%s" + " return r;\n" + "}\n\n", + dtype->type_name, + temp.data); + __appendStringInfo(body, ")"); + context->used_vars = used_vars_saved; + + pfree(temp.data); + + return maxlen; +} + +static int +codegen_relabel_expression(codegen_context *context, + StringInfo body, + RelabelType *relabel) +{ + devtype_info *dtype; + Oid stype_oid = exprType((Node *)relabel->arg); + int width; + + dtype = pgstrom_devtype_lookup_and_track(stype_oid, context); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(stype_oid)); + + dtype = pgstrom_devtype_lookup_and_track(relabel->resulttype, context); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(relabel->resulttype)); + if (!pgstrom_devtype_can_relabel(stype_oid, dtype->type_oid)) + __ELog("type %s->%s cannot be relabeled on device", + format_type_be(stype_oid), + format_type_be(relabel->resulttype)); + + 
__appendStringInfo(body, "to_%s(", dtype->type_name); + codegen_expression_walker(context, body, (Node *)relabel->arg, &width); + __appendStringInfoChar(body, ')'); + + return width; +} + +static int +codegen_coerceviaio_expression(codegen_context *context, + StringInfo body, + CoerceViaIO *coerce) +{ + devcast_info *dcast; + Oid stype_oid = exprType((Node *)coerce->arg); + Oid dtype_oid = coerce->resulttype; + + dcast = pgstrom_devcast_lookup(stype_oid, dtype_oid); + if (!dcast || dcast->dcast_coerceviaio_callback == NULL) + __ELog("no device support of coerceviaio (%s -> %s)", + format_type_be(stype_oid), + format_type_be(dtype_oid)); + context->devcost += 8; /* just a rough estimation */ + + return dcast->dcast_coerceviaio_callback(context, body, dcast, coerce); +} + +static int +codegen_coercetodomain_expression(codegen_context *context, + StringInfo body, + CoerceToDomain *coerce_d) +{ + devcast_info *dcast; + Oid stype_oid = exprType((Node *)coerce_d->arg); + Oid dtype_oid = coerce_d->resulttype; + int width; + + dcast = pgstrom_devcast_lookup(stype_oid, dtype_oid); + if (!dcast || dcast->dcast_coerceviaio_callback != NULL) + __ELog("type cast (%s -> %s) is not binary compatible", + format_type_be(stype_oid), + format_type_be(dtype_oid)); + if (!dcast->has_domain_checks) + __ELog("type cast (%s -> %s) has no domain constraint", + format_type_be(stype_oid), + format_type_be(dtype_oid)); + __appendStringInfo(body, "to_%s_domain(kcxt,", + dcast->dst_type->type_name); + codegen_expression_walker(context, body, (Node *)coerce_d->arg, &width); + __appendStringInfoChar(body, ')'); + + return width; +} + +static int +codegen_casewhen_expression(codegen_context *context, + StringInfo body, + CaseExpr *caseexpr) +{ + devtype_info *rtype; /* result type */ + devtype_info *dtype; + StringInfoData temp; + Node *defresult; + List *used_vars_saved; + ListCell *lc; + Oid type_oid; + int width, maxlen = 0; + + /* check result type */ + rtype = pgstrom_devtype_lookup(caseexpr->casetype); + if (!rtype) + __ELog("type %s is not device supported", + format_type_be(caseexpr->casetype)); + if (caseexpr->defresult) + defresult = (Node *)caseexpr->defresult; + else + { + defresult = (Node *)makeConst(rtype->type_oid, + -1, + InvalidOid, + rtype->type_length, + 0UL, + true, /* NULL */ + rtype->type_byval); + } + + initStringInfo(&temp); + used_vars_saved = context->used_vars; + context->used_vars = NIL; + if (caseexpr->arg) + { + /* type compare function internally used */ + type_oid = exprType((Node *) caseexpr->arg); + dtype = pgstrom_devtype_lookup(type_oid); + if (!dtype) + __ELog("type %s is not device supported", + format_type_be(type_oid)); + + __appendStringInfo(&temp, " pg_%s_t CARG = ", dtype->type_name); + codegen_expression_walker(context, &temp, + (Node *)caseexpr->arg, NULL); + __appendStringInfo(&temp, ";\n\n"); + } + + foreach (lc, caseexpr->args) + { + CaseWhen *casewhen = (CaseWhen *) lfirst(lc); + Expr *expr = casewhen->expr; + + Assert(IsA(casewhen, CaseWhen) && + exprType((Node *)expr) == BOOLOID && + exprType((Node *)casewhen->result) == rtype->type_oid); + __appendStringInfo(&temp, " if (EVAL("); + codegen_expression_walker(context, &temp, (Node *)expr, NULL); + __appendStringInfo(&temp, "))\n" " return "); + codegen_expression_walker(context, &temp, + (Node *)casewhen->result, + &width); + __appendStringInfo(&temp, ";\n"); + if (width < 0) + maxlen = -1; + else if (maxlen >= 0) + maxlen = Max(maxlen, width); + context->devcost += 1; + } + __appendStringInfo(&temp, " return "); + 
codegen_expression_walker(context, &temp, defresult, NULL); + __appendStringInfo(&temp, ";\n"); + + context->decl_count++; + __appendStringInfo( + &context->decl, + "DEVICE_INLINE(pg_%s_t)\n" + "__exprCaseWhen_%u(kern_context *kcxt", + rtype->type_name, + context->decl_count); + __appendStringInfo( + body, + "__exprCaseWhen_%u(kcxt", + context->decl_count); + + foreach (lc, context->used_vars) + { + devtype_info *__dtype; + Var *var = lfirst(lc); + + __dtype = pgstrom_devtype_lookup(var->vartype); + if (!__dtype) + __ELog("type %s is not device supported", + format_type_be(var->vartype)); + __appendStringInfo( + &context->decl, + ", pg_%s_t &", + __dtype->type_name); + codegen_expression_walker(context, + &context->decl, + (Node *)var, NULL); + __appendStringInfo(body, ", "); + codegen_expression_walker(context, body, (Node *)var, NULL); + + if (!list_member(used_vars_saved, var)) + used_vars_saved = lappend(used_vars_saved, var); + } + __appendStringInfo( + &context->decl, + ")\n" + "{\n" + "%s" + "}\n\n", + temp.data); + __appendStringInfo(body, ")"); + context->used_vars = used_vars_saved; + + pfree(temp.data); + + return maxlen; +} + +static int +codegen_casetest_expression(codegen_context *context, + StringInfo body, + CaseTestExpr *ctest) +{ + __appendStringInfo(body, "CARG"); + return 0; +} + +static int +codegen_scalar_array_op_expression(codegen_context *context, + StringInfo body, + ScalarArrayOpExpr *opexpr) +{ + devfunc_info *dfunc; + devtype_info *dtype_s; + devtype_info *dtype_a; + devtype_info *dtype_e; + Node *node_s; + Node *node_a; + HeapTuple fn_tup; + oidvector *fn_argtypes = alloca(offsetof(oidvector, values[2])); + + Assert(list_length(opexpr->args) == 2); + node_s = linitial(opexpr->args); + node_a = lsecond(opexpr->args); + dtype_s = pgstrom_devtype_lookup_and_track(exprType(node_s), context); + if (!dtype_s) + __ELog("type %s is not device supported", + format_type_be(exprType(node_s))); + dtype_a = pgstrom_devtype_lookup_and_track(exprType(node_a), context); + if (!dtype_a) + __ELog("type %s is not device supported", + format_type_be(exprType(node_a))); + dtype_e = dtype_a->type_element; + if (!dtype_e) + __ELog("type %s is not an array data type", + format_type_be(exprType(node_a))); + + /* lookup operator function */ + fn_tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(opexpr->opfuncid)); + if (!HeapTupleIsValid(fn_tup)) + elog(ERROR, "cache lookup failed for function %u", opexpr->opfuncid); + PG_TRY(); + { + memset(fn_argtypes, 0, offsetof(oidvector, values[2])); + fn_argtypes->ndim = 1; + fn_argtypes->dataoffset = 0; + fn_argtypes->elemtype = OIDOID; + fn_argtypes->dim1 = 2; + fn_argtypes->lbound1 = 0; + fn_argtypes->values[0] = dtype_s->type_oid; + fn_argtypes->values[1] = dtype_e->type_oid; + SET_VARSIZE(fn_argtypes, offsetof(oidvector, values[2])); + + dfunc = __pgstrom_devfunc_lookup(fn_tup, + BOOLOID, + fn_argtypes, + opexpr->inputcollid); + if (!dfunc) + __ELog("function %s is not device supported", + format_procedure(opexpr->opfuncid)); + pgstrom_devfunc_track(context, dfunc); + } + PG_CATCH(); + { + ReleaseSysCache(fn_tup); + PG_RE_THROW(); + } + PG_END_TRY(); + ReleaseSysCache(fn_tup); + + __appendStringInfo(body, + "PG_SCALAR_ARRAY_OP(kcxt, pgfn_%s, ", + dfunc->func_devname); + codegen_expression_walker(context, body, node_s, NULL); + __appendStringInfo(body, ", "); + codegen_expression_walker(context, body, node_a, NULL); + __appendStringInfo(body, ", %s, %d, %d)", + opexpr->useOr ? 
"true" : "false", + dtype_e->type_length, + dtype_e->type_align); + /* + * Cost for PG_SCALAR_ARRAY_OP - It repeats on number of invocation + * of the operator function for each array elements. Tentatively, + * we assume one array has 32 elements in average. + */ + context->devcost += 32 * dfunc->func_devcost; + + return sizeof(cl_bool); +} + +static void +codegen_expression_walker(codegen_context *context, + StringInfo body, + Node *node, int *p_width) +{ + devfunc_info *dfunc; + int width = 0; + Node *__codegen_saved_node; + + if (node == NULL) + return; + /* save the current node for error message */ + __codegen_saved_node = __codegen_current_node; + __codegen_current_node = node; + + switch (nodeTag(node)) + { + case T_Const: + width = codegen_const_expression(context, body, (Const *) node); + break; + + case T_Param: + width = codegen_param_expression(context, body, (Param *) node); + break; + + case T_Var: + width = codegen_varnode_expression(context, body, (Var *) node); + break; + + case T_FuncExpr: + { + FuncExpr *func = (FuncExpr *) node; + + dfunc = pgstrom_devfunc_lookup(func->funcid, + func->funcresulttype, + func->args, + func->inputcollid); + if (!dfunc) + __ELog("function %s is not device supported", + format_procedure(func->funcid)); + pgstrom_devfunc_track(context, dfunc); + width = codegen_function_expression(context, + body, + dfunc, + func->args); + context->devcost += dfunc->func_devcost; + } + break; + + case T_OpExpr: + case T_DistinctExpr: + { + OpExpr *op = (OpExpr *) node; + Oid func_oid = get_opcode(op->opno); + + dfunc = pgstrom_devfunc_lookup(func_oid, + op->opresulttype, + op->args, + op->inputcollid); + if (!dfunc) + __ELog("function %s is not device supported", + format_procedure(func_oid)); + pgstrom_devfunc_track(context, dfunc); + width = codegen_function_expression(context, + body, + dfunc, + op->args); + context->devcost += dfunc->func_devcost; + } + break; + + case T_NullTest: + width = codegen_nulltest_expression(context, + body, + (NullTest *) node); + break; + + case T_BooleanTest: + width = codegen_booleantest_expression(context, + body, + (BooleanTest *) node); + break; + + case T_BoolExpr: + width = codegen_bool_expression(context, + body, + (BoolExpr *) node); + break; + + case T_CoalesceExpr: + width = codegen_coalesce_expression(context, + body, + (CoalesceExpr *) node); + break; + + case T_MinMaxExpr: + width = codegen_minmax_expression(context, + body, + (MinMaxExpr *) node); + break; + + case T_RelabelType: + width = codegen_relabel_expression(context, + body, + (RelabelType *) node); + break; + + case T_CoerceViaIO: + width = codegen_coerceviaio_expression(context, + body, + (CoerceViaIO *) node); + break; + + case T_CoerceToDomain: + width = codegen_coercetodomain_expression(context, + body, + (CoerceToDomain *) node); + break; + + case T_CaseExpr: + width = codegen_casewhen_expression(context, + body, + (CaseExpr *) node); + break; + + case T_CaseTestExpr: + width = codegen_casetest_expression(context, + body, + (CaseTestExpr *) node); + break; + + case T_ScalarArrayOpExpr: + width = codegen_scalar_array_op_expression(context, + body, + (ScalarArrayOpExpr *) node); + break; + default: + __ELog("Bug? 
unsupported expression: %s", nodeToString(node)); + break; + } + if (p_width) + *p_width = width; + /* restore */ + __codegen_current_node = __codegen_saved_node; +} + +char * +pgstrom_codegen_expression(Node *expr, codegen_context *context) +{ + StringInfoData body; + devtype_info *dtype; + + initStringInfo(&body); + if (IsA(expr, List)) + { + if (list_length((List *)expr) == 1) + expr = (Node *)linitial((List *)expr); + else + expr = (Node *)make_andclause((List *)expr); + } + + PG_TRY(); + { + codegen_expression_walker(context, &body, expr, NULL); + } + PG_CATCH(); + { + errdetail("problematic expression: %s", nodeToString(expr)); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* + * Even if expression itself needs no varlena extra buffer, projection + * code may require the buffer to construct a temporary datum. + * E.g) Numeric datum is encoded to 128bit at the GPU kernel, however, + * projection needs to decode to varlena again. + */ + dtype = pgstrom_devtype_lookup(exprType((Node *) expr)); + if (dtype) + context->extra_bufsz += MAXALIGN(dtype->extra_sz); + + return body.data; +} + +/* + * pgstrom_union_type_declarations + * + * put declaration of a union type which contains all the types in type_oid_list, + * as follows. OID of device types should be unique, must not duplicated. + * + * union { + * pg_bool_t bool_v; + * pg_text_t text_v; + * : + * } NAME; + */ +void +pgstrom_union_type_declarations(StringInfo buf, + const char *name, + List *type_oid_list) +{ + ListCell *lc; + devtype_info *dtype; + bool meet_array_v = false; + + if (type_oid_list == NIL) + return; + appendStringInfo(buf, " union {\n"); + foreach (lc, type_oid_list) + { + Oid type_oid = lfirst_oid(lc); + + dtype = pgstrom_devtype_lookup(type_oid); + if (!dtype) + __ELog("failed to lookup device type: %u", type_oid); + /* + * All the array types have same device type name (pg_array_t) + * regardless of the element type. So, we have to avoid duplication + * of the field name in union, by special handling. + */ + if (dtype->type_element) + { + if (meet_array_v) + continue; + meet_array_v = true; + } + appendStringInfo(buf, + " pg_%s_t %s_v;\n", + dtype->type_name, + dtype->type_name); + } + appendStringInfo(buf, " } %s __attribute__((unused));\n", name); +} + +/* + * __pgstrom_device_expression + * + * It shows a quick decision whether the provided expression tree is + * available to run on CUDA device, or not. 
+ */ +bool +__pgstrom_device_expression(PlannerInfo *root, + RelOptInfo *baserel, + Expr *expr, + int *p_devcost, int *p_extra_sz, + const char *filename, int lineno) +{ + MemoryContext memcxt = CurrentMemoryContext; + codegen_context con; + int dummy = 0; + bool result = true; + + if (!expr) + return false; + pgstrom_init_codegen_context(&con, root, baserel); + PG_TRY(); + { + if (IsA(expr, List)) + { + List *exprsList = (List *)expr; + ListCell *lc; + + foreach (lc, exprsList) + { + Node *node = (Node *)lfirst(lc); + + codegen_expression_walker(&con, NULL, node, &dummy); + } + } + else + { + codegen_expression_walker(&con, NULL, (Node *)expr, &dummy); + } + } + PG_CATCH(); + { + ErrorData *edata; + + MemoryContextSwitchTo(memcxt); + edata = CopyErrorData(); + if (edata->sqlerrcode != ERRCODE_FEATURE_NOT_SUPPORTED) + PG_RE_THROW(); + + FlushErrorState(); + + ereport(DEBUG2, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("%s:%d %s, at %s:%d", + filename, lineno, + edata->message, + edata->filename, edata->lineno), + errdetail("expression: %s", + nodeToString(__codegen_current_node)))); + __codegen_current_node = NULL; + FreeErrorData(edata); + result = false; + } + PG_END_TRY(); + + if (result) + { + if (con.extra_bufsz > KERN_CONTEXT_VARLENA_BUFSZ_LIMIT) + { + elog(DEBUG2, "Expression consumes too much buffer (%u): %s", + con.extra_bufsz, nodeToString(expr)); + return false; + } + Assert(con.devcost >= 0); + if (p_devcost) + *p_devcost = con.devcost; + if (p_extra_sz) + *p_extra_sz = con.extra_bufsz; + } + return result; +} + +/* + * devcast_text2numeric_callback + * ------ + * Special case handling of text->numeric values, including the case of + * jsonb key references. + */ +static int +devcast_text2numeric_callback(codegen_context *context, + StringInfo body, + devcast_info *dcast, + CoerceViaIO *node) +{ + devtype_info *dtype = dcast->dst_type; + Expr *arg = node->arg; + Oid func_oid = InvalidOid; + List *func_args = NIL; + char dfunc_name[100]; + int width; + ListCell *lc; + + /* check special case if jsonb key reference */ + if (IsA(arg, FuncExpr)) + { + FuncExpr *func = (FuncExpr *)arg; + + func_oid = func->funcid; + func_args = func->args; + } + else if (IsA(arg, OpExpr) || IsA(arg, DistinctExpr)) + { + OpExpr *op = (OpExpr *)arg; + + func_oid = get_opcode(op->opno); + func_args = op->args; + } + else + __ELog("Not supported CoerceViaIO with jsonb key reference"); + + switch (func_oid) + { + case F_JSONB_OBJECT_FIELD_TEXT: + snprintf(dfunc_name, sizeof(dfunc_name), + "jsonb_object_field_as_%s", dtype->type_name); + break; + case F_JSONB_ARRAY_ELEMENT_TEXT: + snprintf(dfunc_name, sizeof(dfunc_name), + "jsonb_array_element_as_%s", dtype->type_name); + break; + default: + __ELog("Not supported CoerceViaIO with jsonb key reference"); + } + context->extra_flags |= DEVKERNEL_NEEDS_JSONLIB; + __appendStringInfo(body, "pgfn_%s(kcxt", dfunc_name); + foreach (lc, func_args) + { + Node *expr = lfirst(lc); + int dummy; + + __appendStringInfo(body, ", "); + codegen_expression_walker(context, body, expr, &dummy); + } + __appendStringInfoChar(body, ')'); + if (dtype->type_length > 0) + width = dtype->type_length; + else if (dtype->type_length == -1) + width = -1; /* we don't know max length of a jsonb field */ + else + elog(ERROR, "unexpected type length: %d", dtype->type_length); + + return width; +} + +static void +devtype_cache_invalidator(Datum arg, int cacheid, uint32 hashvalue) +{ + dlist_mutable_iter iter; + int hindex; + + Assert(cacheid == TYPEOID); + if (hashvalue == 0) + { + for 
(hindex=0; hindex < lengthof(devtype_info_slot); hindex++)
+ dlist_init(&devtype_info_slot[hindex]);
+ return;
+ }
+
+ hindex = hashvalue % lengthof(devtype_info_slot);
+ dlist_foreach_modify (iter, &devtype_info_slot[hindex])
+ {
+ devtype_info *dtype = dlist_container(devtype_info,
+ chain, iter.cur);
+ if (dtype->hashvalue == hashvalue)
+ {
+ dlist_delete(&dtype->chain);
+ memset(&dtype->chain, 0, sizeof(dlist_node));
+ }
+ }
+}
+
+static void
+devfunc_cache_invalidator(Datum arg, int cacheid, uint32 hashvalue)
+{
+ dlist_mutable_iter iter;
+ int hindex;
+
+ Assert(cacheid == PROCOID);
+ if (hashvalue == 0)
+ {
+ for (hindex=0; hindex < lengthof(devfunc_info_slot); hindex++)
+ dlist_init(&devfunc_info_slot[hindex]);
+ return;
+ }
+
+ hindex = hashvalue % lengthof(devfunc_info_slot);
+ dlist_foreach_modify (iter, &devfunc_info_slot[hindex])
+ {
+ devfunc_info *dfunc = dlist_container(devfunc_info,
+ chain, iter.cur);
+ if (dfunc->hashvalue == hashvalue)
+ {
+ dlist_delete(&dfunc->chain);
+ memset(&dfunc->chain, 0, sizeof(dlist_node));
+ }
+ }
+}
+
+static void
+devcast_cache_invalidator(Datum arg, int cacheid, uint32 hashvalue)
+{
+ dlist_mutable_iter iter;
+ int hindex;
+
+ Assert(cacheid == CASTSOURCETARGET);
+ if (hashvalue == 0)
+ {
+ for (hindex=0; hindex < lengthof(devcast_info_slot); hindex++)
+ dlist_init(&devcast_info_slot[hindex]);
+ return;
+ }
+
+ hindex = hashvalue % lengthof(devcast_info_slot);
+ dlist_foreach_modify (iter, &devcast_info_slot[hindex])
+ {
+ devcast_info *dcast = dlist_container(devcast_info,
+ chain, iter.cur);
+ if (dcast->hashvalue == hashvalue)
+ {
+ dlist_delete(&dcast->chain);
+ memset(&dcast->chain, 0, sizeof(dlist_node));
+ }
+ }
+}
+
+static void
+devindex_cache_invalidator(Datum arg, int cacheid, uint32 hashvalue)
+{
+ dlist_mutable_iter iter;
+ int hindex;
+
+ Assert(cacheid == AMOPOPID);
+ if (hashvalue == 0)
+ {
+ for (hindex=0; hindex < lengthof(devindex_info_slot); hindex++)
+ dlist_init(&devindex_info_slot[hindex]);
+ return;
+ }
+
+ hindex = hashvalue % lengthof(devindex_info_slot);
+ dlist_foreach_modify (iter, &devindex_info_slot[hindex])
+ {
+ devindex_info *dindex = dlist_container(devindex_info,
+ chain, iter.cur);
+ if (dindex->hashvalue == hashvalue)
+ {
+ dlist_delete(&dindex->chain);
+ memset(&dindex->chain, 0, sizeof(dlist_node));
+ }
+ }
+}
+
+void
+pgstrom_init_codegen_context(codegen_context *context,
+ PlannerInfo *root,
+ RelOptInfo *baserel)
+{
+ memset(context, 0, sizeof(codegen_context));
+ initStringInfo(&context->decl);
+ context->root = root;
+ context->baserel = baserel;
+}
+
+void
+pgstrom_init_codegen(void)
+{
+ int i;
+
+ for (i=0; i < lengthof(devtype_info_slot); i++)
+ dlist_init(&devtype_info_slot[i]);
+ for (i=0; i < lengthof(devfunc_info_slot); i++)
+ dlist_init(&devfunc_info_slot[i]);
+ for (i=0; i < lengthof(devcast_info_slot); i++)
+ dlist_init(&devcast_info_slot[i]);
+ for (i=0; i < lengthof(devindex_info_slot); i++)
+ dlist_init(&devindex_info_slot[i]);
+
+ devinfo_memcxt = AllocSetContextCreate(CacheMemoryContext,
+ "device type/func info cache",
+ ALLOCSET_DEFAULT_SIZES);
+ CacheRegisterSyscacheCallback(PROCOID, devfunc_cache_invalidator, 0);
+ CacheRegisterSyscacheCallback(TYPEOID, devtype_cache_invalidator, 0);
+ CacheRegisterSyscacheCallback(CASTSOURCETARGET, devcast_cache_invalidator, 0);
+ CacheRegisterSyscacheCallback(AMOPOPID, devindex_cache_invalidator, 0);
+}
diff --git a/src/cuda_basetype.h b/old/cuda_basetype.h
similarity index 100%
rename from src/cuda_basetype.h
rename to old/cuda_basetype.h
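Taken together, the entry points of this codegen module are meant to be driven roughly as follows. build_qual_kernel_source() and the way the fragments are stitched together are a hypothetical sketch; pgstrom_init_codegen_context(), pgstrom_codegen_expression(), pgstrom_union_type_declarations() and context->decl are taken from the hunk above:

    static char *
    build_qual_kernel_source(PlannerInfo *root, RelOptInfo *baserel,
                             Expr *qual, List *type_oid_list)
    {
        codegen_context con;
        StringInfoData  kern;
        char           *expr_code;

        pgstrom_init_codegen_context(&con, root, baserel);
        /* walks 'qual' and returns the device expression text */
        expr_code = pgstrom_codegen_expression((Node *)qual, &con);

        initStringInfo(&kern);
        /* helper declarations (__exprBoolOp_N() etc.) accumulated above */
        appendStringInfoString(&kern, con.decl.data);
        /* one union field per device type referenced by the expression */
        pgstrom_union_type_declarations(&kern, "temp", type_oid_list);
        appendStringInfo(&kern, "  status = %s;\n", expr_code);
        return kern.data;
    }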
diff --git a/src/cuda_codegen.h b/old/cuda_codegen.h similarity index 100% rename from src/cuda_codegen.h rename to old/cuda_codegen.h diff --git a/src/cuda_common.cu b/old/cuda_common.cu similarity index 100% rename from src/cuda_common.cu rename to old/cuda_common.cu diff --git a/old/cuda_common.h b/old/cuda_common.h new file mode 100644 index 000000000..0a593cda2 --- /dev/null +++ b/old/cuda_common.h @@ -0,0 +1,1837 @@ +/* + * cuda_common.h + * + * A common header for CUDA device code + * -- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. + */ +#ifndef CUDA_COMMON_H +#define CUDA_COMMON_H + +/* ---- Check minimum required CUDA version ---- */ +#ifdef __CUDACC__ +#if __CUDACC_VER_MAJOR__ < 9 || \ + (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ < 2) +#error PG-Strom requires CUDA 9.2 or later. Use newer version. +#endif /* >=CUDA9.2 */ +#include +#include +#endif /* __CUDACC__ */ + +/* check MAXIMUM_ALIGNOF */ +#if MAXIMUM_ALIGNOF == 8 +#define MAXIMUM_ALIGNOF_SHIFT 3 +#else +#error Unexpected MAXIMUM_ALIGNOF definition +#endif + +/* + * Basic type definition - because of historical reason, we use "cl_" + * prefix for the definition of data types below. It might imply + * something related to OpenCL, but what we intend at this moment is + * "CUDA Language". + */ +typedef char cl_bool; +typedef char cl_char; +typedef unsigned char cl_uchar; +typedef short cl_short; +typedef unsigned short cl_ushort; +typedef int cl_int; +typedef unsigned int cl_uint; +#ifdef __CUDACC__ +typedef long long cl_long; +typedef unsigned long long cl_ulong; +#else /* __CUDACC__ */ +typedef long cl_long; +typedef unsigned long cl_ulong; +#endif /* !__CUDACC__ */ +#ifdef __CUDACC__ +#include +typedef __half cl_half; +#else +/* Host code has no __half definition, so put dummy definition */ +typedef unsigned short cl_half; +#endif /* __CUDACC__ */ +typedef float cl_float; +typedef double cl_double; + +#define CL_SHORT_NBITS (sizeof(cl_short) * BITS_PER_BYTE) +#define CL_INT_NBITS (sizeof(cl_int) * BITS_PER_BYTE) +#define CL_LONG_NBITS (sizeof(cl_long) * BITS_PER_BYTE) + +/* PG's utility macros */ +#ifdef __CUDACC__ +#ifdef offsetof +#undef offsetof +#endif /* offsetof */ +#define offsetof(TYPE,FIELD) ((long) &((TYPE *)0UL)->FIELD) + +/* + * At CUDA10, we found nvcc replaces the offsetof above by __builtin_offsetof + * regardless of our macro definitions. It is mostly equivalent, however, it + * does not support offset calculation which includes run-time values. + * E.g) offsetof(kds, colmeta[kds->ncols]) made an error. + */ +#ifdef __NVCC__ +#define __builtin_offsetof(TYPE,FIELD) ((long) &((TYPE *)0UL)->FIELD) +#endif /* __NVCC__ */ + +#ifdef lengthof +#undef lengthof +#endif +#define lengthof(ARRAY) (sizeof(ARRAY) / sizeof((ARRAY)[0])) + +#ifdef container_of +#undef container_of +#endif +#define container_of(TYPE,FIELD,PTR) \ + ((TYPE *)((char *) (PTR) - offsetof(TYPE, FIELD))) + +#ifndef true +#define true ((cl_bool) 1) +#endif +#ifndef false +#define false ((cl_bool) 0) +#endif +#ifdef __CUDACC__ +#undef FLEXIBLE_ARRAY_MEMBER +#define FLEXIBLE_ARRAY_MEMBER 1 +#elif !defined(FLEXIBLE_ARRAY_MEMBER) +#define FLEXIBLE_ARRAY_MEMBER 1 +#endif /* __CUDACC__ */ + +/* + * If NVCC includes this file, some inline function needs declarations of + * basic utility functions. 
+ */
+#ifndef __CUDACC_RTC__
+#include
+#include
+#endif /* __CUDACC_RTC__ */
+
+#define Assert(cond) assert(cond)
+
+/* Other basic type definitions */
+typedef cl_ulong hostptr_t;
+typedef cl_ulong Datum;
+typedef struct nameData
+{
+ char data[NAMEDATALEN];
+} NameData;
+
+#define PointerGetDatum(X) ((Datum) (X))
+#define DatumGetPointer(X) ((char *) (X))
+
+#define SET_1_BYTE(value) (((Datum) (value)) & 0x000000ffL)
+#define SET_2_BYTES(value) (((Datum) (value)) & 0x0000ffffL)
+#define SET_4_BYTES(value) (((Datum) (value)) & 0xffffffffL)
+#define SET_8_BYTES(value) ((Datum) (value))
+
+#define READ_INT8_PTR(addr) SET_1_BYTE(*((cl_uchar *)(addr)))
+#define READ_INT16_PTR(addr) SET_2_BYTES(*((cl_ushort *)(addr)))
+#define READ_INT32_PTR(addr) SET_4_BYTES(*((cl_uint *)(addr)))
+#define READ_INT64_PTR(addr) SET_8_BYTES(*((cl_ulong *)(addr)))
+
+#define INT64CONST(x) ((cl_long) x##L)
+#define UINT64CONST(x) ((cl_ulong) x##UL)
+
+#define Max(a,b) ((a) > (b) ? (a) : (b))
+#define Max3(a,b,c) ((a) > (b) ? Max((a),(c)) : Max((b),(c)))
+#define Max4(a,b,c,d) Max(Max((a),(b)),Max((c),(d)))
+
+#define Min(a,b) ((a) < (b) ? (a) : (b))
+#define Min3(a,b,c) ((a) < (b) ? Min((a),(c)) : Min((b),(c)))
+#define Min4(a,b,c,d) Min(Min((a),(b)),Min((c),(d)))
+
+#define Add(a,b) ((a) + (b))
+#define Add3(a,b,c) ((a) + (b) + (c))
+#define Add4(a,b,c,d) ((a) + (b) + (c) + (d))
+
+#define Compare(a,b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0))
+
+/* same as host side get_next_log2() */
+#define get_next_log2(value) \
+ ((value) == 0 ? 0 : (sizeof(cl_ulong) * BITS_PER_BYTE - \
+ __clzll((cl_ulong)(value) - 1)))
+/*
+ * Limitation of types
+ */
+#ifndef SCHAR_MAX
+#define SCHAR_MAX 127
+#endif
+#ifndef SCHAR_MIN
+#define SCHAR_MIN (-128)
+#endif
+#ifndef UCHAR_MAX
+#define UCHAR_MAX 255
+#endif
+#ifndef SHRT_MAX
+#define SHRT_MAX 32767
+#endif
+#ifndef SHRT_MIN
+#define SHRT_MIN (-32767-1)
+#endif
+#ifndef USHRT_MAX
+#define USHRT_MAX 65535
+#endif
+#ifndef INT_MAX
+#define INT_MAX 2147483647
+#endif
+#ifndef INT_MIN
+#define INT_MIN (-INT_MAX - 1)
+#endif
+#ifndef UINT_MAX
+#define UINT_MAX 4294967295U
+#endif
+#ifndef LONG_MAX
+#define LONG_MAX 0x7FFFFFFFFFFFFFFFLL
+#endif
+#ifndef LONG_MIN
+#define LONG_MIN (-LONG_MAX - 1LL)
+#endif
+#ifndef ULONG_MAX
+#define ULONG_MAX 0xFFFFFFFFFFFFFFFFULL
+#endif
+#ifndef HALF_MAX
+#define HALF_MAX __short_as_half(0x7bff)
+#endif
+#ifndef HALF_MIN
+#define HALF_MIN __short_as_half(0x0400)
+#endif
+#ifndef HALF_INFINITY
+#define HALF_INFINITY __short_as_half(0x7c00)
+#endif
+#ifndef FLT_MAX
+#define FLT_MAX __int_as_float(0x7f7fffffU)
+#endif
+#ifndef FLT_MIN
+#define FLT_MIN __int_as_float(0x00800000U)
+#endif
+#ifndef FLT_INFINITY
+#define FLT_INFINITY __int_as_float(0x7f800000U)
+#endif
+#ifndef FLT_NAN
+#define FLT_NAN __int_as_float(0x7fffffffU)
+#endif
+#ifndef DBL_MAX
+#define DBL_MAX __longlong_as_double(0x7fefffffffffffffULL)
+#endif
+#ifndef DBL_MIN
+#define DBL_MIN __longlong_as_double(0x0010000000000000ULL)
+#endif
+#ifndef DBL_INFINITY
+#define DBL_INFINITY __longlong_as_double(0x7ff0000000000000ULL)
+#endif
+#ifndef DBL_NAN
+#define DBL_NAN __longlong_as_double(0x7fffffffffffffffULL)
+#endif
+
+/*
+ * Alignment macros
+ */
+#define TYPEALIGN(ALIGNVAL,LEN) \
+ (((cl_ulong) (LEN) + ((ALIGNVAL) - 1)) & ~((cl_ulong) ((ALIGNVAL) - 1)))
+#define TYPEALIGN_DOWN(ALIGNVAL,LEN) \
+ (((cl_ulong) (LEN)) & ~((cl_ulong) ((ALIGNVAL) - 1)))
+#define INTALIGN(LEN) TYPEALIGN(sizeof(cl_int), (LEN))
+#define INTALIGN_DOWN(LEN) TYPEALIGN_DOWN(sizeof(cl_int), (LEN))
+#define 
LONGALIGN(LEN) TYPEALIGN(sizeof(cl_long), (LEN))
+#define LONGALIGN_DOWN(LEN) TYPEALIGN_DOWN(sizeof(cl_long), (LEN))
+#define MAXALIGN(LEN) TYPEALIGN(MAXIMUM_ALIGNOF, (LEN))
+#define MAXALIGN_DOWN(LEN) TYPEALIGN_DOWN(MAXIMUM_ALIGNOF, (LEN))
+#endif /* __CUDACC__ */
+
+/* wider alignment */
+#define STROMALIGN_LEN 16
+#define STROMALIGN(LEN) TYPEALIGN(STROMALIGN_LEN,(LEN))
+#define STROMALIGN_DOWN(LEN) TYPEALIGN_DOWN(STROMALIGN_LEN,(LEN))
+
+#define GPUMEMALIGN_LEN 1024
+#define GPUMEMALIGN(LEN) TYPEALIGN(GPUMEMALIGN_LEN,(LEN))
+#define GPUMEMALIGN_DOWN(LEN) TYPEALIGN_DOWN(GPUMEMALIGN_LEN,(LEN))
+
+#define BLCKALIGN(LEN) TYPEALIGN(BLCKSZ,(LEN))
+#define BLCKALIGN_DOWN(LEN) TYPEALIGN_DOWN(BLCKSZ,(LEN))
+
+#ifdef __CUDACC__
+/*
+ * MEMO: We take dynamic local memory as cl_ulong data because of an
+ * alignment problem. The NVIDIA driver adjusts the alignment of local
+ * memory according to the data type; 1byte for cl_char, 4bytes for
+ * cl_uint and so on. Unexpectedly, a void * pointer has 1byte alignment
+ * even if it is expected to be cast to other data types.
+ * The __attribute__((aligned)) option did not work, at least on driver
+ * version 340.xx. So, we declare the local work memory as a cl_ulong *
+ * pointer as a workaround.
+ */
+#define SHARED_WORKMEM(TYPE) ((TYPE *) __pgstrom_dynamic_shared_workmem)
+extern __shared__ cl_ulong __pgstrom_dynamic_shared_workmem[];
+
+/*
+ * Thread index like OpenCL style.
+ *
+ * Be careful using these convenient aliases if the grid/block size may
+ * become larger than INT_MAX, because threadIdx and blockDim are declared
+ * as 32bit integers, thus intermediate results can overflow if they grow
+ * beyond INT_MAX.
+ */
+#define get_group_id() (blockIdx.x)
+#define get_num_groups() (gridDim.x)
+#define get_local_id() (threadIdx.x)
+#define get_local_size() (blockDim.x)
+#define get_global_id() (threadIdx.x + blockIdx.x * blockDim.x)
+#define get_global_size() (blockDim.x * gridDim.x)
+#define get_global_base() (blockIdx.x * blockDim.x)
+#define get_warp_id() (threadIdx.x / warpSize)
+#define get_lane_id() (threadIdx.x & (warpSize-1))
+#else /* __CUDACC__ */
+typedef cl_ulong hostptr_t;
+#endif /* !__CUDACC__ */
+
+/*
+ * Template of static function declarations
+ *
+ * The CUDA compiler raises a warning if static functions are not used,
+ * but we can restrain this message with the "unused" attribute on
+ * functions/values.
+ * STATIC_INLINE / STATIC_FUNCTION pack the common attributes to be
+ * assigned to host/device functions.
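+ *
+ * For instance (a sketch of the expansion under __CUDACC__, per the
+ * definitions below):
+ *
+ *   STATIC_INLINE(cl_int) foo(cl_int x)
+ *
+ * comes out as:
+ *
+ *   __device__ __host__ __forceinline__
+ *   static cl_int __attribute__ ((unused)) foo(cl_int x)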
+ */
+#define MAXTHREADS_PER_BLOCK 1024
+#define MAXWARPS_PER_BLOCK (MAXTHREADS_PER_BLOCK / 32)
+#ifdef __CUDACC__
+#define STATIC_INLINE(RET_TYPE) \
+ __device__ __host__ __forceinline__ \
+ static RET_TYPE __attribute__ ((unused))
+#define STATIC_FUNCTION(RET_TYPE) \
+ __device__ __host__ \
+ static RET_TYPE
+#define DEVICE_INLINE(RET_TYPE) \
+ __device__ __forceinline__ \
+ static RET_TYPE __attribute__ ((unused))
+#define DEVICE_FUNCTION(RET_TYPE) \
+ __device__ RET_TYPE __attribute__ ((unused))
+#define PUBLIC_FUNCTION(RET_TYPE) \
+ __device__ __host__ RET_TYPE
+#define KERNEL_FUNCTION(RET_TYPE) \
+ extern "C" __global__ RET_TYPE
+#else /* __CUDACC__ */
+#define STATIC_INLINE(RET_TYPE) static inline RET_TYPE
+#define STATIC_FUNCTION(RET_TYPE) static inline RET_TYPE
+#define PUBLIC_FUNCTION(RET_TYPE) RET_TYPE
+#endif /* !__CUDACC__ */
+
+/*
+ * __Fetch - access macro regardless of memory alignment
+ */
+#ifdef __CUDA_ARCH__
+template <typename T>
+DEVICE_INLINE(T)
+__Fetch(const T *ptr)
+{
+ T temp;
+ /*
+ * (2019/06/01) Originally, this function used direct pointer access
+ * via *ptr if the pointer was aligned. However, it looks like the
+ * NVCC/NVRTC optimization generates binary code that accesses the
+ * unaligned pointer anyway. '--device-debug' eliminates the strange
+ * behavior, and 'volatile' qualification also stops it.
+ * Maybe a future version of CUDA and NVCC/NVRTC will fix the problem.
+ */
+ memcpy(&temp, ptr, sizeof(T));
+
+ return temp;
+}
+#else /* __CUDA_ARCH__ */
+#define __Fetch(PTR) (*(PTR))
+#endif /* !__CUDA_ARCH__ */
+
+#ifdef __CUDA_ARCH__
+template <typename T>
+DEVICE_INLINE(T)
+__volatileRead(const volatile T *ptr)
+{
+ return *ptr;
+}
+
+template <typename T>
+DEVICE_INLINE(void)
+__volatileWrite(volatile T *ptr, T val)
+{
+ *ptr = val;
+}
+#endif
+
+/*
+ * Error code definition
+ *
+ * MEMO: SQL ERRCODE_* uses bits 0-29. We also use bit 30 as a flag of
+ * CPU fallback. Host code tries CPU fallback if this flag is set and
+ * pg_strom.cpu_fallback_enabled is set.
+ */
+#ifndef MAKE_SQLSTATE
+#define PGSIXBIT(ch) (((ch) - '0') & 0x3F)
+#define MAKE_SQLSTATE(ch1,ch2,ch3,ch4,ch5) \
+ (PGSIXBIT(ch1) + (PGSIXBIT(ch2) << 6) + (PGSIXBIT(ch3) << 12) + \
+ (PGSIXBIT(ch4) << 18) + (PGSIXBIT(ch5) << 24))
+#endif /* MAKE_SQLSTATE */
+#include "utils/errcodes.h"
+#define ERRCODE_FLAGS_CPU_FALLBACK (1U<<30)
+#define ERRCODE_STROM_SUCCESS 0
+#define ERRCODE_STROM_DATASTORE_NOSPACE MAKE_SQLSTATE('H','D','B','0','4')
+#define ERRCODE_STROM_WRONG_CODE_GENERATION MAKE_SQLSTATE('H','D','B','0','5')
+#define ERRCODE_STROM_DATA_CORRUPTION MAKE_SQLSTATE('H','D','B','0','7')
+#define ERRCODE_STROM_VARLENA_UNSUPPORTED MAKE_SQLSTATE('H','D','B','0','8')
+#define ERRCODE_STROM_RECURSION_TOO_DEEP MAKE_SQLSTATE('H','D','B','0','9')
+
+#define KERN_ERRORBUF_FILENAME_LEN 24
+#define KERN_ERRORBUF_FUNCNAME_LEN 64
+#define KERN_ERRORBUF_MESSAGE_LEN 200
+typedef struct
+{
+ cl_int errcode; /* one of the ERRCODE_* */
+ cl_int lineno;
+ char filename[KERN_ERRORBUF_FILENAME_LEN];
+ char funcname[KERN_ERRORBUF_FUNCNAME_LEN];
+ char message[KERN_ERRORBUF_MESSAGE_LEN];
+} kern_errorbuf;
+
+/*
+ * kern_context - a set of run-time information
+ */
+struct kern_parambuf;
+
+typedef struct
+{
+ cl_int errcode;
+ const char *error_filename;
+ cl_int error_lineno;
+ const char *error_funcname;
+ const char *error_message; /* !!only const static cstring!! 
*/
+ struct kern_parambuf *kparams;
+ void *stack_bounds;
+ cl_char *vlpos;
+ cl_char *vlend;
+ cl_char vlbuf[1];
+} kern_context;
+
+/*
+ * Usually, kern_context is declared in the auto-generated portion, then
+ * its pointer is passed to the pre-built GPU binary part. Its vlbuf
+ * length is determined at run-time compilation using the macro below.
+ */
+#define KERN_CONTEXT_VARLENA_BUFSZ_LIMIT 8192
+#ifdef __CUDACC_RTC__
+#define DECL_KERNEL_CONTEXT(NAME) \
+ union { \
+ kern_context kcxt; \
+ char __dummy__[offsetof(kern_context, vlbuf) + \
+ MAXALIGN(KERN_CONTEXT_VARLENA_BUFSZ)]; \
+ } NAME
+#endif /* __CUDACC_RTC__ */
+
+#define INIT_KERNEL_CONTEXT(kcxt,__kparams) \
+ do { \
+ memset(kcxt, 0, offsetof(kern_context, vlbuf)); \
+ (kcxt)->kparams = (__kparams); \
+ assert((cl_ulong)(__kparams) == MAXALIGN(__kparams)); \
+ (kcxt)->stack_bounds = (char *)(kcxt) - KERN_CONTEXT_STACK_LIMIT; \
+ (kcxt)->vlpos = (kcxt)->vlbuf; \
+ (kcxt)->vlend = (kcxt)->vlbuf + KERN_CONTEXT_VARLENA_BUFSZ; \
+ } while(0)
+
+#define PTR_ON_VLBUF(kcxt,ptr,len) \
+ ((char *)(ptr) >= (kcxt)->vlbuf && \
+ (char *)(ptr) + (len) <= (kcxt)->vlend)
+
+STATIC_INLINE(void *)
+kern_context_alloc(kern_context *kcxt, size_t len)
+{
+ char *pos = (char *)MAXALIGN(kcxt->vlpos);
+
+ if (pos >= kcxt->vlbuf && pos + len <= kcxt->vlend)
+ {
+ kcxt->vlpos = pos + len;
+ return pos;
+ }
+ return NULL;
+}
+
+#define CHECK_KERNEL_STACK_DEPTH(kcxt) \
+ (((cl_ulong)((kcxt)->stack_bounds)) > ((cl_ulong)(&(kcxt))))
+
+#ifdef __CUDA_ARCH__
+/*
+ * It sets an error code unless a significant error code is already set.
+ * Also, CpuReCheck has higher priority than RowFiltered because CpuReCheck
+ * implies the device cannot completely run the given expression
+ * (usually due to a compressed or external varlena datum).
+ */
+STATIC_INLINE(void)
+__STROM_EREPORT(kern_context *kcxt, cl_int errcode,
+ const char *filename, cl_int lineno,
+ const char *funcname, const char *message)
+{
+ cl_int oldcode = kcxt->errcode;
+
+ if (oldcode == ERRCODE_STROM_SUCCESS &&
+ errcode != ERRCODE_STROM_SUCCESS)
+ {
+ const char *pos;
+
+ for (pos=filename; *pos != '\0'; pos++)
+ {
+ if (pos[0] == '/' && pos[1] != '\0')
+ filename = pos + 1;
+ }
+ if (!message)
+ message = "GPU kernel internal error";
+ kcxt->errcode = errcode;
+ kcxt->error_filename = filename;
+ kcxt->error_lineno = lineno;
+ kcxt->error_funcname = funcname;
+ kcxt->error_message = message;
+ }
+}
+
+#define STROM_ELOG(kcxt, message) \
+ __STROM_EREPORT((kcxt),ERRCODE_INTERNAL_ERROR, \
+ __FILE__,__LINE__,__FUNCTION__,(message))
+#define STROM_EREPORT(kcxt, errcode, message) \
+ __STROM_EREPORT((kcxt),(errcode), \
+ __FILE__,__LINE__,__FUNCTION__,(message))
+#define STROM_CPU_FALLBACK(kcxt, errcode, message) \
+ __STROM_EREPORT((kcxt),(errcode) | ERRCODE_FLAGS_CPU_FALLBACK, \
+ __FILE__,__LINE__,__FUNCTION__,(message))
+
+STATIC_INLINE(void)
+__strncpy(char *d, const char *s, cl_uint n)
+{
+ cl_uint i, m = n-1;
+
+ for (i=0; i < m && s[i] != '\0'; i++)
+ d[i] = s[i];
+ while (i < n)
+ d[i++] = '\0';
+}
+
+/*
+ * kern_writeback_error_status
+ */
+STATIC_INLINE(void)
+kern_writeback_error_status(kern_errorbuf *result, kern_context *kcxt)
+{
+ /*
+ * It writes back a thread-local error status only when the global
+ * error status is not set yet and the caller thread has an error
+ * status. Otherwise, we don't involve any atomic operation in most
+ * of the code path. 
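+ *
+ * E.g., when many threads fail at once, only the first one to win the
+ * atomicCAS() below publishes its errcode and position; the others see
+ * a non-SUCCESS global errcode and leave the buffer untouched.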
+ */ + if (kcxt->errcode != ERRCODE_STROM_SUCCESS && + atomicCAS(&result->errcode, + ERRCODE_STROM_SUCCESS, + kcxt->errcode) == ERRCODE_STROM_SUCCESS) + { + result->errcode = kcxt->errcode; + result->lineno = kcxt->error_lineno; + __strncpy(result->filename, + kcxt->error_filename, + KERN_ERRORBUF_FILENAME_LEN); + __strncpy(result->funcname, + kcxt->error_funcname, + KERN_ERRORBUF_FUNCNAME_LEN); + __strncpy(result->message, + kcxt->error_message, + KERN_ERRORBUF_MESSAGE_LEN); + } +} +#elif defined(__CUDACC__) +#define STROM_EREPORT(kcxt, errcode, message) \ + do { \ + fprintf(stderr, "%s:%d %s (code=%d)\n", \ + __FUNCTION__, __LINE__, \ + message, errcode); \ + exit(1); \ + } while(0) +#define STROM_CPU_FALLBACK(a,b,c) STROM_EREPORT((a),(b),(c)) +#else /* !__CUDA_ARCH__ && !__CUDACC__ == gcc by pg_config */ +#define STROM_EREPORT(kcxt, errcode, message) \ + elog(ERROR, "%s:%d %s (code=%d)", \ + __FUNCTION__, __LINE__, \ + message, errcode) +#define STROM_CPU_FALLBACK(a,b,c) STROM_EREPORT((a),(b),(c)) +#endif /* !__CUDA_ARCH__ && !__CUDACC__ */ + +#ifdef __CUDACC__ +/* definitions at storage/block.h */ +typedef cl_uint BlockNumber; +#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF) +#define MaxBlockNumber ((BlockNumber) 0xFFFFFFFE) + +/* details are defined at cuda_gpuscan.h */ +struct PageHeaderData; + +/* definitions at access/htup_details.h */ +typedef struct { + struct { + cl_ushort bi_hi; + cl_ushort bi_lo; + } ip_blkid; + cl_ushort ip_posid; +} ItemPointerData; + +DEVICE_INLINE(cl_bool) +ItemPointerEquals(ItemPointerData *ip1, ItemPointerData *ip2) +{ + return (ip1->ip_blkid.bi_hi == ip2->ip_blkid.bi_hi && + ip1->ip_blkid.bi_lo == ip2->ip_blkid.bi_lo && + ip1->ip_posid == ip2->ip_posid); +} + +typedef struct HeapTupleFields +{ + cl_uint t_xmin; /* inserting xact ID */ + cl_uint t_xmax; /* deleting or locking xact ID */ + union + { + cl_uint t_cid; /* inserting or deleting command ID, or both */ + cl_uint t_xvac; /* old-style VACUUM FULL xact ID */ + } t_field3; +} HeapTupleFields; + +typedef struct DatumTupleFields +{ + cl_int datum_len_; /* varlena header (do not touch directly!) */ + cl_int datum_typmod; /* -1, or identifier of a record type */ + cl_uint datum_typeid; /* composite type OID, or RECORDOID */ +} DatumTupleFields; + +typedef struct { + union { + HeapTupleFields t_heap; + DatumTupleFields t_datum; + } t_choice; + + ItemPointerData t_ctid; /* current TID of this or newer tuple */ + + cl_ushort t_infomask2; /* number of attributes + various flags */ + cl_ushort t_infomask; /* various flag bits, see below */ + cl_uchar t_hoff; /* sizeof header incl. 
bitmap, padding */
+ /* ^ - 23 bytes - ^ */
+ cl_uchar t_bits[1]; /* bitmap of NULLs -- VARIABLE LENGTH */
+} HeapTupleHeaderData;
+
+#define att_isnull(ATT, BITS) (!((BITS)[(ATT) >> 3] & (1 << ((ATT) & 0x07))))
+#define BITMAPLEN(NATTS) (((int)(NATTS) + BITS_PER_BYTE - 1) / BITS_PER_BYTE)
+
+/*
+ * information stored in t_infomask:
+ */
+#define HEAP_HASNULL 0x0001 /* has null attribute(s) */
+#define HEAP_HASVARWIDTH 0x0002 /* has variable-width attribute(s) */
+#define HEAP_HASEXTERNAL 0x0004 /* has external stored attribute(s) */
+#define HEAP_HASOID 0x0008 /* has an object-id field */
+#define HEAP_XMAX_KEYSHR_LOCK 0x0010 /* xmax is a key-shared locker */
+#define HEAP_COMBOCID 0x0020 /* t_cid is a combo cid */
+#define HEAP_XMAX_EXCL_LOCK 0x0040 /* xmax is exclusive locker */
+#define HEAP_XMAX_LOCK_ONLY 0x0080 /* xmax, if valid, is only a locker */
+
+#define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */
+#define HEAP_XMIN_INVALID 0x0200 /* t_xmin invalid/aborted */
+#define HEAP_XMAX_COMMITTED 0x0400 /* t_xmax committed */
+#define HEAP_XMAX_INVALID 0x0800 /* t_xmax invalid/aborted */
+#define HEAP_XMAX_IS_MULTI 0x1000 /* t_xmax is a MultiXactId */
+#define HEAP_UPDATED 0x2000 /* this is UPDATEd version of row */
+#define HEAP_MOVED_OFF 0x4000 /* unused in GPU */
+#define HEAP_MOVED_IN 0x8000 /* unused in GPU */
+
+/*
+ * information stored in t_infomask2:
+ */
+#define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */
+#define HEAP_KEYS_UPDATED 0x2000 /* tuple was updated and key cols
+ * modified, or tuple deleted */
+#define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */
+#define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */
+#define HEAP2_XACT_MASK 0xE000 /* visibility-related bits */
+
+/*
+ * Index tuple header structure
+ *
+ * All index tuples start with IndexTupleData. If the HasNulls bit is set,
+ * this is followed by an IndexAttributeBitMapData. The index attribute
+ * values follow, beginning at a MAXALIGN boundary.
+ */
+typedef struct IndexTupleData
+{
+ ItemPointerData t_tid; /* reference TID to heap tuple */
+
+ /* ---------------
+ * t_info is laid out in the following fashion:
+ *
+ * 15th (high) bit: has nulls
+ * 14th bit: has var-width attributes
+ * 13th bit: AM-defined meaning
+ * 12-0 bit: size of tuple
+ * ---------------
+ */
+ unsigned short t_info;
+
+ char data[1]; /* data or IndexAttributeBitMapData */
+} IndexTupleData;
+
+typedef struct IndexAttributeBitMapData
+{
+ cl_uchar bits[(INDEX_MAX_KEYS + 8 - 1) / 8];
+} IndexAttributeBitMapData;
+
+#define INDEX_SIZE_MASK 0x1fff
+#define INDEX_VAR_MASK 0x4000
+#define INDEX_NULL_MASK 0x8000
+
+/*
+ * Below are routines to support KDS_FORMAT_BLOCKS - this KDS format is used
+ * to load raw PostgreSQL heap blocks onto the GPU without modification by
+ * the CPU. All the CPU has to pay attention to is not loading rows that
+ * should not be visible to the current scan snapshot. 
+ */ +typedef cl_uint TransactionId; +#define InvalidTransactionId ((TransactionId) 0) +#define FrozenTransactionId ((TransactionId) 2) +#define InvalidCommandId (~0U) +#else +#include "access/htup_details.h" +#endif /* __CUDACC__ */ + +typedef struct +{ + cl_int vl_len_; + cl_int ndim; /* always 1 for xidvector */ + cl_int dataoffset; /* always 0 for xidvector */ + cl_uint elemtype; /* XIDOID */ + cl_int dim1; /* number of items */ + cl_int lbound1; /* always 1 for xidvector */ + TransactionId values[FLEXIBLE_ARRAY_MEMBER]; +} xidvector; + +#ifdef __CUDACC__ +/* definitions at storage/itemid.h */ +typedef struct ItemIdData +{ + unsigned lp_off:15, /* offset to tuple (from start of page) */ + lp_flags:2, /* state of item pointer, see below */ + lp_len:15; /* byte length of tuple */ +} ItemIdData; + +#define LP_UNUSED 0 /* unused (should always have lp_len=0) */ +#define LP_NORMAL 1 /* used (should always have lp_len>0) */ +#define LP_REDIRECT 2 /* HOT redirect (should have lp_len=0) */ +#define LP_DEAD 3 /* dead, may or may not have storage */ + +#define ItemIdGetOffset(itemId) ((itemId)->lp_off) +#define ItemIdGetLength(itemId) ((itemId)->lp_len) +#define ItemIdIsUsed(itemId) ((itemId)->lp_flags != LP_UNUSED) +#define ItemIdIsNormal(itemId) ((itemId)->lp_flags == LP_NORMAL) +#define ItemIdIsRedirected(itemId) ((itemId)->lp_flags == LP_REDIRECT) +#define ItemIdIsDead(itemId) ((itemId)->lp_flags == LP_DEAD) +#define ItemIdHasStorage(itemId) ((itemId)->lp_len != 0) +#define ItemIdSetUnused(itemId) \ + do { \ + (itemId)->lp_flags = LP_UNUSED; \ + (itemId)->lp_off = 0; \ + (itemId)->lp_len = 0; \ + } while(0) + +/* definitions at storage/off.h */ +typedef cl_ushort OffsetNumber; + +#define InvalidOffsetNumber ((OffsetNumber) 0) +#define FirstOffsetNumber ((OffsetNumber) 1) +#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData))) +#define OffsetNumberMask (0xffff) /* valid uint16 bits */ + +#define OffsetNumberNext(offsetNumber) \ + ((OffsetNumber) (1 + (offsetNumber))) + +/* definitions at storage/bufpage.h */ +typedef cl_ushort LocationIndex; + +typedef struct PageHeaderData +{ +#if 0 + /* + * NOTE: device code (ab-)uses this field to track parent block/item + * when GiST index is loaded. Without this hack, hard to implement + * depth-first search at GpuJoin. + */ + PageXLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog + * record for last change to this page */ +#else + cl_uint pd_parent_blkno; + cl_uint pd_parent_item; +#endif + cl_ushort pd_checksum; /* checksum */ + cl_ushort pd_flags; /* flag bits, see below */ + LocationIndex pd_lower; /* offset to start of free space */ + LocationIndex pd_upper; /* offset to end of free space */ + LocationIndex pd_special; /* offset to start of special space */ + cl_ushort pd_pagesize_version; + TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */ + ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */ +} PageHeaderData; + +#define SizeOfPageHeaderData (offsetof(PageHeaderData, pd_linp)) + +#define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ +#define PD_PAGE_FULL 0x0002 /* not enough free space for new tuple? 
*/
+#define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to
+ * everyone */
+#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */
+
+#define PageGetItemId(page, offsetNumber) \
+ (&((PageHeaderData *)(page))->pd_linp[(offsetNumber) - 1])
+#define PageGetItem(page, lpp) \
+ ((HeapTupleHeaderData *)((char *)(page) + ItemIdGetOffset(lpp)))
+STATIC_INLINE(cl_uint)
+PageGetMaxOffsetNumber(PageHeaderData *page)
+{
+ cl_uint pd_lower = page->pd_lower;
+
+ return (pd_lower <= SizeOfPageHeaderData ? 0 :
+ (pd_lower - SizeOfPageHeaderData) / sizeof(ItemIdData));
+}
+
+/*
+ * GiST index specific structures and labels
+ */
+#define F_LEAF (1 << 0) /* leaf page */
+#define F_DELETED (1 << 1) /* the page has been deleted */
+#define F_TUPLES_DELETED (1 << 2) /* some tuples on the page were deleted */
+#define F_FOLLOW_RIGHT (1 << 3) /* page to the right has no downlink */
+#define F_HAS_GARBAGE (1 << 4) /* some tuples on the page are dead */
+
+#define GIST_PAGE_ID 0xFF81
+
+typedef struct GISTPageOpaqueData
+{
+ struct {
+ cl_uint xlogid;
+ cl_uint xrecoff;
+ } nsn;
+ BlockNumber rightlink; /* next page if any */
+ cl_ushort flags; /* see bit definitions above */
+ cl_ushort gist_page_id; /* for identification of GiST indexes */
+} GISTPageOpaqueData;
+
+STATIC_INLINE(GISTPageOpaqueData *)
+GistPageGetOpaque(PageHeaderData *page)
+{
+ return (GISTPageOpaqueData *)((char *)page + page->pd_special);
+}
+
+STATIC_INLINE(cl_bool)
+GistPageIsLeaf(PageHeaderData *page)
+{
+ return (GistPageGetOpaque(page)->flags & F_LEAF) != 0;
+}
+
+STATIC_INLINE(cl_bool)
+GistPageIsDeleted(PageHeaderData *page)
+{
+ return (GistPageGetOpaque(page)->flags & F_DELETED) != 0;
+}
+
+STATIC_INLINE(cl_bool)
+GistFollowRight(PageHeaderData *page)
+{
+ return (GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT) != 0;
+}
+
+/* root page of a gist index */
+#define GIST_ROOT_BLKNO 0
+
+#endif /* __CUDACC__ */
+
+/*
+ * kern_data_store
+ */
+#include "arrow_defs.h"
+
+#define TYPE_KIND__NULL 'n' /* unreferenced column */
+#define TYPE_KIND__BASE 'b'
+#define TYPE_KIND__ARRAY 'a'
+#define TYPE_KIND__COMPOSITE 'c'
+#define TYPE_KIND__DOMAIN 'd'
+#define TYPE_KIND__ENUM 'e'
+#define TYPE_KIND__PSEUDO 'p'
+#define TYPE_KIND__RANGE 'r'
+
+struct kern_colmeta {
+ /* true, if column is held by value. Otherwise, a reference */
+ cl_char attbyval;
+ /* alignment; 1,2,4 or 8, not characters in pg_attribute */
+ cl_char attalign;
+ /* length of attribute */
+ cl_short attlen;
+ /* attribute number */
+ cl_short attnum;
+ /* offset of attribute location, if deterministic */
+ cl_short attcacheoff;
+ /* oid of the SQL data type */
+ cl_uint atttypid;
+ /* typmod of the SQL data type */
+ cl_int atttypmod;
+ /* one of TYPE_KIND__* */
+ cl_char atttypkind;
+ /*
+ * (for array and composite types)
+ * Some types contain sub-fields, like array or composite types.
+ * We carry the type definition information (kern_colmeta) using the
+ * kds->colmeta[] array next to the top-level fields.
+ * An array type has a relevant element type, so its @num_subattrs
+ * is always 1, and kds->colmeta[@idx_subattrs] informs the properties
+ * of the element type.
+ * A composite type has several fields.
+ * kds->colmeta[@idx_subattrs ... @idx_subattrs + @num_subattrs -1]
+ * carries its sub-field properties.
+ */
+ cl_ushort idx_subattrs;
+ cl_ushort num_subattrs;
+
+ /* column name */
+ NameData attname;
+
+ /*
+ * (only arrow/column format)
+ * @attoptions keeps extra information of the Apache Arrow type. 
Unlike
+ * PostgreSQL types, it can carry variations of accuracy in time-related
+ * data types, or precision in the decimal data type.
+ */
+ ArrowTypeOptions attopts;
+ cl_uint nullmap_offset;
+ cl_uint nullmap_length;
+ cl_uint values_offset;
+ cl_uint values_length;
+ cl_uint extra_offset;
+ cl_uint extra_length;
+};
+typedef struct kern_colmeta kern_colmeta;
+
+/*
+ * kern_tupitem - individual items for KDS_FORMAT_ROW
+ */
+struct kern_tupitem
+{
+ cl_uint t_len; /* length of tuple */
+ cl_uint rowid; /* unique Id of this item */
+ HeapTupleHeaderData htup;
+};
+typedef struct kern_tupitem kern_tupitem;
+
+/*
+ * kern_hashitem - individual items for KDS_FORMAT_HASH
+ */
+struct kern_hashitem
+{
+ cl_uint hash; /* 32-bit hash value */
+ cl_uint next; /* offset of the next (PACKED) */
+ kern_tupitem t; /* HeapTuple of this entry */
+};
+typedef struct kern_hashitem kern_hashitem;
+
+#define KDS_FORMAT_ROW 1
+#define KDS_FORMAT_SLOT 2
+#define KDS_FORMAT_HASH 3 /* inner hash table for GpuHashJoin */
+#define KDS_FORMAT_BLOCK 4 /* raw blocks for direct loading */
+#define KDS_FORMAT_COLUMN 5 /* columnar based storage format */
+#define KDS_FORMAT_ARROW 6 /* apache arrow format */
+
+struct kern_data_store {
+ size_t length; /* length of this data-store */
+ /*
+ * NOTE: {nitems + usage} must be aligned to 64bit because this pair of
+ * values can be updated atomically using cmpxchg.
+ */
+ cl_uint nitems; /* number of rows in this store */
+ cl_uint usage; /* usage of this data-store (PACKED) */
+ cl_uint nrooms; /* number of available rows in this store */
+ cl_uint ncols; /* number of columns in this store */
+ cl_char format; /* one of KDS_FORMAT_* above */
+ cl_char has_varlena; /* true, if any varlena attribute */
+ cl_char tdhasoid; /* copy of TupleDesc.tdhasoid */
+ cl_uint tdtypeid; /* copy of TupleDesc.tdtypeid */
+ cl_int tdtypmod; /* copy of TupleDesc.tdtypmod */
+ cl_uint table_oid; /* OID of the table (only if GpuScan) */
+ cl_uint nslots; /* width of hash-slot (only HASH format) */
+ cl_uint nrows_per_block; /* average number of rows per
+ * PostgreSQL block (only BLOCK format) */
+ cl_uint nr_colmeta; /* number of colmeta[] array elements;
+ * maybe, >= ncols, if any composite types */
+ kern_colmeta colmeta[FLEXIBLE_ARRAY_MEMBER]; /* metadata of columns */
+};
+typedef struct kern_data_store kern_data_store;
+
+/*
+ * kern_data_extra - extra buffer of KDS_FORMAT_COLUMN
+ */
+struct kern_data_extra
+{
+ cl_ulong length;
+ cl_ulong usage;
+ char data[FLEXIBLE_ARRAY_MEMBER];
+};
+typedef struct kern_data_extra kern_data_extra;
+
+/* attribute number of system columns */
+#ifndef SYSATTR_H
+#define SelfItemPointerAttributeNumber (-1)
+#define ObjectIdAttributeNumber (-2)
+#define MinTransactionIdAttributeNumber (-3)
+#define MinCommandIdAttributeNumber (-4)
+#define MaxTransactionIdAttributeNumber (-5)
+#define MaxCommandIdAttributeNumber (-6)
+#define TableOidAttributeNumber (-7)
+#define FirstLowInvalidHeapAttributeNumber (-8)
+#endif /* !SYSATTR_H */
+
+/*
+ * MEMO: Support of 32GB KDS - KDS with row-, hash- and column-format
+ * internally uses a 32bit offset value from the head or base address.
+ * We have an assumption here - any object pointed to by the offset value
+ * is always aligned to the MAXIMUM_ALIGNOF boundary (64bit).
+ * It means we can use a 32bit offset to represent up to a 32GB range (35bit).
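+ *
+ * For example, with MAXIMUM_ALIGNOF_SHIFT == 3, a MAXALIGN'ed offset
+ * 0x12340 is stored as __kds_packed(0x12340) == 0x2468, and
+ * __kds_unpack(0x2468) restores the original value; the largest
+ * representable offset is ((size_t)UINT_MAX << 3), just under 32GB.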
+ */
+STATIC_INLINE(cl_uint)
+__kds_packed(size_t offset)
+{
+	Assert((offset & ~(0xffffffffUL << MAXIMUM_ALIGNOF_SHIFT)) == 0);
+	return (cl_uint)(offset >> MAXIMUM_ALIGNOF_SHIFT);
+}
+
+STATIC_INLINE(size_t)
+__kds_unpack(cl_uint offset)
+{
+	return (size_t)offset << MAXIMUM_ALIGNOF_SHIFT;
+}
+#define KDS_OFFSET_MAX_SIZE		((size_t)UINT_MAX << MAXIMUM_ALIGNOF_SHIFT)
+
+/* 'nslots' estimation; 25% larger than nitems, but 128 at least */
+#define __KDS_NSLOTS(nitems)				\
+	Max(128, ((nitems) * 5) >> 2)
+/*
+ * NOTE: For strict correctness, the header portion of kern_data_store may
+ * have a larger number of colmeta[] items than 'ncols', if array or
+ * composite types are in the field definition.
+ * However, it is relatively rare, and 'ncols' == 'nr_colmeta' in most cases.
+ * The macros below are used for just cost estimation; no need to be
+ * strictly correct for the size estimation.
+ */
+// use KDS_calculateHeadSize() instead
+#define KDS_ESTIMATE_HEAD_LENGTH(ncols)					\
+	STROMALIGN(offsetof(kern_data_store, colmeta[(ncols)]))
+#define KDS_ESTIMATE_ROW_LENGTH(ncols,nitems,htup_sz)			\
+	(KDS_ESTIMATE_HEAD_LENGTH(ncols) +							\
+	 STROMALIGN(sizeof(cl_uint) * (nitems)) +					\
+	 STROMALIGN(MAXALIGN(offsetof(kern_tupitem,					\
+								  htup) + htup_sz) * (nitems)))
+#define KDS_ESTIMATE_HASH_LENGTH(ncols,nitems,htup_sz)			\
+	(KDS_ESTIMATE_HEAD_LENGTH(ncols) +							\
+	 STROMALIGN(sizeof(cl_uint) * (nitems)) +					\
+	 STROMALIGN(sizeof(cl_uint) * __KDS_NSLOTS(nitems)) +		\
+	 STROMALIGN(MAXALIGN(offsetof(kern_hashitem,				\
+								  t.htup) + htup_sz) * (nitems)))
+
+/* Length of the header portion of kern_data_store */
+STATIC_INLINE(size_t)
+KERN_DATA_STORE_HEAD_LENGTH(kern_data_store *kds)
+{
+	return STROMALIGN(offsetof(kern_data_store,
+							   colmeta[kds->nr_colmeta]));
+}
+/* Base address of the data body */
+STATIC_INLINE(char *)
+KERN_DATA_STORE_BODY(kern_data_store *kds)
+{
+	return (char *)kds + KERN_DATA_STORE_HEAD_LENGTH(kds);
+}
+
+/* access function for row- and hash-format */
+STATIC_INLINE(cl_uint *)
+KERN_DATA_STORE_ROWINDEX(kern_data_store *kds)
+{
+	Assert(kds->format == KDS_FORMAT_ROW ||
+		   kds->format == KDS_FORMAT_HASH);
+	return (cl_uint *)KERN_DATA_STORE_BODY(kds);
+}
+
+/* access function for hash-format */
+STATIC_INLINE(cl_uint *)
+KERN_DATA_STORE_HASHSLOT(kern_data_store *kds)
+{
+	Assert(kds->format == KDS_FORMAT_HASH);
+	return (cl_uint *)(KERN_DATA_STORE_BODY(kds) +
+					   STROMALIGN(sizeof(cl_uint) * kds->nrooms));
+}
+
+/* access function for row- and hash-format */
+STATIC_INLINE(kern_tupitem *)
+KERN_DATA_STORE_TUPITEM(kern_data_store *kds, cl_uint kds_index)
+{
+	size_t	offset = KERN_DATA_STORE_ROWINDEX(kds)[kds_index];
+
+	if (!offset)
+		return NULL;
+	return (kern_tupitem *)((char *)kds + __kds_unpack(offset));
+}
+
+/* access macro for row-format by tup-offset */
+STATIC_INLINE(HeapTupleHeaderData *)
+KDS_ROW_REF_HTUP(kern_data_store *kds,
+				 cl_uint tup_offset,
+				 ItemPointerData *p_self,
+				 cl_uint *p_len)
+{
+	kern_tupitem   *tupitem;
+
+	Assert(kds->format == KDS_FORMAT_ROW ||
+		   kds->format == KDS_FORMAT_HASH);
+	if (tup_offset == 0)
+		return NULL;
+	tupitem = (kern_tupitem *)((char *)(kds)
+							   + __kds_unpack(tup_offset)
+							   - offsetof(kern_tupitem, htup));
+	if (p_self)
+		*p_self = tupitem->htup.t_ctid;
+	if (p_len)
+		*p_len = tupitem->t_len;
+	return &tupitem->htup;
+}
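+
+/*
+ * (Editorial sketch, not part of the original header: a minimal example of
+ * how the accessors above combine. It fetches the N-th HeapTuple of a
+ * ROW format store together with its ctid and length, by re-packing the
+ * htup offset in the same way the GPU kernels put it on pseudo-stacks.)
+ */
+STATIC_INLINE(HeapTupleHeaderData *)
+__kds_fetch_row_example(kern_data_store *kds, cl_uint kds_index,
+						ItemPointerData *p_self, cl_uint *p_len)
+{
+	kern_tupitem   *tupitem = KERN_DATA_STORE_TUPITEM(kds, kds_index);
+	cl_uint			tup_offset;
+
+	Assert(kds->format == KDS_FORMAT_ROW);
+	if (!tupitem)
+		return NULL;
+	/* packed offset of the HeapTupleHeader; kern_tupitem is MAXALIGN'ed */
+	tup_offset = __kds_packed((char *)&tupitem->htup - (char *)kds);
+	return KDS_ROW_REF_HTUP(kds, tup_offset, p_self, p_len);
+}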
+
+STATIC_INLINE(kern_hashitem *)
+KERN_HASH_FIRST_ITEM(kern_data_store *kds, cl_uint hash)
+{
+	cl_uint	   *slot = KERN_DATA_STORE_HASHSLOT(kds);
+	size_t		offset = __kds_unpack(slot[hash % kds->nslots]);
+
+	if (offset == 0)
+		return NULL;
+	Assert(offset < kds->length);
+	return (kern_hashitem *)((char *)kds + offset);
+}
+
+STATIC_INLINE(kern_hashitem *)
+KERN_HASH_NEXT_ITEM(kern_data_store *kds, kern_hashitem *khitem)
+{
+	size_t		offset;
+
+	if (!khitem || khitem->next == 0)
+		return NULL;
+	offset = __kds_unpack(khitem->next);
+	Assert(offset < kds->length);
+	return (kern_hashitem *)((char *)kds + offset);
+}
+
+/* access macro for tuple-slot format */
+STATIC_INLINE(size_t)
+KERN_DATA_STORE_SLOT_LENGTH(kern_data_store *kds, cl_uint nitems)
+{
+	size_t	headsz = KERN_DATA_STORE_HEAD_LENGTH(kds);
+	size_t	unitsz = LONGALIGN((sizeof(Datum) + sizeof(char)) * kds->ncols);
+
+	return headsz + unitsz * nitems;
+}
+
+STATIC_INLINE(Datum *)
+KERN_DATA_STORE_VALUES(kern_data_store *kds, cl_uint row_index)
+{
+	size_t	offset = KERN_DATA_STORE_SLOT_LENGTH(kds, row_index);
+
+	return (Datum *)((char *)kds + offset);
+}
+
+STATIC_INLINE(cl_char *)
+KERN_DATA_STORE_DCLASS(kern_data_store *kds, cl_uint row_index)
+{
+	Datum  *values = KERN_DATA_STORE_VALUES(kds, row_index);
+
+	return (cl_char *)(values + kds->ncols);
+}
+
+/* access macro for block format */
+#define KERN_DATA_STORE_PARTSZ(kds)				\
+	Min(((kds)->nrows_per_block +				\
+		 warpSize - 1) & ~(warpSize - 1),		\
+		get_local_size())
+#define KERN_DATA_STORE_BLOCK_BLCKNR(kds,kds_index)			\
+	(((BlockNumber *)KERN_DATA_STORE_BODY(kds))[kds_index])
+#define KERN_DATA_STORE_BLOCK_PGPAGE(kds,kds_index)			\
+	((struct PageHeaderData *)								\
+	 (KERN_DATA_STORE_BODY(kds) +							\
+	  STROMALIGN(sizeof(BlockNumber) * (kds)->nrooms) +		\
+	  BLCKSZ * kds_index))
+
+/*
+ * KDS_BLOCK_REF_HTUP
+ *
+ * It pulls a HeapTupleHeader by a pair of KDS and lp_offset.
+ */
+STATIC_INLINE(HeapTupleHeaderData *)
+KDS_BLOCK_REF_HTUP(kern_data_store *kds,
+				   cl_uint lp_offset,
+				   ItemPointerData *p_self,
+				   cl_uint *p_len)
+{
+	/*
+	 * NOTE: lp_offset is not a packed offset!
+	 * KDS_FORMAT_BLOCK will never be larger than 4GB.
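+	 * (Editorial note: a line-pointer is only 4-byte aligned within its
+	 * block, so this offset cannot go through __kds_packed(), which
+	 * assumes 64bit alignment; a plain 32bit byte offset is used instead.)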
+	 */
+	ItemIdData	   *lpp = (ItemIdData *)((char *)kds + lp_offset);
+	cl_uint			head_size;
+	cl_uint			block_id;
+	BlockNumber		block_nr;
+	PageHeaderData *pg_page;
+
+	Assert(kds->format == KDS_FORMAT_BLOCK);
+	if (lp_offset == 0)
+		return NULL;
+	head_size = (KERN_DATA_STORE_HEAD_LENGTH(kds) +
+				 STROMALIGN(sizeof(BlockNumber) * kds->nrooms));
+	Assert(lp_offset >= head_size &&
+		   lp_offset <  head_size + BLCKSZ * kds->nitems);
+	block_id = (lp_offset - head_size) / BLCKSZ;
+	block_nr = KERN_DATA_STORE_BLOCK_BLCKNR(kds, block_id);
+	pg_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds, block_id);
+
+	Assert(lpp >= pg_page->pd_linp &&
+		   lpp -  pg_page->pd_linp < PageGetMaxOffsetNumber(pg_page));
+	if (p_self)
+	{
+		p_self->ip_blkid.bi_hi	= block_nr >> 16;
+		p_self->ip_blkid.bi_lo	= block_nr & 0xffff;
+		p_self->ip_posid		= lpp - pg_page->pd_linp;
+	}
+	if (p_len)
+		*p_len = ItemIdGetLength(lpp);
+	return (HeapTupleHeaderData *)PageGetItem(pg_page, lpp);
+}
+
+/* access functions for apache arrow format */
+STATIC_INLINE(void *)
+kern_fetch_simple_datum_arrow(kern_colmeta *cmeta,
+							  char *base,
+							  cl_uint index,
+							  cl_uint unitsz)
+{
+	cl_char	   *nullmap = NULL;
+	cl_char	   *values;
+
+	if (cmeta->nullmap_offset)
+	{
+		nullmap = base + __kds_unpack(cmeta->nullmap_offset);
+		if (att_isnull(index, nullmap))
+			return NULL;
+	}
+	Assert(cmeta->values_offset > 0);
+	Assert(cmeta->extra_offset == 0);
+	Assert(cmeta->extra_length == 0);
+	Assert(unitsz * (index+1) <= __kds_unpack(cmeta->values_length));
+	values = base + __kds_unpack(cmeta->values_offset);
+	return values + unitsz * index;
+}
+
+STATIC_INLINE(void *)
+kern_fetch_varlena_datum_arrow(kern_colmeta *cmeta,
+							   char *base,
+							   cl_uint index,
+							   cl_uint *p_length)
+{
+	cl_char	   *nullmap;
+	cl_uint	   *offset;
+	cl_char	   *extra;
+
+	if (cmeta->nullmap_offset)
+	{
+		nullmap = base + __kds_unpack(cmeta->nullmap_offset);
+		if (att_isnull(index, nullmap))
+			return NULL;
+	}
+	Assert(cmeta->values_offset > 0 &&
+		   cmeta->extra_offset > 0 &&
+		   sizeof(cl_uint) * (index+1) <= __kds_unpack(cmeta->values_length));
+	offset = (cl_uint *)(base + __kds_unpack(cmeta->values_offset));
+	extra = base + __kds_unpack(cmeta->extra_offset);
+
+	Assert(offset[index] <= offset[index+1] &&
+		   offset[index+1] <= __kds_unpack(cmeta->extra_length));
+	*p_length = offset[index+1] - offset[index];
+	return (extra + offset[index]);
+}
+
+/*
+ * kern_parambuf
+ *
+ * Const and Parameter buffer. It stores constant values during a particular
+ * scan, so it may make sense to put it on the constant memory, if it is
+ * obvious that the length of kern_parambuf is less than the constant
+ * memory size (NOTE: not implemented yet).
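+ *
+ * (Editorial note on the layout below: poffset[i] is a byte offset from
+ * the head of kern_parambuf, or 0 if the i-th parameter is NULL; e.g.,
+ * kparam_get_value(kparams, 2) just returns
+ * (char *)kparams + kparams->poffset[2], or NULL.)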
+ */
+typedef struct kern_parambuf
+{
+	/*
+	 * Fields of system information on execution
+	 */
+	cl_long		xactStartTimestamp;	/* timestamp when the transaction started */
+	cl_uint		xactIdVector;		/* offset to xidvector */
+
+	/* variable length parameters / constants */
+	cl_uint		length;		/* total length of parambuf */
+	cl_uint		nparams;	/* number of parameters */
+	cl_uint		poffset[FLEXIBLE_ARRAY_MEMBER];	/* offset of params */
+} kern_parambuf;
+
+STATIC_INLINE(void *)
+kparam_get_value(kern_parambuf *kparams, cl_uint pindex)
+{
+	if (pindex >= kparams->nparams)
+		return NULL;
+	if (kparams->poffset[pindex] == 0)
+		return NULL;
+	return (char *)kparams + kparams->poffset[pindex];
+}
+
+STATIC_INLINE(cl_bool)
+pointer_on_kparams(void *ptr, kern_parambuf *kparams)
+{
+	return kparams && ((char *)ptr >= (char *)kparams &&
+					   (char *)ptr <  (char *)kparams + kparams->length);
+}
+
+/*
+ * PostgreSQL varlena related definitions
+ *
+ * Unlike host code, device code cannot touch external and/or compressed
+ * toast datums. The only format device code can understand is the usual
+ * in-memory form: a 4-byte length is put on the head and the contents
+ * follow. So, it is a responsibility of the host code to decompress the
+ * toast values if device code may access compressed varlena.
+ * If device code touches an unsupported format, the calculation result
+ * shall be postponed and computed on the host side instead.
+ *
+ * Note that it is harmless to have an external and/or compressed toast
+ * datum as long as it is not referenced in the device code. Device code
+ * can understand the length of these values, unlike their contents.
+ */
+typedef struct varlena		varlena;
+#ifndef POSTGRES_H
+struct varlena {
+	cl_char		vl_len_[4];	/* Do not touch this field directly! */
+	cl_char		vl_dat[1];
+};
+
+#define VARHDRSZ			((int) sizeof(cl_int))
+#define VARDATA(PTR)		VARDATA_4B(PTR)
+#define VARSIZE(PTR)		VARSIZE_4B(PTR)
+#define VARSIZE_EXHDR(PTR)	(VARSIZE(PTR) - VARHDRSZ)
+
+#define VARSIZE_SHORT(PTR)	VARSIZE_1B(PTR)
+#define VARDATA_SHORT(PTR)	VARDATA_1B(PTR)
+
+typedef union
+{
+	struct						/* Normal varlena (4-byte length) */
+	{
+		cl_uint		va_header;
+		cl_char		va_data[1];
+	} va_4byte;
+	struct						/* Compressed-in-line format */
+	{
+		cl_uint		va_header;
+		cl_uint		va_rawsize;	/* Original data size (excludes header) */
+		cl_char		va_data[1];	/* Compressed data */
+	} va_compressed;
+} varattrib_4b;
+
+typedef struct
+{
+	cl_uchar	va_header;
+	cl_char		va_data[1];		/* Data begins here */
+} varattrib_1b;
+
+/* inline portion of a short varlena pointing to an external resource */
+typedef struct
+{
+	cl_uchar	va_header;		/* Always 0x80 or 0x01 */
+	cl_uchar	va_tag;			/* Type of datum */
+	cl_char		va_data[1];		/* Data (of the type indicated by va_tag) */
+} varattrib_1b_e;
+
+typedef enum vartag_external
+{
+	VARTAG_INDIRECT	= 1,
+	VARTAG_ONDISK	= 18
+} vartag_external;
+
+#define VARHDRSZ_SHORT			offsetof(varattrib_1b, va_data)
+#define VARATT_SHORT_MAX		0x7F
+
+typedef struct varatt_external
+{
+	cl_int		va_rawsize;		/* Original data size (includes header) */
+	cl_int		va_extsize;		/* External saved size (doesn't) */
+	cl_int		va_valueid;		/* Unique ID of value within TOAST table */
+	cl_int		va_toastrelid;	/* RelID of TOAST table containing it */
+} varatt_external;
+
+typedef struct varatt_indirect
+{
+	hostptr_t	pointer;		/* Host pointer to in-memory varlena */
+} varatt_indirect;
+
+#define VARTAG_SIZE(tag)									\
+	((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) :	\
+	 (tag) == VARTAG_ONDISK ? sizeof(varatt_external) :		\
+	 0 /* should not happen */)
+
+#define VARHDRSZ_EXTERNAL		offsetof(varattrib_1b_e, va_data)
+#define VARTAG_EXTERNAL(PTR)	VARTAG_1B_E(PTR)
+#define VARSIZE_EXTERNAL(PTR)	\
+	(VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR)))
+
+/*
+ * compressed varlena format
+ */
+typedef struct toast_compress_header
+{
+	cl_int		vl_len_;	/* varlena header (do not touch directly!) */
+	cl_int		rawsize;
+} toast_compress_header;
+
+#define TOAST_COMPRESS_HDRSZ		((cl_int)sizeof(toast_compress_header))
+#define TOAST_COMPRESS_RAWSIZE(ptr)				\
+	(((toast_compress_header *) (ptr))->rawsize)
+#define TOAST_COMPRESS_RAWDATA(ptr)				\
+	(((char *) (ptr)) + TOAST_COMPRESS_HDRSZ)
+#define TOAST_COMPRESS_SET_RAWSIZE(ptr, len)	\
+	(((toast_compress_header *) (ptr))->rawsize = (len))
+
+/* basic varlena macros */
+#define VARATT_IS_4B(PTR) \
+	((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x00)
+#define VARATT_IS_4B_U(PTR) \
+	((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x00)
+#define VARATT_IS_4B_C(PTR) \
+	((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x02)
+#define VARATT_IS_1B(PTR) \
+	((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x01)
+#define VARATT_IS_1B_E(PTR) \
+	((((varattrib_1b *) (PTR))->va_header) == 0x01)
+#define VARATT_IS_COMPRESSED(PTR)		VARATT_IS_4B_C(PTR)
+#define VARATT_IS_EXTERNAL(PTR)			VARATT_IS_1B_E(PTR)
+#define VARATT_IS_EXTERNAL_ONDISK(PTR)		\
+	(VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK)
+#define VARATT_IS_EXTERNAL_INDIRECT(PTR)	\
+	(VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT)
+#define VARATT_IS_SHORT(PTR)			VARATT_IS_1B(PTR)
+#define VARATT_IS_EXTENDED(PTR)			(!VARATT_IS_4B_U(PTR))
+#define VARATT_NOT_PAD_BYTE(PTR)		(*((cl_uchar *) (PTR)) != 0)
+
+#define VARSIZE_4B(PTR)	\
+	((__Fetch(&((varattrib_4b *)(PTR))->va_4byte.va_header)>>2) & 0x3FFFFFFF)
+#define VARSIZE_1B(PTR)	\
+	((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F)
+#define VARTAG_1B_E(PTR) \
+	(((varattrib_1b_e *) (PTR))->va_tag)
+
+#define VARRAWSIZE_4B_C(PTR)	\
+	__Fetch(&((varattrib_4b *) (PTR))->va_compressed.va_rawsize)
+
+#define VARSIZE_ANY_EXHDR(PTR) \
+	(VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR)-VARHDRSZ_EXTERNAL :	\
+	 (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR)-VARHDRSZ_SHORT :				\
+	  VARSIZE_4B(PTR)-VARHDRSZ))
+
+#define VARSIZE_ANY(PTR)							\
+	(VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) :	\
+	 (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) :			\
+	  VARSIZE_4B(PTR)))
+
+#define VARDATA_4B(PTR)	(((varattrib_4b *) (PTR))->va_4byte.va_data)
+#define VARDATA_1B(PTR)	(((varattrib_1b *) (PTR))->va_data)
+#define VARDATA_ANY(PTR) \
+	(VARATT_IS_1B(PTR) ? VARDATA_1B(PTR) : VARDATA_4B(PTR))
+
+#define SET_VARSIZE(PTR, len)	\
+	(((varattrib_4b *)(PTR))->va_4byte.va_header = (((cl_uint) (len)) << 2))
+#endif	/* POSTGRES_H */
+
+#ifndef ARRAY_H
+/*
+ * Definitions of array internal of PostgreSQL
+ */
+typedef struct
+{
+	/*
+	 * NOTE: We assume a 4-byte varlena header for the array type. It allows
+	 * aligned references to the array elements. Unlike the CPU side, we
+	 * cannot have an extra malloc to ensure a 4-byte varlena header. That
+	 * is the reason why our ScalarArrayOp implementation does not support
+	 * an array data type referenced by a Var node, which potentially has
+	 * the short format.
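+	 * (Editorial example: with a short 1-byte header, int4 elements would
+	 * start at an odd address, which the GPU cannot dereference without
+	 * a misaligned access.)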
+	 */
+	cl_uint		vl_len_;		/* don't touch this field */
+	cl_int		ndim;			/* # of dimensions */
+	cl_int		dataoffset;		/* offset to data, or 0 if no bitmap */
+	cl_uint		elemtype;		/* element type OID */
+} ArrayType;
+
+typedef struct
+{
+	cl_int		ndim;			/* # of dimensions */
+	cl_int		dataoffset;		/* offset to data, or 0 if no bitmap */
+	cl_uint		elemtype;		/* element type OID */
+} ArrayTypeData;
+
+#define ARR_SIZE(a)			VARSIZE_ANY(a)
+#define ARR_BODY(a)			((ArrayTypeData *)VARDATA_ANY(a))
+#define ARR_NDIM(a)			__Fetch(&ARR_BODY(a)->ndim)
+#define ARR_DATAOFFSET(a)	__Fetch(&ARR_BODY(a)->dataoffset)
+#define ARR_HASNULL(a)		(ARR_DATAOFFSET(a) != 0)
+#define ARR_ELEMTYPE(a)		__Fetch(&ARR_BODY(a)->elemtype)
+#define ARR_DIMS(a)									\
+	((int *)((char *)VARDATA_ANY(a) + sizeof(ArrayTypeData)))
+#define ARR_LBOUND(a)		(ARR_DIMS(a) + ARR_NDIM(a))
+#define ARR_NULLBITMAP(a)							\
+	(ARR_HASNULL(a) ? (char *)(ARR_DIMS(a) + 2 * ARR_NDIM(a)) : (char *)NULL)
+#define ARR_DATA_PTR(a)								\
+	((char *)VARDATA_ANY(a) +						\
+	 (ARR_HASNULL(a) ? (ARR_DATAOFFSET(a) - VARHDRSZ)	\
+	  : (sizeof(ArrayTypeData) + 2 * sizeof(int) * ARR_NDIM(a))))
+
+/*
+ * The total array header size (in bytes) for an array with the specified
+ * number of dimensions and total number of items.
+ * NOTE: These macros assume a 4-byte varlena header.
+ */
+#define ARR_OVERHEAD_NONULLS(ndims)					\
+	MAXALIGN(sizeof(ArrayType) + 2 * sizeof(int) * (ndims))
+#define ARR_OVERHEAD_WITHNULLS(ndims, nitems)		\
+	MAXALIGN(sizeof(ArrayType) + 2 * sizeof(int) * (ndims) +	\
+			 ((nitems) + 7) / 8)
+
+#endif	/* ARRAY_H */
+
+/* ----------------------------------------------------------------
+ *
+ * About GPU Projection Support
+ *
+ * A typical projection code path is below:
+ *
+ * 1. Extract values from the heap-tuple or column-store onto tup_dclass[]
+ *    and tup_values[] arrays, and calculate the length of the new heap-tuple.
+ * 2. Allocation of the destination buffer, per threads-group.
+ * 3. Write out the heap-tuple.
+ *
+ * Step-1 is usually handled by auto-generated code. In some cases, it is
+ * not reasonable to extract values to the in-storage format prior to the
+ * allocation of the destination buffer, like a long text value that
+ * references a source buffer in Apache Arrow.
+ * Right now, we pay special attention to simple varlena (Binary of Arrow
+ * that is bytea in PG, and Utf8 of Arrow that is text in PG), and arrays
+ * of fixed-length values (List of Arrow).
+ * If tup_values[] holds a pointer to pg_varlena_t or pg_array_t, not a
+ * raw varlena image, tup_dclass[] will have a special flag to inform the
+ * indirect reference to the value.
+ *
+ * The pg_XXXX_datum_ref() routines of the types are responsible for the
+ * transformation from the disk format to the internal representation.
+ * The pg_XXXX_datum_store() routines of the types are responsible for the
+ * transformation from the internal representation to the disk format.
+ * We need to pay attention at the projection stage: if and when GPU code
+ * tries to store expressions which are not simple Var, Const or Param,
+ * these internal representations must be written to the extra-buffer first.
+ *
+ * Also note that KDS_FORMAT_SLOT is designed to have a layout compatible
+ * to the pair of tup_dclass[] / tup_values[] arrays, if all the items have
+ * NULL or NORMAL state. Other states should be normalized prior to the
+ * CPU writeback.
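+ *
+ * (Editorial example: an Arrow Utf8 cell projected as PG text sets
+ * tup_dclass[j] = DATUM_CLASS__VARLENA and puts a pointer to the
+ * pg_varlena_t into tup_values[j]; the datum-store routine later
+ * materializes it as a normal 4-byte header varlena on the destination
+ * buffer.)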
+ * + * ---------------------------------------------------------------- + */ +#define DATUM_CLASS__NORMAL 0 /* datum is normal value */ +#define DATUM_CLASS__NULL 1 /* datum is NULL */ +#define DATUM_CLASS__VARLENA 2 /* datum is pg_varlena_t reference */ +#define DATUM_CLASS__ARRAY 3 /* datum is pg_array_t reference */ +#define DATUM_CLASS__COMPOSITE 4 /* datum is pg_composite_t reference */ +#define DATUM_CLASS__GEOMETRY 5 /* datum is pg_geometry_t reference */ + +/* + * device functions in cuda_common.fatbin + */ +#ifdef __CUDACC__ +/* lightweight hash */ +DEVICE_FUNCTION(cl_uint) +pg_hash_any(const cl_uchar *k, cl_int keylen); +/* little bit heavy, but more randomized SipHash */ +DEVICE_FUNCTION(cl_ulong) +pg_siphash_any(const unsigned char *buf, const size_t len); +#endif /* __CUDACC__ */ + +/* + * Macro to extract a heap-tuple + * + * usage: + * char *addr; + * + * EXTRACT_HEAP_TUPLE_BEGIN(kds, htup, addr) + * -> addr shall point the device pointer of the first field, or NULL + * EXTRACT_HEAP_TUPLE_NEXT(addr) + * -> addr shall point the device pointer of the second field, or NULL + * : + * EXTRACT_HEAP_TUPLE_END() + * + * EXTRACT_HEAP_READ_XXXX() + * -> load raw values to dclass[]/values[], and update extras[] + */ +#define EXTRACT_HEAP_TUPLE_BEGIN(KDS,HTUP,NATTRS) \ + do { \ + kern_colmeta *__cmeta; \ + cl_int __colidx; \ + cl_int __ncols; \ + cl_uchar *__nullmap = NULL; \ + char *__pos; \ + void *addr; \ + \ + if (!(HTUP)) \ + __ncols = 0; /* to be considered as NULL */ \ + else \ + { \ + if (((HTUP)->t_infomask & HEAP_HASNULL) != 0) \ + __nullmap = (HTUP)->t_bits; \ + __ncols = ((HTUP)->t_infomask2 & HEAP_NATTS_MASK); \ + __ncols = Min((KDS)->ncols, __ncols); \ + __pos = (char *)(HTUP) + (HTUP)->t_hoff; \ + assert(__pos == (char *)MAXALIGN(__pos)); \ + } \ + \ + for (__colidx=0; __colidx < (NATTRS); __colidx++) \ + { \ + if (__colidx < __ncols && \ + (!__nullmap || !att_isnull(__colidx, __nullmap))) \ + { \ + __cmeta = &((KDS)->colmeta[__colidx]); \ + if (__cmeta->attlen > 0) \ + __pos = (char *)TYPEALIGN(__cmeta->attalign, __pos); \ + else if (!VARATT_NOT_PAD_BYTE(__pos)) \ + __pos = (char *)TYPEALIGN(__cmeta->attalign, __pos); \ + addr = __pos; \ + __pos += (__cmeta->attlen > 0 ? 
\ + __cmeta->attlen : \ + VARSIZE_ANY(__pos)); \ + } \ + else \ + addr = NULL + +#define EXTRACT_HEAP_TUPLE_END() \ + } \ + } while(0) + +#define EXTRACT_HEAP_READ_8BIT(ADDR,ATT_DCLASS,ATT_VALUES) \ + do { \ + if (!(ADDR)) \ + (ATT_DCLASS) = DATUM_CLASS__NULL; \ + else \ + { \ + (ATT_DCLASS) = DATUM_CLASS__NORMAL; \ + (ATT_VALUES) = *((cl_uchar *)(ADDR)); \ + } \ + } while(0) + +#define EXTRACT_HEAP_READ_16BIT(ADDR,ATT_DCLASS,ATT_VALUES) \ + do { \ + if (!(ADDR)) \ + (ATT_DCLASS) = DATUM_CLASS__NULL; \ + else \ + { \ + (ATT_DCLASS) = DATUM_CLASS__NORMAL; \ + (ATT_VALUES) = *((cl_ushort *)(ADDR)); \ + } \ + } while(0) + +#define EXTRACT_HEAP_READ_32BIT(ADDR,ATT_DCLASS,ATT_VALUES) \ + do { \ + if (!(ADDR)) \ + (ATT_DCLASS) = DATUM_CLASS__NULL; \ + else \ + { \ + (ATT_DCLASS) = DATUM_CLASS__NORMAL; \ + (ATT_VALUES) = *((cl_uint *)(ADDR)); \ + } \ + } while(0) + +#define EXTRACT_HEAP_READ_64BIT(ADDR,ATT_DCLASS,ATT_VALUES) \ + do { \ + if (!(ADDR)) \ + (ATT_DCLASS) = DATUM_CLASS__NULL; \ + else \ + { \ + (ATT_DCLASS) = DATUM_CLASS__NORMAL; \ + (ATT_VALUES) = *((cl_ulong *)(ADDR)); \ + } \ + } while(0) + +#define EXTRACT_HEAP_READ_POINTER(ADDR,ATT_DCLASS,ATT_VALUES) \ + do { \ + if (!(ADDR)) \ + (ATT_DCLASS) = DATUM_CLASS__NULL; \ + else \ + { \ + (ATT_DCLASS) = DATUM_CLASS__NORMAL; \ + (ATT_VALUES) = PointerGetDatum(ADDR); \ + } \ + } while(0) + +/* + * Similar macro to extract IndexTuple + */ +#define EXTRACT_INDEX_TUPLE_BEGIN(ADDR,KDS,itup) \ + do { \ + const kern_colmeta *__cmeta = (KDS)->colmeta; \ + cl_uint __ncols = (KDS)->ncols; \ + cl_uint __colidx = 0; \ + cl_uchar *__nullmap = NULL; \ + char *__pos; \ + \ + if (!(itup)) \ + __ncols = 0; \ + else if (((itup)->t_info & INDEX_NULL_MASK) == 0) \ + __pos = itup->data; \ + else \ + { \ + __nullmap = (cl_uchar *)(itup)->data; \ + __pos = (itup)->data + MAXALIGN(BITMAPLEN(__ncols)); \ + } \ + if (__colidx < __ncols && \ + (!__nullmap || !att_isnull(__colidx, __nullmap))) \ + { \ + (ADDR) = __pos; \ + __pos += (__cmeta->attlen > 0 ? \ + __cmeta->attlen : \ + VARSIZE_ANY(__pos)); \ + } \ + else \ + (ADDR) = NULL + +#define EXTRACT_INDEX_TUPLE_NEXT(ADDR,KDS) \ + __colidx++; \ + if (__colidx < __ncols && \ + (!__nullmap || !att_isnull(__colidx, __nullmap))) \ + { \ + __cmeta = &(KDS)->colmeta[__colidx]; \ + \ + if (__cmeta->attlen > 0) \ + __pos = (char *)TYPEALIGN(__cmeta->attalign, __pos); \ + else if (!VARATT_NOT_PAD_BYTE(__pos)) \ + __pos = (char *)TYPEALIGN(__cmeta->attalign, __pos); \ + (ADDR) = __pos; \ + __pos += (__cmeta->attlen > 0 ? 
\ + __cmeta->attlen : \ + VARSIZE_ANY(__pos)); \ + } \ + else \ + (ADDR) = NULL + +#define EXTRACT_INDEX_TUPLE_END() \ + } while(0) + +#ifdef __CUDACC__ +/* + * device functions to decompress a toast datum + */ +DEVICE_FUNCTION(size_t) +toast_raw_datum_size(kern_context *kcxt, varlena *attr); +DEVICE_FUNCTION(cl_int) +pglz_decompress(const char *source, cl_int slen, + char *dest, cl_int rawsize); +DEVICE_FUNCTION(cl_bool) +toast_decompress_datum(char *buffer, cl_uint buflen, + const varlena *datum); +/* + * device functions to reference a particular datum in a tuple + */ +DEVICE_FUNCTION(void *) +kern_get_datum_tuple(kern_colmeta *colmeta, + HeapTupleHeaderData *htup, + cl_uint colidx); +DEVICE_FUNCTION(void *) +kern_get_datum_column(kern_data_store *kds, + kern_data_extra *extra, + cl_uint colidx, cl_uint rowidx); +DEVICE_FUNCTION(cl_bool) +kern_check_visibility_column(kern_context *kcxt, + kern_data_store *kds, + cl_uint rowidx); +/* + * device functions to form/deform HeapTuple + */ +DEVICE_FUNCTION(cl_uint) +__compute_heaptuple_size(kern_context *kcxt, + kern_colmeta *__cmeta, + cl_bool heap_hasoid, + cl_uint ncols, + cl_char *tup_dclass, + Datum *tup_values); +DEVICE_FUNCTION(void) +deform_kern_heaptuple(cl_int nattrs, + kern_colmeta *tup_attrs, + HeapTupleHeaderData *htup, + cl_char *tup_dclass, + Datum *tup_values); +DEVICE_FUNCTION(cl_uint) +__form_kern_heaptuple(kern_context *kcxt, + void *buffer, /* out */ + cl_int ncols, /* in */ + kern_colmeta *colmeta, /* in */ + cl_uint comp_typeid, /* in */ + cl_int comp_typmod, /* in */ + ItemPointerData *tup_self,/* in */ + cl_char *tup_dclass, /* in */ + Datum *tup_values); /* in */ +/* + * support function for KDS_FORMAT_SLOT + */ +DEVICE_FUNCTION(cl_uint) +kds_slot_compute_extra(kern_context *kcxt, + kern_data_store *kds, + cl_char *tup_dclass, + Datum *tup_values); +DEVICE_FUNCTION(void) +kds_slot_store_values(kern_context *kcxt, + kern_data_store *kds_dst, + cl_uint dst_index, + char *dst_extra, + cl_char *tup_dclass, + Datum *tup_values); +/* + * Reduction Operations + */ +DEVICE_FUNCTION(cl_uint) +pgstromStairlikeSum(cl_uint my_value, cl_uint *total_sum); +DEVICE_FUNCTION(cl_uint) +pgstromStairlikeBinaryCount(int predicate, cl_uint *total_count); +#endif /* __CUDACC__ */ + +/* base type definitions and templates */ +#include "cuda_basetype.h" +/* numeric functions support (must be here) */ +#include "cuda_numeric.h" +/* text functions support (must be here) */ +#include "cuda_textlib.h" +/* time functions support (must be here) */ +#include "cuda_timelib.h" +/* static inline and c++ template functions */ +#include "cuda_utils.h" + +#endif /* CUDA_COMMON_H */ diff --git a/src/cuda_gcache.cu b/old/cuda_gcache.cu similarity index 100% rename from src/cuda_gcache.cu rename to old/cuda_gcache.cu diff --git a/src/cuda_gcache.h b/old/cuda_gcache.h similarity index 100% rename from src/cuda_gcache.h rename to old/cuda_gcache.h diff --git a/old/cuda_gpujoin.cu b/old/cuda_gpujoin.cu new file mode 100644 index 000000000..deeb40446 --- /dev/null +++ b/old/cuda_gpujoin.cu @@ -0,0 +1,1927 @@ +/* + * cuda_gpujoin.cu + * + * GPU accelerated parallel relations join based on hash-join or + * nested-loop logic. + * -- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. 
+ */ +#include "cuda_common.h" +#include "cuda_gpujoin.h" + +/* + * static shared variables + */ +static __shared__ cl_bool scan_done; +static __shared__ cl_int base_depth; +static __shared__ cl_uint src_read_pos; +static __shared__ cl_uint dst_base_index; +static __shared__ size_t dst_base_usage; +extern __shared__ cl_uint wip_count[0]; /* [GPUJOIN_MAX_DEPTH+1] items */ +extern __shared__ cl_uint read_pos[0]; /* [GPUJOIN_MAX_DEPTH+1] items */ +extern __shared__ cl_uint write_pos[0]; /* [GPUJOIN_MAX_DEPTH+1] items */ +extern __shared__ cl_uint temp_pos[0]; /* [GPUJOIN_MAX_DEPTH+1] items */ +extern __shared__ cl_uint gist_pos[0]; /* [(GPUJOIN_MAX_DEPTH+1)*32] items */ +static __shared__ cl_uint stat_source_nitems; +extern __shared__ cl_uint stat_nitems[0]; /* [GPUJOIN_MAX_DEPTH+1] items */ +extern __shared__ cl_uint stat_nitems2[0]; /* [GPUJOIN_MAX_DEPTH+1] items */ + +/* + * gpujoin_suspend_context + */ +STATIC_FUNCTION(void) +gpujoin_suspend_context(kern_gpujoin *kgjoin, + cl_int depth, cl_uint *l_state, cl_bool *matched) +{ + gpujoinSuspendContext *sb; + cl_int i, max_depth = kgjoin->num_rels; + + sb = KERN_GPUJOIN_SUSPEND_CONTEXT(kgjoin, get_group_id()); + if (get_local_id() == 0) + { + sb->depth = depth; + sb->scan_done = scan_done; + sb->src_read_pos = src_read_pos; + sb->stat_source_nitems = stat_source_nitems; + } + + for (i=get_local_id(); i <= max_depth; i+=get_local_size()) + { + sb->pd[i].wip_count = wip_count[i]; + sb->pd[i].read_pos = read_pos[i]; + sb->pd[i].write_pos = write_pos[i]; + sb->pd[i].temp_pos = temp_pos[i]; + memcpy(sb->pd[i].gist_pos, gist_pos + i * MAXWARPS_PER_BLOCK, + sizeof(cl_uint) * MAXWARPS_PER_BLOCK); + sb->pd[i].stat_nitems = stat_nitems[i]; + sb->pd[i].stat_nitems2 = stat_nitems2[i]; + } + + for (i=0; i <= max_depth; i++) + { + sb->pd[i].l_state[get_local_id()] = l_state[i]; + sb->pd[i].matched[get_local_id()] = matched[i]; + } + /* tells host-code GPU kernel needs to be resumed */ + if (get_local_id() == 0) + atomicAdd(&kgjoin->suspend_count, 1); + __syncthreads(); +} + +/* + * gpujoin_resume_context + */ +STATIC_FUNCTION(cl_int) +gpujoin_resume_context(kern_gpujoin *kgjoin, + cl_uint *l_state, cl_bool *matched) +{ + gpujoinSuspendContext *sb; + cl_int i, max_depth = kgjoin->num_rels; + + sb = KERN_GPUJOIN_SUSPEND_CONTEXT(kgjoin, get_group_id()); + if (get_local_id() == 0) + { + scan_done = sb->scan_done; + src_read_pos = sb->src_read_pos; + stat_source_nitems = sb->stat_source_nitems; + } + + for (i=get_local_id(); i <= max_depth; i+=get_local_size()) + { + wip_count[i] = sb->pd[i].wip_count; + read_pos[i] = sb->pd[i].read_pos; + write_pos[i] = sb->pd[i].write_pos; + temp_pos[i] = sb->pd[i].temp_pos; + memcpy(gist_pos + i * MAXWARPS_PER_BLOCK, sb->pd[i].gist_pos, + sizeof(cl_uint) * MAXWARPS_PER_BLOCK); + stat_nitems[i] = sb->pd[i].stat_nitems; + stat_nitems2[i] = sb->pd[i].stat_nitems2; + } + + for (i=0; i <= max_depth; i++) + { + l_state[i] = sb->pd[i].l_state[get_local_id()]; + matched[i] = sb->pd[i].matched[get_local_id()]; + } + return sb->depth; +} + +/* + * gpujoin_rewind_stack + */ +STATIC_INLINE(cl_int) +gpujoin_rewind_stack(kern_gpujoin *kgjoin, cl_int depth, + cl_uint *l_state, cl_bool *matched) +{ + cl_int max_depth = kgjoin->num_rels; + static __shared__ cl_int __depth; + + assert(depth >= base_depth && depth <= max_depth); + __syncthreads(); + if (get_local_id() == 0) + { + __depth = depth; + for (;;) + { + /* + * At the time of rewind, all the upper tuples (outer combinations + * from the standpoint of deeper depth) are already 
processed. + * So, we can safely rewind the read/write index of this depth. + */ + read_pos[__depth] = 0; + write_pos[__depth] = 0; + + /* + * If any of outer combinations are in progress to find out + * matching inner tuple, we have to resume the task, prior + * to the increment of read pointer. + */ + if (wip_count[__depth] > 0) + break; + if (__depth == base_depth || + read_pos[__depth-1] < write_pos[__depth-1]) + break; + __depth--; + } + } + __syncthreads(); + depth = __depth; + if (depth < max_depth) + { + memset(l_state + depth + 1, 0, + sizeof(cl_uint) * (max_depth - depth)); + memset(matched + depth + 1, 0, + sizeof(cl_bool) * (max_depth - depth)); + } + if (scan_done && depth == base_depth) + return -1; + return depth; +} + +/* + * gpujoin_load_source + */ +STATIC_FUNCTION(cl_int) +gpujoin_load_source(kern_context *kcxt, + kern_gpujoin *kgjoin, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + cl_uint *wr_stack, + cl_uint *l_state) +{ + cl_uint t_offset = UINT_MAX; + cl_bool visible = false; + cl_uint count; + cl_uint wr_index; + + /* extract a HeapTupleHeader */ + if (kds_src->format == KDS_FORMAT_ROW) + { + kern_tupitem *tupitem; + cl_uint row_index; + + /* fetch next window */ + if (get_local_id() == 0) + src_read_pos = atomicAdd(&kgjoin->src_read_pos, + get_local_size()); + __syncthreads(); + row_index = src_read_pos + get_local_id(); + + if (row_index < __ldg(&kds_src->nitems)) + { + tupitem = KERN_DATA_STORE_TUPITEM(kds_src, row_index); + t_offset = __kds_packed((char *)&tupitem->htup - + (char *)kds_src); + visible = gpujoin_quals_eval(kcxt, + kds_src, + &tupitem->htup.t_ctid, + &tupitem->htup); + } + assert(wip_count[0] == 0); + } + else if (kds_src->format == KDS_FORMAT_BLOCK) + { + cl_uint part_sz = KERN_DATA_STORE_PARTSZ(kds_src); + cl_uint n_parts = get_local_size() / part_sz; + cl_uint part_id; + cl_uint line_no; + cl_uint n_lines; + cl_uint loops = l_state[0]++; + + /* fetch next window, if needed */ + if (loops == 0 && get_local_id() == 0) + src_read_pos = atomicAdd(&kgjoin->src_read_pos, n_parts); + __syncthreads(); + part_id = src_read_pos + get_local_id() / part_sz; + line_no = get_local_id() % part_sz + loops * part_sz + 1; + + if (part_id < __ldg(&kds_src->nitems) && + get_local_id() < part_sz * n_parts) + { + PageHeaderData *pg_page; + BlockNumber block_nr; + ItemPointerData t_self; + HeapTupleHeaderData *htup; + + pg_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_src, part_id); + n_lines = PageGetMaxOffsetNumber(pg_page); + block_nr = KERN_DATA_STORE_BLOCK_BLCKNR(kds_src, part_id); + + if (line_no <= n_lines) + { + ItemIdData *lpp = PageGetItemId(pg_page, line_no); + if (ItemIdIsNormal(lpp)) + { + t_offset = (cl_uint)((char *)lpp - (char *)kds_src); + t_self.ip_blkid.bi_hi = block_nr >> 16; + t_self.ip_blkid.bi_lo = block_nr & 0xffff; + t_self.ip_posid = line_no; + + htup = PageGetItem(pg_page, lpp); + + visible = gpujoin_quals_eval(kcxt, + kds_src, + &t_self, + htup); + } + } + } + } + else if (kds_src->format == KDS_FORMAT_ARROW) + { + cl_uint row_index; + + /* fetch next window */ + if (get_local_id() == 0) + src_read_pos = atomicAdd(&kgjoin->src_read_pos, + get_local_size()); + __syncthreads(); + row_index = src_read_pos + get_local_id(); + + if (row_index < __ldg(&kds_src->nitems)) + { + t_offset = row_index + 1; + visible = gpujoin_quals_eval_arrow(kcxt, + kds_src, + row_index); + } + assert(wip_count[0] == 0); + } + else if (kds_src->format == KDS_FORMAT_COLUMN) + { + cl_uint row_index; + + /* fetch next window */ + if (get_local_id() == 0) + 
src_read_pos = atomicAdd(&kgjoin->src_read_pos, + get_local_size()); + __syncthreads(); + + row_index = src_read_pos + get_local_id(); + if (row_index < kds_src->nitems && + kern_check_visibility_column(kcxt, kds_src, row_index)) + { + t_offset = row_index + 1; + visible = gpujoin_quals_eval_column(kcxt, + kds_src, + kds_extra, + row_index); + } + assert(wip_count[0] == 0); + } + else + { + STROM_ELOG(kcxt, "unsupported KDS format"); + } + /* error checks */ + if (__syncthreads_count(kcxt->errcode) > 0) + return -1; + /* statistics */ + count = __syncthreads_count(t_offset != UINT_MAX); + if (get_local_id() == 0) + { + if (__ldg(&kds_src->format) == KDS_FORMAT_BLOCK) + wip_count[0] = count; + stat_source_nitems += count; + } + + /* store the source tuple if visible */ + wr_index = pgstromStairlikeBinaryCount(visible, &count); + if (count > 0) + { + wr_index += write_pos[0]; + __syncthreads(); + if (get_local_id() == 0) + { + write_pos[0] += count; + stat_nitems[0] += count; + } + if (visible) + wr_stack[wr_index] = t_offset; + __syncthreads(); + + /* + * An iteration can fetch up to get_local_size() tuples + * at once, thus, we try to dive into deeper depth prior + * to the next outer tuples. + */ + if (write_pos[0] + get_local_size() > GPUJOIN_PSEUDO_STACK_NROOMS) + return 1; + __syncthreads(); + } + else + { + /* no tuples we could fetch */ + assert(write_pos[0] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS); + l_state[0] = 0; + __syncthreads(); + } + + /* End of the outer relation? */ + if (src_read_pos >= kds_src->nitems) + { + /* don't rewind the stack any more */ + if (get_local_id() == 0) + scan_done = true; + __syncthreads(); + + /* + * We may have to dive into the deeper depth if we still have + * pending join combinations. + */ + if (write_pos[0] == 0) + { + cl_int max_depth = kgjoin->num_rels; + + for (cl_int depth=1; depth <= max_depth; depth++) + { + if (temp_pos[depth] > 0) + return depth; + if (read_pos[depth] < write_pos[depth]) + return depth+1; + } + return -1; + } + return 1; + } + return 0; +} + +/* + * gpujoin_load_outer + */ +STATIC_FUNCTION(cl_int) +gpujoin_load_outer(kern_context *kcxt, + kern_gpujoin *kgjoin, + kern_multirels *kmrels, + cl_int outer_depth, + cl_uint *wr_stack, + cl_uint *l_state) +{ + kern_data_store *kds_in = KERN_MULTIRELS_INNER_KDS(kmrels, outer_depth); + cl_bool *ojmap = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, outer_depth); + HeapTupleHeaderData *htup = NULL; + kern_tupitem *tupitem; + cl_uint t_offset; + cl_uint row_index; + cl_uint wr_index; + cl_uint count; + + assert(ojmap != NULL); + + if (get_local_id() == 0) + src_read_pos = atomicAdd(&kgjoin->src_read_pos, + get_local_size()); + __syncthreads(); + row_index = src_read_pos + get_local_id(); + + /* pickup inner rows, if unreferenced */ + if (row_index < kds_in->nitems && !ojmap[row_index]) + { + tupitem = KERN_DATA_STORE_TUPITEM(kds_in, row_index); + t_offset = __kds_packed((char *)&tupitem->htup - + (char *)kds_in); + htup = &tupitem->htup; + } + wr_index = write_pos[outer_depth]; + wr_index += pgstromStairlikeBinaryCount(htup != NULL, &count); + __syncthreads(); + if (count > 0) + { + if (get_local_id() == 0) + { + write_pos[outer_depth] += count; + stat_nitems[outer_depth] += count; + } + if (htup) + { + wr_stack += wr_index * (outer_depth + 1); + memset(wr_stack, 0, sizeof(cl_uint) * outer_depth); + wr_stack[outer_depth] = t_offset; + } + __syncthreads(); + } + + /* end of the inner relation? 
*/ + if (src_read_pos >= kds_in->nitems) + { + /* don't rewind the stack any more */ + if (get_local_id() == 0) + scan_done = true; + __syncthreads(); + + /* + * We may have to dive into the deeper depth if we still have + * pending join combinations. + */ + if (write_pos[outer_depth] == 0) + { + cl_int max_depth = kgjoin->num_rels; + + for (cl_int depth=outer_depth + 1; depth <= max_depth; depth++) + { + if (read_pos[depth] < write_pos[depth]) + return depth+1; + } + return -1; + } + return outer_depth+1; + } + return outer_depth; +} + +/* + * gpujoin_projection_row + */ +STATIC_FUNCTION(cl_int) +gpujoin_projection_row(kern_context *kcxt, + kern_gpujoin *kgjoin, + kern_multirels *kmrels, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + kern_data_store *kds_dst, + cl_uint *rd_stack, + cl_uint *l_state, + cl_bool *matched) +{ + cl_uint nrels = kgjoin->num_rels; + cl_uint read_index; + cl_uint dest_index; + size_t dest_offset; + cl_uint count; + cl_uint nvalids; + cl_uint required; + cl_char *tup_dclass; + Datum *tup_values; + cl_int needs_suspend = 0; + + /* sanity checks */ + assert(rd_stack != NULL); + + /* Any more result rows to be written? */ + if (read_pos[nrels] >= write_pos[nrels]) + return gpujoin_rewind_stack(kgjoin, nrels, l_state, matched); + + /* Allocation of tup_dclass/values */ + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); + if (!tup_dclass || !tup_values) + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); + if (__syncthreads_count(kcxt->errcode) > 0) + return -1; /* bailout GpuJoin */ + + /* pick up combinations from the pseudo-stack */ + nvalids = Min(write_pos[nrels] - read_pos[nrels], + get_local_size()); + read_index = read_pos[nrels] + get_local_id(); + __syncthreads(); + + /* step.1 - compute length of the result tuple to be written */ + if (read_index < write_pos[nrels]) + { + rd_stack += read_index * (nrels + 1); + + gpujoin_projection(kcxt, + kds_src, + kds_extra, + kmrels, + rd_stack, + kds_dst, + tup_dclass, + tup_values, + NULL); + required = MAXALIGN(offsetof(kern_tupitem, htup) + + compute_heaptuple_size(kcxt, + kds_dst, + tup_dclass, + tup_values)); + } + else + required = 0; + + if (__syncthreads_count(kcxt->errcode) > 0) + return -1; /* bailout */ + + /* step.2 - increments nitems/usage of the kds_dst */ + dest_offset = pgstromStairlikeSum(required, &count); + assert(count > 0); + if (get_local_id() == 0) + { + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + needs_suspend = 0; + curval.i.nitems = kds_dst->nitems; + curval.i.usage = kds_dst->usage; + do { + newval = oldval = curval; + newval.i.nitems += nvalids; + newval.i.usage += __kds_packed(count); + + if (KERN_DATA_STORE_HEAD_LENGTH(kds_dst) + + STROMALIGN(sizeof(cl_uint) * newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_dst->length) + { + needs_suspend = 1; + break; + } + } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, + oldval.v64, + newval.v64)) != oldval.v64); + dst_base_index = oldval.i.nitems; + dst_base_usage = __kds_unpack(oldval.i.usage); + } + if (__syncthreads_count(needs_suspend) > 0) + { + /* No space left on the kds_dst, suspend the GPU kernel and bailout */ + gpujoin_suspend_context(kgjoin, nrels+1, l_state, matched); + return -2; /* <-- not to update statistics */ + } + dest_index = dst_base_index + get_local_id(); + dest_offset += dst_base_usage + 
required; + + /* step.3 - write out HeapTuple on the destination buffer */ + if (required > 0) + { + cl_uint *row_index = KERN_DATA_STORE_ROWINDEX(kds_dst); + kern_tupitem *tupitem = (kern_tupitem *) + ((char *)kds_dst + kds_dst->length - dest_offset); + form_kern_heaptuple(kcxt, + tupitem, + kds_dst, + NULL, /* ItemPointerData */ + tup_dclass, + tup_values); + tupitem->rowid = dest_index; + row_index[dest_index] = __kds_packed(kds_dst->length - dest_offset); + } + if (__syncthreads_count(kcxt->errcode) > 0) + return -1; /* bailout */ + + /* step.4 - make advance the read position */ + if (get_local_id() == 0) + read_pos[nrels] += nvalids; + return nrels + 1; +} + +/* to be defined by gpupreagg.c */ +DEVICE_FUNCTION(void) +gpupreagg_projection_slot(kern_context *kcxt_gpreagg, + cl_char *src_dclass, + Datum *src_values, + cl_char *dst_dclass, + Datum *dst_values); + +/* + * gpujoin_projection_slot + */ +STATIC_FUNCTION(cl_int) +gpujoin_projection_slot(kern_context *kcxt, + kern_parambuf *kparams_gpreagg, + kern_gpujoin *kgjoin, + kern_multirels *kmrels, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + kern_data_store *kds_dst, + cl_uint *rd_stack, + cl_uint *l_state, + cl_bool *matched) +{ + kern_parambuf *kparams_saved = kcxt->kparams; + cl_uint nrels = kgjoin->num_rels; + cl_uint read_index; + cl_uint dest_index; + size_t dest_offset; + cl_uint count; + cl_uint nvalids; + cl_bool tup_is_valid = false; + cl_char *tup_dclass = NULL; + Datum *tup_values = NULL; + cl_uint *tup_extras = NULL; + cl_uint extra_sz = 0; + cl_int needs_suspend = 0; + + /* sanity checks */ + assert(rd_stack != NULL); + + /* Any more result rows to be written? */ + if (read_pos[nrels] >= write_pos[nrels]) + return gpujoin_rewind_stack(kgjoin, nrels, l_state, matched); + + /* Allocation of tup_dclass/values/extra */ + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); + tup_extras = (cl_uint *) + kern_context_alloc(kcxt, sizeof(cl_uint) * kds_dst->ncols); + if (!tup_dclass || !tup_values || !tup_extras) + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); + if (__syncthreads_count(kcxt->errcode) > 0) + return -1; /* bailout GpuJoin */ + + /* pick up combinations from the pseudo-stack */ + nvalids = Min(write_pos[nrels] - read_pos[nrels], + get_local_size()); + read_index = read_pos[nrels] + get_local_id(); + __syncthreads(); + + /* step.1 - projection by GpuJoin */ + if (read_index < write_pos[nrels]) + { + rd_stack += read_index * (nrels + 1); + + extra_sz = gpujoin_projection(kcxt, + kds_src, + kds_extra, + kmrels, + rd_stack, + kds_dst, + tup_dclass, + tup_values, + tup_extras); + tup_is_valid = true; + } + + /* step.2 - increments nitems/usage of the kds_dst */ + dest_offset = pgstromStairlikeSum(extra_sz, &count); + if (get_local_id() == 0) + { + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + needs_suspend = 0; + curval.i.nitems = kds_dst->nitems; + curval.i.usage = kds_dst->usage; + do { + newval = oldval = curval; + newval.i.nitems += nvalids; + newval.i.usage += __kds_packed(count); + + if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_dst->length) + { + needs_suspend = 1; + break; + } + } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, + oldval.v64, + newval.v64)) != oldval.v64); + dst_base_index = oldval.i.nitems; + dst_base_usage = 
__kds_unpack(oldval.i.usage); + } + if (__syncthreads_count(needs_suspend) > 0) + { + /* No space left on the kds_dst, suspend the GPU kernel and bailout */ + gpujoin_suspend_context(kgjoin, nrels+1, l_state, matched); + return -2; /* <-- not to update statistics */ + } + dest_index = dst_base_index + get_local_id(); + dest_offset += dst_base_usage + extra_sz; + + /* step.3 - projection by GpuPreAgg on the destination buffer */ + if (tup_is_valid) + { + cl_char *dst_dclass = KERN_DATA_STORE_DCLASS(kds_dst, dest_index); + Datum *dst_values = KERN_DATA_STORE_VALUES(kds_dst, dest_index); + + /* + * Fixup pointers, if it points out of kds_src/kmrels because these + * variables must be visible to the next GpuPreAgg kernel. + */ + if (extra_sz > 0) + { + char *dpos = (char *)kds_dst + kds_dst->length - dest_offset; + char *addr; + cl_int extra_sum = 0; + cl_int len; + + for (int j=0; j < kds_dst->ncols; j++) + { + len = tup_extras[j]; + if (len == 0) + continue; + addr = DatumGetPointer(tup_values[j]); + memcpy(dpos, addr, len); + tup_values[j] = PointerGetDatum(dpos); + dpos += MAXALIGN(len); + extra_sum += MAXALIGN(len); + } + assert(extra_sz == extra_sum); + } + /* + * Initial projection by GpuPreAgg + * + * This code block is generated by gpupreagg.c; that may reference + * const/parameters of GpuPreAgg, not GpuJoin. So, we temporarily + * switch kparams of the current context. + */ + kcxt->kparams = kparams_gpreagg; + gpupreagg_projection_slot(kcxt, + tup_dclass, + tup_values, + dst_dclass, + dst_values); + kcxt->kparams = kparams_saved; + } + if (__syncthreads_count(kcxt->errcode) > 0) + return -1; /* bailout */ + + /* step.4 - make advance the read position */ + if (get_local_id() == 0) + read_pos[nrels] += nvalids; //get_local_size(); + return nrels + 1; +} + +/* + * gpujoin_exec_nestloop + */ +STATIC_FUNCTION(cl_int) +gpujoin_exec_nestloop(kern_context *kcxt, + kern_gpujoin *kgjoin, + kern_multirels *kmrels, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + cl_int depth, + cl_uint *rd_stack, + cl_uint *wr_stack, + cl_uint *l_state, + cl_bool *matched) +{ + kern_data_store *kds_in = KERN_MULTIRELS_INNER_KDS(kmrels, depth); + cl_bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth); + kern_tupitem *tupitem = NULL; + cl_int max_depth = kgjoin->num_rels; + cl_uint x_unitsz; + cl_uint y_unitsz; + cl_uint x_index; /* outer index */ + cl_uint y_index; /* inner index */ + cl_uint wr_index; + cl_uint count; + cl_bool result = false; + __shared__ cl_bool matched_sync[MAXTHREADS_PER_BLOCK]; + + assert(kds_in->format == KDS_FORMAT_ROW); + assert(depth >= 1 && depth <= max_depth); + if (read_pos[depth-1] >= write_pos[depth-1]) + { + /* + * When this depth has enough room (even if all the threads generate + * join combinations on the next try), upper depth may be able to + * generate more outer tuples; which shall be used to input for the + * next depth. + * It is mostly valuable to run many combinations on the next depth. 
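+		 * (Editorial example: with 256 threads per block, this depth is
+		 * re-executed only while at least 256 free slots remain on the
+		 * pseudo-stack, because one iteration can emit up to one new
+		 * combination per thread.)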
+ */ + assert(wip_count[depth] == 0); + if (write_pos[depth] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) + { + cl_int __depth = gpujoin_rewind_stack(kgjoin, depth-1, + l_state, matched); + if (__depth >= base_depth) + return __depth; + } + /* elsewhere, dive into the deeper depth or projection */ + return depth + 1; + } + __syncthreads(); + x_unitsz = Min(write_pos[depth-1], get_local_size()); + y_unitsz = get_local_size() / x_unitsz; + + x_index = get_local_id() % x_unitsz; + y_index = get_local_id() / x_unitsz; + + if (y_unitsz * l_state[depth] >= kds_in->nitems) + { + /* + * In case of LEFT OUTER JOIN, we need to check whether the outer + * combination had any matched inner tuples, or not. + */ + if (KERN_MULTIRELS_LEFT_OUTER_JOIN(kmrels, depth)) + { + if (get_local_id() < x_unitsz) + matched_sync[get_local_id()] = false; + __syncthreads(); + if (matched[depth]) + matched_sync[x_index] = true; + if (__syncthreads_count(!matched_sync[x_index]) > 0) + { + if (y_index == 0 && y_index < y_unitsz) + result = !matched_sync[x_index]; + else + result = false; + /* adjust x_index and rd_stack as usual */ + x_index += read_pos[depth-1]; + assert(x_index < write_pos[depth-1]); + rd_stack += (x_index * depth); + /* don't generate LEFT OUTER tuple any more */ + matched[depth] = true; + goto left_outer; + } + } + l_state[depth] = 0; + matched[depth] = false; + if (get_local_id() == 0) + { + wip_count[depth] = 0; + read_pos[depth-1] += x_unitsz; + } + return depth; + } + x_index += read_pos[depth-1]; + rd_stack += (x_index * depth); + if (x_index < write_pos[depth-1] && y_index < y_unitsz) + { + y_index += y_unitsz * l_state[depth]; + if (y_index < kds_in->nitems) + { + tupitem = KERN_DATA_STORE_TUPITEM(kds_in, y_index); + + result = gpujoin_join_quals(kcxt, + kds_src, + kds_extra, + kmrels, + depth, + rd_stack, + &tupitem->htup, + NULL); + if (result) + { + matched[depth] = true; + if (oj_map && !oj_map[y_index]) + oj_map[y_index] = true; + } + } + } + l_state[depth]++; + +left_outer: + wr_index = write_pos[depth]; + wr_index += pgstromStairlikeBinaryCount(result, &count); + if (get_local_id() == 0) + { + wip_count[depth] = get_local_size(); + write_pos[depth] += count; + stat_nitems[depth] += count; + } + wr_stack += wr_index * (depth + 1); + if (result) + { + memcpy(wr_stack, rd_stack, sizeof(cl_uint) * depth); + wr_stack[depth] = (!tupitem ? 0 : __kds_packed((char *)&tupitem->htup - + (char *)kds_in)); + } + __syncthreads(); + /* + * If we have enough room to store the combinations more, execute this + * depth one more. Elsewhere, dive into a deeper level to flush results. 
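+	 * (Editorial example of the pairing used above: with 256 threads and
+	 * 40 pending outer tuples, x_unitsz = 40 and y_unitsz = 6, so one
+	 * iteration evaluates 40 outer x 6 inner pairs, and l_state[depth]
+	 * advances the inner window by y_unitsz rows at a time.)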
+ */ + if (write_pos[depth] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) + return depth; + return depth + 1; +} + +/* + * gpujoin_exec_hashjoin + */ +STATIC_FUNCTION(cl_int) +gpujoin_exec_hashjoin(kern_context *kcxt, + kern_gpujoin *kgjoin, + kern_multirels *kmrels, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + cl_int depth, + cl_uint *rd_stack, + cl_uint *wr_stack, + cl_uint *l_state, + cl_bool *matched) +{ + kern_data_store *kds_hash = KERN_MULTIRELS_INNER_KDS(kmrels, depth); + cl_bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth); + kern_hashitem *khitem = NULL; + cl_int max_depth = kgjoin->num_rels; + cl_uint t_offset = UINT_MAX; + cl_uint hash_value; + cl_uint rd_index; + cl_uint wr_index; + cl_uint count; + cl_bool result; + + assert(kds_hash->format == KDS_FORMAT_HASH); + assert(depth >= 1 && depth <= max_depth); + + if (__syncthreads_count(l_state[depth] != UINT_MAX) == 0) + { + /* + * OK, all the threads reached to the end of hash-slot chain + * Move to the next outer window. + */ + if (get_local_id() == 0) + read_pos[depth-1] += get_local_size(); + l_state[depth] = 0; + matched[depth] = false; + return depth; + } + else if (read_pos[depth-1] >= write_pos[depth-1]) + { + /* + * When this depth has enough room (even if all the threads generate + * join combinations on the next try), upper depth may be able to + * generate more outer tuples; which shall be used to input for the + * next depth. + * It is mostly valuable to run many combinations on the next depth. + */ + assert(wip_count[depth] == 0); + if (write_pos[depth] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) + { + cl_int __depth = gpujoin_rewind_stack(kgjoin, depth-1, + l_state, matched); + if (__depth >= base_depth) + return __depth; + } + /* elsewhere, dive into the deeper depth or projection */ + return depth + 1; + } + rd_index = read_pos[depth-1] + get_local_id(); + rd_stack += (rd_index * depth); + + if (l_state[depth] == 0) + { + /* first touch to the hash-slot */ + if (rd_index < write_pos[depth-1]) + { + cl_bool is_null_keys; + + hash_value = gpujoin_hash_value(kcxt, + kds_src, + kds_extra, + kmrels, + depth, + rd_stack, + &is_null_keys); + /* MEMO: NULL-keys will never match to inner-join */ + if (!is_null_keys) + khitem = KERN_HASH_FIRST_ITEM(kds_hash, hash_value); + /* rewind the varlena buffer */ + kcxt->vlpos = kcxt->vlbuf; + } + else + { + /* + * MEMO: We must ensure the threads without outer tuple don't + * generate any LEFT OUTER results. 
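+			 * (Editorial note: l_state[depth] == UINT_MAX acts as the
+			 * "no outer tuple" marker below; such threads skip both the
+			 * hash-slot walk and the LEFT OUTER fallback.)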
+			 */
+			l_state[depth] = UINT_MAX;
+		}
+	}
+	else if (l_state[depth] != UINT_MAX)
+	{
+		/* walks on the hash-slot chain */
+		khitem = (kern_hashitem *)((char *)kds_hash
+								   + __kds_unpack(l_state[depth])
+								   - offsetof(kern_hashitem, t.htup));
+		hash_value = khitem->hash;
+
+		/* pick up next one if any */
+		khitem = KERN_HASH_NEXT_ITEM(kds_hash, khitem);
+	}
+
+	while (khitem && khitem->hash != hash_value)
+		khitem = KERN_HASH_NEXT_ITEM(kds_hash, khitem);
+
+	if (khitem)
+	{
+		cl_bool		joinquals_matched;
+
+		assert(khitem->hash == hash_value);
+
+		result = gpujoin_join_quals(kcxt,
+									kds_src,
+									kds_extra,
+									kmrels,
+									depth,
+									rd_stack,
+									&khitem->t.htup,
+									&joinquals_matched);
+		assert(result == joinquals_matched);
+		if (joinquals_matched)
+		{
+			/* No LEFT/FULL JOIN are needed */
+			matched[depth] = true;
+			/* No RIGHT/FULL JOIN are needed */
+			assert(khitem->t.rowid < kds_hash->nitems);
+			if (oj_map && !oj_map[khitem->t.rowid])
+				oj_map[khitem->t.rowid] = true;
+		}
+		t_offset = __kds_packed((char *)&khitem->t.htup -
+								(char *)kds_hash);
+	}
+	else if (KERN_MULTIRELS_LEFT_OUTER_JOIN(kmrels, depth) &&
+			 l_state[depth] != UINT_MAX &&
+			 !matched[depth])
+	{
+		/* No matched outer rows, but LEFT/FULL OUTER */
+		result = true;
+	}
+	else
+		result = false;
+
+	/* save the current hash item */
+	l_state[depth] = t_offset;
+	wr_index = write_pos[depth];
+	wr_index += pgstromStairlikeBinaryCount(result, &count);
+	if (get_local_id() == 0)
+	{
+		write_pos[depth] += count;
+		stat_nitems[depth] += count;
+	}
+	wr_stack += wr_index * (depth + 1);
+	if (result)
+	{
+		memcpy(wr_stack, rd_stack, sizeof(cl_uint) * depth);
+		wr_stack[depth] = (!khitem ? 0U : t_offset);
+	}
+	/* count number of threads still in-progress */
+	count = __syncthreads_count(khitem != NULL);
+	if (get_local_id() == 0)
+		wip_count[depth] = count;
+	/*
+	 * (2019/05/25) We saw a strange behavior on Tesla T4 (CUDA 10.1 with
+	 * driver 418.67), which was never seen on Pascal/Volta devices.
+	 * Even though "write_pos[depth]" is updated by the leader thread above,
+	 * and __syncthreads_count() should synchronize all the local threads,
+	 * some of the threads read a different value from this variable.
+	 * We suspect the compiler may have an optimization problem here;
+	 * therefore, the code below avoids referencing "write_pos[depth]"
+	 * directly. It loads this value into a local variable once, then
+	 * injects a barrier synchronization explicitly.
+	 *
+	 * We should check whether a future version of CUDA fixes the problem.
+	 */
+	wr_index = write_pos[depth];
+	__syncthreads();
+	if (wr_index + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS)
+		return depth;
+	return depth+1;
+}
+
+//#include "cuda_postgis.h"
+
+/*
+ * gpujoin_prep_gistindex
+ *
+ * MEMO: We must load the entire GiST-index, but some of the leaf items
+ * point to invalid entries because some of the inner rows may already
+ * have been filtered out. So, this kernel function preliminarily
+ * invalidates those items at inner preload timing.
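+ * (Editorial note: as the code below shows, surviving leaf items get
+ * t_tid rewritten to the packed offset of the matched kern_hashitem,
+ * with ip_posid set to USHRT_MAX as a marker, while unmatched items
+ * are marked LP_DEAD.)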
+ */ +KERNEL_FUNCTION(void) +gpujoin_prep_gistindex(kern_multirels *kmrels, int depth) +{ + kern_data_store *kds_hash = KERN_MULTIRELS_INNER_KDS(kmrels, depth); + kern_data_store *kds_gist = KERN_MULTIRELS_GIST_INDEX(kmrels, depth); + BlockNumber block_nr; + OffsetNumber i, maxoff; + + assert(kds_hash->format == KDS_FORMAT_HASH && + kds_gist->format == KDS_FORMAT_BLOCK); + assert(depth >= 1 && depth <= kmrels->nrels); + + for (block_nr = get_group_id(); + block_nr < kds_gist->nrooms; + block_nr += get_num_groups()) + { + PageHeaderData *gist_page; + ItemIdData *lpp; + IndexTupleData *itup; + kern_hashitem *khitem; + cl_uint hash, t_off; + + gist_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, block_nr); + if (!GistPageIsLeaf(gist_page)) + continue; + maxoff = PageGetMaxOffsetNumber(gist_page); + for (i = get_local_id(); i < maxoff; i += get_local_size()) + { + lpp = PageGetItemId(gist_page, i+1); + if (ItemIdIsDead(lpp)) + continue; + itup = (IndexTupleData *)PageGetItem(gist_page, lpp); + + /* lookup kds_hash */ + hash = pg_hash_any((cl_uchar *)&itup->t_tid, + sizeof(ItemPointerData)); + for (khitem = KERN_HASH_FIRST_ITEM(kds_hash, hash); + khitem != NULL; + khitem = KERN_HASH_NEXT_ITEM(kds_hash, khitem)) + { + if (ItemPointerEquals(&khitem->t.htup.t_ctid, &itup->t_tid)) + { + t_off = __kds_packed((char *)&khitem->t.htup - + (char *)kds_hash); + itup->t_tid.ip_blkid.bi_hi = (t_off >> 16); + itup->t_tid.ip_blkid.bi_lo = (t_off & 0x0000ffffU); + itup->t_tid.ip_posid = USHRT_MAX; + break; + } + } + /* invalidate this leaf item, if not exist on kds_hash */ + if (!khitem) + lpp->lp_flags = LP_DEAD; + } + } +} + +/* + * gpujoin_gist_getnext + */ +STATIC_INLINE(ItemPointerData *) +gpujoin_gist_getnext(kern_context *kcxt, + kern_gpujoin *kgjoin, + cl_int depth, + kern_data_store *kds_gist, + void *gist_keys, + cl_uint *p_item_offset) +{ + PageHeaderData *gist_base = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, 0); + PageHeaderData *gist_page; + cl_char *vlpos_saved = kcxt->vlpos; + OffsetNumber start; + OffsetNumber index; + OffsetNumber maxoff; + ItemIdData *lpp = NULL; + IndexTupleData *itup = NULL; + cl_bool rv = false; + + assert(kds_gist->format == KDS_FORMAT_BLOCK); + + /* + * Setup starting point of GiST-index lookup + */ + if (*p_item_offset == UINT_MAX) + { + /* this warp already reached to the end */ + return NULL; + } + else if (*p_item_offset == 0) + { + /* walk on GiST index from the root page */ + start = FirstOffsetNumber + LaneId(); + gist_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, GIST_ROOT_BLKNO); + assert(gist_page->pd_parent_blkno == InvalidBlockNumber && + gist_page->pd_parent_item == InvalidOffsetNumber); + } + else + { + /* walk on GiST index from the next item */ + PageHeaderData *gist_base = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, 0); + size_t off; + + assert(*p_item_offset < kds_gist->length); + lpp = (ItemIdData *)((char *)kds_gist + *p_item_offset); + off = (((char *)lpp - (char *)gist_base) & (BLCKSZ - 1)); + gist_page = (PageHeaderData *)((char *)lpp - off); + start = (lpp - gist_page->pd_linp) + 1 + warpSize; + } +restart: + assert((((char *)gist_page - (char *)gist_base) & (BLCKSZ - 1)) == 0); + + if (GistPageIsDeleted(gist_page)) + maxoff = InvalidOffsetNumber; /* skip any entries */ + else + maxoff = PageGetMaxOffsetNumber(gist_page); + + rv = false; + for (index=start; index <= maxoff; index += warpSize) + { + lpp = PageGetItemId(gist_page, index); + if (ItemIdIsDead(lpp)) + continue; + itup = (IndexTupleData *) PageGetItem(gist_page, lpp); + + kcxt->vlpos = 
vlpos_saved; /* rewind */ + rv = gpujoin_gist_index_quals(kcxt, depth, + kds_gist, gist_page, + itup, gist_keys); + if (rv) + break; + } + kcxt->vlpos = vlpos_saved; /* rewind */ + + assert(__activemask() == ~0U); + if (__any_sync(__activemask(), rv)) + { + /* By here, one or more threads meet the matched entry */ + if (!GistPageIsLeaf(gist_page)) + { + /* dive into deeper tree node */ + BlockNumber blkno_curr; + BlockNumber blkno_next; + PageHeaderData *gist_next; + OffsetNumber least_index = (rv ? index : UINT_MAX); + OffsetNumber buddy_index; + + for (int mask=1; mask <= 16; mask *= 2) + { + buddy_index = __shfl_xor_sync(__activemask(), least_index, mask); + least_index = Min(least_index, buddy_index); + } + __syncwarp(~0U); + assert(least_index <= maxoff); + + lpp = PageGetItemId(gist_page, least_index); + itup = (IndexTupleData *) PageGetItem(gist_page, lpp); + blkno_curr = ((char *)gist_page - (char *)gist_base) / BLCKSZ; + blkno_next = ((BlockNumber)itup->t_tid.ip_blkid.bi_hi << 16 | + (BlockNumber)itup->t_tid.ip_blkid.bi_lo); + assert(blkno_next < kds_gist->nrooms); + gist_next = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, blkno_next); + assert(gist_next->pd_parent_blkno == blkno_curr && + gist_next->pd_parent_item == least_index); + gist_page = gist_next; + start = FirstOffsetNumber + LaneId(); + goto restart; + } + + /* this is matched */ + if (rv) + { + assert((char *)lpp >= (char *)gist_page && + (char *)lpp < (char *)gist_page + BLCKSZ); + *p_item_offset = (cl_uint)((char *)lpp - (char *)kds_gist); + + return &itup->t_tid; + } + + /* + * this is not matched - ensure the next call skips the main loop + * above, we set next offset of the 'maxoff' onto the p_item_offset. + */ + lpp = PageGetItemId(gist_page, maxoff+1); + *p_item_offset = (cl_uint)((char *)lpp - (char *)kds_gist); + + return NULL; + } + + /* + * By here, nobody meet any entries in this page + */ + if (gist_page != gist_base) + { + /* pop up to the parent */ + BlockNumber blkno_next = gist_page->pd_parent_blkno; + + assert(blkno_next < kds_gist->nrooms); + start = gist_page->pd_parent_item + 1 + LaneId(); + gist_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, blkno_next); + goto restart; + } + /* cannot pop up from the root page */ + assert(gist_page->pd_parent_blkno == InvalidBlockNumber && + gist_page->pd_parent_item == InvalidOffsetNumber); + *p_item_offset = UINT_MAX; + + return NULL; +} + +/* + * gpujoin_exec_gistindex + */ +STATIC_FUNCTION(cl_int) +gpujoin_exec_gistindex(kern_context *kcxt, + kern_gpujoin *kgjoin, + kern_multirels *kmrels, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + cl_int depth, + cl_uint *__rd_stack_base, + cl_uint *__wr_stack_base, + cl_uint *l_state, + cl_bool *matched) +{ + kern_data_store *kds_hash = KERN_MULTIRELS_INNER_KDS(kmrels, depth); + kern_data_store *kds_gist = KERN_MULTIRELS_GIST_INDEX(kmrels, depth); + cl_bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth); + cl_uint *wr_stack; + cl_uint *temp_stack; + cl_uint rd_index; + cl_uint wr_index; + cl_uint temp_index; + cl_uint count; + void *gist_keys; + cl_char *vlpos_saved_1 = kcxt->vlpos; + + assert(kds_hash->format == KDS_FORMAT_HASH); + assert(depth >= 1 && depth <= kgjoin->num_rels); + + if (__syncthreads_count(l_state[depth] != UINT_MAX && + l_state[depth] != 0) == 0 && + read_pos[depth-1] >= write_pos[depth-1]) + { + if (write_pos[depth] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) + { + cl_int __depth = gpujoin_rewind_stack(kgjoin, depth-1, + l_state, matched); + if (__depth >= base_depth) + return 
__depth; + } + /* flush if temporary index search results still remain */ + if (scan_done && temp_pos[depth] > 0) + goto bailout; + /* elsewhere, dive into the deeper depth or projection */ + return depth + 1; + } + __syncthreads(); + +reload: + kcxt->vlpos = vlpos_saved_1; /* rewind */ + assert(__activemask() == ~0U); + if (__all_sync(__activemask(), l_state[depth] == UINT_MAX) || + __all_sync(__activemask(), l_state[depth] == 0)) + { + /* + * all the threads in warp reached in the tail of GiST-index tree, so move to + * the next index key. + */ + if (LaneId() == 0) + { + rd_index = atomicAdd(&read_pos[depth-1], 1); + gist_pos[depth * MAXWARPS_PER_BLOCK + get_local_id() / warpSize] = rd_index; + } + __syncwarp(~0U); + rd_index = __shfl_sync(__activemask(), rd_index, 0); + l_state[depth] = 0; + } + else + { + /* resume the index-key */ + rd_index = gist_pos[depth * MAXWARPS_PER_BLOCK + get_local_id() / warpSize]; + } + /* threads in a warp must load exactly same index-key */ + assert(rd_index == __shfl_sync(__activemask(), rd_index, 0)); + + if (rd_index < write_pos[depth-1]) + { + cl_uint *rd_stack = __rd_stack_base + (rd_index * depth); + cl_char *vlpos_saved_2; + + gist_keys = gpujoin_gist_load_keys(kcxt, + kmrels, + kds_src, + kds_extra, + depth, + rd_stack); + assert(__activemask() == ~0U); + if (__any_sync(__activemask(), kcxt->errcode != 0)) + goto bailout; /* error */ + assert(gist_keys != NULL); + + /* + * MEMO: Cost to run gpujoin_gist_getnext highly depends on the key value. + * If key never matches any bounding-box, gpujoin_gist_getnext() returns + * immediately. If key matches some entries, thus walks down into the leaf + * of R-tree, it takes longer time than the above misshit cases. + * In case when individual warps have various execution time, in general, + * we should not put __syncthreads() because the warps that returned + * immediately from the gpujoin_gist_getnext() are blocked until completion + * of someone's R-tree index search. + * So, we don't put any __syncthreads() in the loop below. If a warp finished + * gpujoin_gist_getnext() very early, it can reload another index-key for + * the next search during the GiST-index search by the other warps/threads. + * If usage of temp_stack[] exceeds get_local_size(), all the warps move to + * the second phase to run gpujoin_join_quals(), because it means we can + * utilize all the core to evaluate Join quals in parallel; that is the most + * efficient way to run. 
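+ *
+ * So each warp runs, roughly (illustrative pseudo-code, not the exact
+ * control flow of the loop below):
+ *
+ *   do {
+ *       t_ctid = gpujoin_gist_getnext(...);   <- no __syncthreads() here
+ *       push the matched offsets onto temp_stack[] (atomicAdd on temp_pos)
+ *   } while (any lane of the warp still walks on the index);
+ *   if (temp_pos[depth] < get_local_size())
+ *       reload the next index-key and search again;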
+ */ + vlpos_saved_2 = kcxt->vlpos; + do { + ItemPointerData *t_ctid; + cl_uint mask; + cl_uint t_off; + cl_uint l_next = l_state[depth]; + + t_ctid = gpujoin_gist_getnext(kcxt, + kgjoin, + depth, + kds_gist, + gist_keys, + &l_next); + assert(__activemask() == ~0U); + if (__any_sync(__activemask(), kcxt->errcode != 0)) + goto bailout; /* error */ + + mask = __ballot_sync(__activemask(), t_ctid != NULL); + count = __popc(mask); + if (LaneId() == 0) + temp_index = atomicAdd(&temp_pos[depth], count); + __syncwarp(~0U); + temp_index = __shfl_sync(__activemask(), temp_index, 0); + + if (temp_index + count > GPUJOIN_PSEUDO_STACK_NROOMS) + goto bailout; /* urgent flush; cannot write out all the results */ + temp_index += __popc(mask & ((1U << LaneId()) - 1)); + + if (t_ctid) + { + assert(t_ctid->ip_posid == USHRT_MAX); + t_off = (((cl_uint)t_ctid->ip_blkid.bi_hi << 16) | + ((cl_uint)t_ctid->ip_blkid.bi_lo)); + assert(temp_index < GPUJOIN_PSEUDO_STACK_NROOMS); + temp_stack = __wr_stack_base + + (depth+1) * (GPUJOIN_PSEUDO_STACK_NROOMS + temp_index); + memcpy(temp_stack, rd_stack, sizeof(cl_uint) * depth); + temp_stack[depth] = t_off; + assert(__kds_unpack(t_off) < kds_hash->length); + } + + if (LaneId() == 0) + atomicAdd(&stat_nitems2[depth], count); + __syncwarp(~0U); + l_state[depth] = l_next; + kcxt->vlpos = vlpos_saved_2; /* rewind */ + assert(__activemask() == ~0U); + } while (__any_sync(__activemask(), l_state[depth] != UINT_MAX)); + /* try to reload the next index-key, if temp_stack[] still has space. */ + assert(__activemask() == ~0U); + if (__shfl_sync(__activemask(), temp_pos[depth], 0) < get_local_size()) + goto reload; + } + else + { + l_state[depth] = UINT_MAX; + } +bailout: + /* error checks */ + if (__syncthreads_count(kcxt->errcode != 0) > 0) + return -1; + + if (temp_pos[depth] >= (scan_done ? 
1 : get_local_size())) + { + temp_stack = NULL; + if (get_local_id() < temp_pos[depth]) + { + kern_tupitem *tupitem; + cl_bool joinquals_matched = false; + + temp_stack = __wr_stack_base + + (depth+1) * (GPUJOIN_PSEUDO_STACK_NROOMS + get_local_id()); + tupitem = (kern_tupitem *)((char *)kds_hash + + __kds_unpack(temp_stack[depth]) + - offsetof(kern_tupitem, htup)); + assert((char *)tupitem < (char *)kds_hash + kds_hash->length); + /* check join quals */ + if (gpujoin_join_quals(kcxt, + kds_src, + kds_extra, + kmrels, + depth, + temp_stack, + &tupitem->htup, + &joinquals_matched)) + { + assert(joinquals_matched); + /* No RIGHT JOIN are needed */ + assert(tupitem->rowid < kds_hash->nitems); + if (oj_map && !oj_map[tupitem->rowid]) + oj_map[tupitem->rowid] = true; + } + else + { + temp_stack = NULL; + } + } + + /* write out the result */ + wr_index = write_pos[depth]; + wr_index += pgstromStairlikeBinaryCount(temp_stack != NULL, &count); + if (get_local_id() == 0) + { + write_pos[depth] += count; + stat_nitems[depth] += count; + } + wr_stack = __wr_stack_base + (depth+1) * wr_index; + if (temp_stack) + memcpy(wr_stack, temp_stack, sizeof(cl_uint) * (depth+1)); + __syncthreads(); + + /* rewind the temp stack */ + if (get_local_id() == 0) + { + if (get_local_size() < temp_pos[depth]) + { + cl_uint remain = temp_pos[depth] - get_local_size(); + + temp_stack = __wr_stack_base + (depth+1) * GPUJOIN_PSEUDO_STACK_NROOMS; + memcpy(temp_stack, + temp_stack + (depth+1) * get_local_size(), + sizeof(cl_uint) * (depth+1) * remain); + temp_pos[depth] -= get_local_size(); + } + else + { + temp_pos[depth] = 0; + } + } + } + /* count number of threads still in-progress */ + count = __syncthreads_count(l_state[depth] != UINT_MAX && + l_state[depth] != 0); + if (get_local_id() == 0) + wip_count[depth] = count; + + /* see comment in gpujoin_exec_hashjoin */ + wr_index = write_pos[depth]; + __syncthreads(); + if (wr_index + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) + return depth; + return depth+1; +} + +#define PSTACK_DEPTH(d) \ + ((d) >= 0 && (d) <= kgjoin->num_rels \ + ? 
(cl_uint *)((char *)pstack + pstack->ps_headsz + \ + get_group_id() * pstack->ps_unitsz + \ + pstack->ps_offset[(d)]) \ + : NULL) + +/* + * gpujoin_main + */ +DEVICE_FUNCTION(void) +gpujoin_main(kern_context *kcxt, + kern_gpujoin *kgjoin, + kern_multirels *kmrels, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + kern_data_store *kds_dst, + kern_parambuf *kparams_gpreagg, /* only if combined GpuJoin */ + cl_uint *l_state, + cl_bool *matched) +{ + gpujoinPseudoStack *pstack = kgjoin->pstack; + cl_int max_depth = kgjoin->num_rels; + cl_int depth; + __shared__ cl_int depth_thread0 __attribute__((unused)); + + assert(kds_src->format == KDS_FORMAT_ROW || + kds_src->format == KDS_FORMAT_BLOCK || + kds_src->format == KDS_FORMAT_ARROW || + kds_src->format == KDS_FORMAT_COLUMN); + assert((kds_dst->format == KDS_FORMAT_ROW && kparams_gpreagg == NULL) || + (kds_dst->format == KDS_FORMAT_SLOT && kparams_gpreagg != NULL)); + + /* init per-depth context */ + if (get_local_id() == 0) + { + src_read_pos = UINT_MAX; + stat_source_nitems = 0; + memset(stat_nitems, 0, sizeof(cl_uint) * (max_depth+1)); + memset(stat_nitems2, 0, sizeof(cl_uint) * (max_depth+1)); + memset(wip_count, 0, sizeof(cl_uint) * (max_depth+1)); + memset(read_pos, 0, sizeof(cl_uint) * (max_depth+1)); + memset(write_pos, 0, sizeof(cl_uint) * (max_depth+1)); + memset(temp_pos, 0, sizeof(cl_uint) * (max_depth+1)); + memset(gist_pos, 0, sizeof(cl_uint) * (max_depth+1) * MAXWARPS_PER_BLOCK); + scan_done = false; + base_depth = 0; + } + /* resume the per-depth context, if any */ + if (kgjoin->resume_context) + depth = gpujoin_resume_context(kgjoin, l_state, matched); + else + depth = 0; + __syncthreads(); + + /* main logic of GpuJoin */ + while (depth >= 0) + { + /* rewind the varlena buffer */ + kcxt->vlpos = kcxt->vlbuf; + if (depth == 0) + { + /* LOAD FROM KDS_SRC (ROW/BLOCK/ARROW) */ + depth = gpujoin_load_source(kcxt, + kgjoin, + kds_src, + kds_extra, + PSTACK_DEPTH(depth), + l_state); + } + else if (depth > max_depth) + { + assert(depth == kmrels->nrels + 1); + if (kds_dst->format == KDS_FORMAT_ROW) + { + /* PROJECTION (ROW) */ + depth = gpujoin_projection_row(kcxt, + kgjoin, + kmrels, + kds_src, + kds_extra, + kds_dst, + PSTACK_DEPTH(kgjoin->num_rels), + l_state, + matched); + } + else + { + /* PROJECTION (SLOT) */ + depth = gpujoin_projection_slot(kcxt, + kparams_gpreagg, + kgjoin, + kmrels, + kds_src, + kds_extra, + kds_dst, + PSTACK_DEPTH(kgjoin->num_rels), + l_state, + matched); + } + } + else if (kmrels->chunks[depth-1].is_nestloop) + { + /* NEST-LOOP */ + depth = gpujoin_exec_nestloop(kcxt, + kgjoin, + kmrels, + kds_src, + kds_extra, + depth, + PSTACK_DEPTH(depth-1), + PSTACK_DEPTH(depth), + l_state, + matched); + } + else if (kmrels->chunks[depth-1].gist_offset != 0) + { + /* GiST-INDEX */ + depth = gpujoin_exec_gistindex(kcxt, + kgjoin, + kmrels, + kds_src, + kds_extra, + depth, + PSTACK_DEPTH(depth-1), + PSTACK_DEPTH(depth), + l_state, + matched); + } + else + { + /* HASH-JOIN */ + depth = gpujoin_exec_hashjoin(kcxt, + kgjoin, + kmrels, + kds_src, + kds_extra, + depth, + PSTACK_DEPTH(depth-1), + PSTACK_DEPTH(depth), + l_state, + matched); + } + if (get_local_id() == 0) + depth_thread0 = depth; + if (__syncthreads_count(kcxt->errcode) > 0) + return; + assert(depth_thread0 == depth); + } + + /* update statistics only if normal exit */ + if (depth == -1 && get_local_id() == 0) + { + gpujoinSuspendContext *sb + = KERN_GPUJOIN_SUSPEND_CONTEXT(kgjoin, get_group_id()); + sb->depth = -1; /* no more suspend/resume! 
*/ + + atomicAdd(&kgjoin->source_nitems, stat_source_nitems); + atomicAdd(&kgjoin->outer_nitems, stat_nitems[0]); + for (int i=0; i <= max_depth; i++) + { + atomicAdd(&kgjoin->stat[i].nitems, stat_nitems[i+1]); + atomicAdd(&kgjoin->stat[i].nitems2, stat_nitems2[i+1]); + } + } +} + +/* + * gpujoin_collocate_outer_join_map + * + * it merges the result of other GPU devices and CPU fallback + */ +KERNEL_FUNCTION(void) +gpujoin_colocate_outer_join_map(kern_multirels *kmrels, + cl_uint num_devices) +{ + size_t nrooms = kmrels->ojmaps_length / sizeof(cl_uint); + cl_uint *ojmaps = (cl_uint *)((char *)kmrels + kmrels->kmrels_length); + cl_uint *destmap = ojmaps + kmrels->cuda_dindex * nrooms; + cl_uint i, j, map; + + for (i = get_global_id(); + i < nrooms; + i += get_global_size()) + { + map = 0; + for (j = 0; j <= num_devices; j++) + { + map |= ojmaps[i]; + ojmaps += nrooms; + } + destmap[i] = map; + } +} + +/* + * gpujoin_right_outer + */ +DEVICE_FUNCTION(void) +gpujoin_right_outer(kern_context *kcxt, + kern_gpujoin *kgjoin, + kern_multirels *kmrels, + cl_int outer_depth, + kern_data_store *kds_dst, + kern_parambuf *kparams_gpreagg, + cl_uint *l_state, + cl_bool *matched) +{ + gpujoinPseudoStack *pstack = kgjoin->pstack; + cl_int max_depth = kgjoin->num_rels; + cl_int depth; + __shared__ cl_int depth_thread0 __attribute__((unused)); + + assert(KERN_MULTIRELS_RIGHT_OUTER_JOIN(kmrels, outer_depth)); + assert((kds_dst->format == KDS_FORMAT_ROW && kparams_gpreagg == NULL) || + (kds_dst->format == KDS_FORMAT_SLOT && kparams_gpreagg != NULL)); + + /* setup per-depth context */ + memset(l_state, 0, sizeof(l_state)); + memset(matched, 0, sizeof(matched)); + if (get_local_id() == 0) + { + src_read_pos = UINT_MAX; + stat_source_nitems = 0; + memset(stat_nitems, 0, sizeof(cl_uint) * (max_depth+1)); + memset(stat_nitems2, 0, sizeof(cl_uint) * (max_depth+1)); + memset(wip_count, 0, sizeof(cl_uint) * (max_depth+1)); + memset(read_pos, 0, sizeof(cl_uint) * (max_depth+1)); + memset(write_pos, 0, sizeof(cl_uint) * (max_depth+1)); + memset(temp_pos, 0, sizeof(cl_uint) * (max_depth+1)); + memset(gist_pos, 0, sizeof(cl_uint) * (max_depth+1) * MAXWARPS_PER_BLOCK); + scan_done = false; + base_depth = outer_depth; + } + /* resume the per-depth context, if any */ + if (kgjoin->resume_context) + depth = gpujoin_resume_context(kgjoin, l_state, matched); + else + depth = outer_depth; + __syncthreads(); + + /* main logic of GpuJoin */ + while (depth >= outer_depth) + { + /* rewind the varlena buffer */ + kcxt->vlpos = kcxt->vlbuf; + if (depth == outer_depth) + { + /* makes RIGHT OUTER combinations using OUTER JOIN map */ + depth = gpujoin_load_outer(kcxt, + kgjoin, + kmrels, + outer_depth, + PSTACK_DEPTH(outer_depth), + l_state); + } + else if (depth > max_depth) + { + assert(depth == kmrels->nrels + 1); + if (kds_dst->format == KDS_FORMAT_ROW) + { + /* PROJECTION (ROW) */ + depth = gpujoin_projection_row(kcxt, + kgjoin, + kmrels, + NULL, + NULL, + kds_dst, + PSTACK_DEPTH(kgjoin->num_rels), + l_state, + matched); + } + else + { + /* PROJECTION (SLOT) */ + depth = gpujoin_projection_slot(kcxt, + kparams_gpreagg, + kgjoin, + kmrels, + NULL, + NULL, + kds_dst, + PSTACK_DEPTH(kgjoin->num_rels), + l_state, + matched); + } + } + else if (kmrels->chunks[depth-1].is_nestloop) + { + /* NEST-LOOP */ + depth = gpujoin_exec_nestloop(kcxt, + kgjoin, + kmrels, + NULL, + NULL, + depth, + PSTACK_DEPTH(depth-1), + PSTACK_DEPTH(depth), + l_state, + matched); + } + else if (kmrels->chunks[depth-1].gist_offset) + { + /* GiST-INDEX */ + depth = 
gpujoin_exec_gistindex(kcxt, + kgjoin, + kmrels, + NULL, + NULL, + depth, + PSTACK_DEPTH(depth-1), + PSTACK_DEPTH(depth), + l_state, + matched); + } + else + { + /* HASH-JOIN */ + depth = gpujoin_exec_hashjoin(kcxt, + kgjoin, + kmrels, + NULL, + NULL, + depth, + PSTACK_DEPTH(depth-1), + PSTACK_DEPTH(depth), + l_state, + matched); + } + if (get_local_id() == 0) + depth_thread0 = depth; + if (__syncthreads_count(kcxt->errcode) > 0) + return; + assert(depth == depth_thread0); + } + + /* write out statistics */ + if (get_local_id() == 0) + { + gpujoinSuspendContext *sb + = KERN_GPUJOIN_SUSPEND_CONTEXT(kgjoin, get_group_id()); + sb->depth = -1; /* no more suspend/resume! */ + + assert(stat_source_nitems == 0); + assert(stat_nitems[0] == 0); + for (int i=outer_depth; i <= max_depth; i++) + { + atomicAdd(&kgjoin->stat[i-1].nitems, stat_nitems[i]); + atomicAdd(&kgjoin->stat[i-1].nitems2, stat_nitems2[i]); + } + } + __syncthreads(); +} diff --git a/src/cuda_gpujoin.h b/old/cuda_gpujoin.h similarity index 100% rename from src/cuda_gpujoin.h rename to old/cuda_gpujoin.h diff --git a/old/cuda_gpupreagg.cu b/old/cuda_gpupreagg.cu new file mode 100644 index 000000000..f18dc2666 --- /dev/null +++ b/old/cuda_gpupreagg.cu @@ -0,0 +1,1773 @@ +/* + * cuda_gpupreagg.h + * + * Preprocess of aggregate using GPU acceleration, to reduce number of + * rows to be processed by CPU; including the Sort reduction. + * -- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. + */ +#include "cuda_common.h" +#include "cuda_gpupreagg.h" +#include "cuda_postgis.h" + +/* + * common portion for gpupreagg_setup_* + */ +STATIC_FUNCTION(bool) +gpupreagg_setup_common(kern_context *kcxt, + kern_gpupreagg *kgpreagg, + kern_data_store *kds_src, + kern_data_store *kds_slot, + cl_uint nvalids, + cl_uint slot_index, + cl_char *tup_dclass, + Datum *tup_values, + cl_int *tup_extra) +{ + cl_uint offset; + cl_uint required; + cl_uint extra_sz = 0; + cl_bool suspend_kernel = false; + __shared__ cl_uint nitems_base; + __shared__ cl_uint extra_base; + + /* + * calculation of the required extra buffer + */ + if (slot_index != UINT_MAX) + { + if (kds_slot->ncols > 0) + memset(tup_extra, 0, sizeof(cl_int) * kds_slot->ncols); + + for (int j=0; j < kds_slot->ncols; j++) + { + kern_colmeta *cmeta = &kds_slot->colmeta[j]; + cl_char dclass = tup_dclass[j]; + cl_char *addr; + + if (dclass == DATUM_CLASS__NULL) + continue; + if (cmeta->attbyval) + { + assert(dclass == DATUM_CLASS__NORMAL); + continue; + } + if (cmeta->attlen > 0) + { + assert(dclass == DATUM_CLASS__NORMAL); + addr = DatumGetPointer(tup_values[j]); + if (addr < (char *)kds_src || + addr >= (char *)kds_src + kds_src->length) + { + tup_extra[j] = cmeta->attlen; + extra_sz += MAXALIGN(cmeta->attlen); + } + } + else + { + /* + * NOTE: DATUM_CLASS__* that is not NORMAL only happen when + * Var-node references the kds_src buffer which is not + * a normal heap-tuple (Apache Arrow). So, it is sufficient + * to copy only pg_varlena_t or pg_array_t according to the + * datum class. Unlike gpupreagg_final_data_move(), kds_src + * buffer shall be valid until reduction steps. 
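+ *
+ * e.g. (illustrative): a DATUM_CLASS__VARLENA column consumes only
+ * MAXALIGN(sizeof(pg_varlena_t)) bytes of the extra buffer, because the
+ * pg_varlena_t just points into kds_src, which remains valid; on the
+ * other hand, a DATUM_CLASS__NORMAL varlena needs its entire
+ * MAXALIGN(VARSIZE_ANY(addr)) bytes, but only if 'addr' is located
+ * outside of the kds_src buffer.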
+ */ + assert(cmeta->attlen == -1); + switch (dclass) + { + case DATUM_CLASS__VARLENA: + tup_extra[j] = sizeof(pg_varlena_t); + extra_sz += MAXALIGN(sizeof(pg_varlena_t)); + break; + case DATUM_CLASS__ARRAY: + tup_extra[j] = sizeof(pg_array_t); + extra_sz += MAXALIGN(sizeof(pg_array_t)); + break; + case DATUM_CLASS__COMPOSITE: + tup_extra[j] = sizeof(pg_composite_t); + extra_sz += MAXALIGN(sizeof(pg_composite_t)); + break; + case DATUM_CLASS__GEOMETRY: + tup_extra[j] = sizeof(pg_geometry_t); + extra_sz += MAXALIGN(sizeof(pg_geometry_t)); + break; + default: + assert(dclass == DATUM_CLASS__NORMAL); + addr = DatumGetPointer(tup_values[j]); + if (addr < (char *)kds_src || + addr >= (char *)kds_src + kds_src->length) + { + tup_extra[j] = VARSIZE_ANY(addr); + extra_sz += MAXALIGN(VARSIZE_ANY(addr)); + } + break; + } + } + } + } + + /* + * allocation of extra buffer for indirect/varlena values + */ + offset = pgstromStairlikeSum(extra_sz, &required); + if (get_local_id() == 0) + { + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + curval.i.nitems = kds_slot->nitems; + curval.i.usage = kds_slot->usage; + do { + newval = oldval = curval; + newval.i.nitems += nvalids; + newval.i.usage += __kds_packed(required); + if (KERN_DATA_STORE_SLOT_LENGTH(kds_slot, newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_slot->length) + { + suspend_kernel = true; + atomicAdd(&kgpreagg->suspend_count, 1); + break; + } + } while((curval.v64 = atomicCAS((cl_ulong *)&kds_slot->nitems, + oldval.v64, + newval.v64)) != oldval.v64); + nitems_base = oldval.i.nitems; + extra_base = __kds_unpack(oldval.i.usage); + } + if (__syncthreads_count(suspend_kernel) > 0) + return false; + + if (slot_index != UINT_MAX) + { + assert(slot_index < nvalids); + slot_index += nitems_base; + /* + * Fixup pointers if needed. Please note that any variables on + * kcxt->vlbuf is not visible to other threads. 
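+ *
+ * In other words (a sketch of the loop below; tup_extra[j] holds the
+ * number of bytes to copy, or zero if no copy is needed):
+ *
+ *   memcpy(extra_pos, DatumGetPointer(tup_values[j]), tup_extra[j]);
+ *   tup_values[j] = PointerGetDatum(extra_pos);
+ *   extra_pos += MAXALIGN(tup_extra[j]);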
+ */ + if (extra_sz > 0) + { + char *extra_pos + = (char *)kds_slot + kds_slot->length + - (extra_base + required) + offset; + + for (int j=0; j < kds_slot->ncols; j++) + { + if (tup_extra[j] == 0) + continue; + memcpy(extra_pos, + DatumGetPointer(tup_values[j]), + tup_extra[j]); + tup_values[j] = PointerGetDatum(extra_pos); + extra_pos += MAXALIGN(tup_extra[j]); + } + } + memcpy(KERN_DATA_STORE_VALUES(kds_slot, slot_index), + tup_values, sizeof(Datum) * kds_slot->ncols); + memcpy(KERN_DATA_STORE_DCLASS(kds_slot, slot_index), + tup_dclass, sizeof(cl_char) * kds_slot->ncols); + } + return true; +} + +/* + * gpupreagg_setup_row + */ +DEVICE_FUNCTION(void) +gpupreagg_setup_row(kern_context *kcxt, + kern_gpupreagg *kgpreagg, + kern_data_store *kds_src, /* in: KDS_FORMAT_ROW */ + kern_data_store *kds_slot) /* out: KDS_FORMAT_SLOT */ +{ + cl_uint src_nitems = __ldg(&kds_src->nitems); + cl_uint src_base; + cl_uint src_index; + cl_uint slot_index; + cl_uint count; + cl_uint nvalids; + cl_char *vlbuf_base; + cl_char *tup_dclass; + Datum *tup_values; + cl_int *tup_extra; + kern_tupitem *tupitem; + gpupreaggSuspendContext *my_suspend; + cl_bool rc; + + assert(kds_src->format == KDS_FORMAT_ROW && + kds_slot->format == KDS_FORMAT_SLOT); + + /* resume kernel from the point where suspended, if any */ + my_suspend = KERN_GPUPREAGG_SUSPEND_CONTEXT(kgpreagg, get_group_id()); + if (kgpreagg->resume_context) + src_base = my_suspend->r.src_base; + else + src_base = get_global_base(); + __syncthreads(); + + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_slot->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_slot->ncols); + tup_extra = (cl_int *) + kern_context_alloc(kcxt, sizeof(cl_int) * kds_slot->ncols); + if (!tup_dclass || !tup_values || !tup_extra) + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); + /* bailout if any errors */ + if (__syncthreads_count(kcxt->errcode) > 0) + goto skip; + vlbuf_base = kcxt->vlpos; + + while (src_base < src_nitems) + { + kcxt->vlpos = vlbuf_base; /* rewind */ + src_index = src_base + get_local_id(); + if (src_index < src_nitems) + { + tupitem = KERN_DATA_STORE_TUPITEM(kds_src, src_index); + rc = gpupreagg_quals_eval(kcxt, kds_src, + &tupitem->htup.t_ctid, + &tupitem->htup); + kcxt->vlpos = vlbuf_base; /* rewind */ + } + else + { + tupitem = NULL; + rc = false; + } + /* bailout if any errors on gpupreagg_quals_eval */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* allocation of kds_slot buffer, if any */ + slot_index = pgstromStairlikeBinaryCount(rc, &nvalids); + if (nvalids > 0) + { + if (rc) + { + assert(tupitem != NULL); + gpupreagg_projection_row(kcxt, + kds_src, + &tupitem->htup, + tup_dclass, + tup_values); + } + /* bailout if any errors */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + + if (!gpupreagg_setup_common(kcxt, + kgpreagg, + kds_src, + kds_slot, + nvalids, + rc ? 
slot_index : UINT_MAX, + tup_dclass, + tup_values, + tup_extra)) + break; + } + /* update statistics */ + count = __syncthreads_count(tupitem != NULL); + if (get_local_id() == 0) + { + atomicAdd(&kgpreagg->nitems_real, count); + atomicAdd(&kgpreagg->nitems_filtered, count - nvalids); + } + /* move to the next window */ + src_base += get_global_size(); + } +skip: + /* save the current execution context */ + if (get_local_id() == 0) + my_suspend->r.src_base = src_base; +} + +DEVICE_FUNCTION(void) +gpupreagg_setup_block(kern_context *kcxt, + kern_gpupreagg *kgpreagg, + kern_data_store *kds_src, + kern_data_store *kds_slot) +{ + cl_uint window_sz; + cl_uint part_sz; + cl_uint n_parts; + cl_uint count; + cl_uint part_index = 0; + cl_uint line_index = 0; + cl_bool thread_is_valid = false; + cl_char *vlbuf_base; + cl_char *tup_dclass; + Datum *tup_values; + cl_int *tup_extra; + gpupreaggSuspendContext *my_suspend; + + assert(kds_src->format == KDS_FORMAT_BLOCK && + kds_slot->format == KDS_FORMAT_SLOT); + + part_sz = Min((kds_src->nrows_per_block + + warpSize-1) & ~(warpSize-1), get_local_size()); + n_parts = get_local_size() / part_sz; + if (get_local_id() < part_sz * n_parts) + thread_is_valid = true; + window_sz = n_parts * get_num_groups(); + + /* resume kernel from the point where suspended, if any */ + my_suspend = KERN_GPUPREAGG_SUSPEND_CONTEXT(kgpreagg, get_group_id()); + if (kgpreagg->resume_context) + { + part_index = my_suspend->b.part_index; + line_index = my_suspend->b.line_index; + } + __syncthreads(); + + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_slot->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_slot->ncols); + tup_extra = (cl_int *) + kern_context_alloc(kcxt, sizeof(cl_int) * kds_slot->ncols); + if (!tup_dclass || !tup_values || !tup_extra) + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); + /* bailout if any errors */ + if (__syncthreads_count(kcxt->errcode) > 0) + goto out; + vlbuf_base = kcxt->vlpos; + + for (;;) + { + cl_uint part_base; + cl_uint part_id; + cl_uint line_no; + cl_uint n_lines; + cl_uint nvalids; + PageHeaderData *pg_page; + ItemPointerData t_self __attribute__ ((unused)); + BlockNumber block_nr; + + part_base = part_index * window_sz + get_group_id() * n_parts; + if (part_base >= kds_src->nitems) + break; + part_id = get_local_id() / part_sz + part_base; + line_no = get_local_id() % part_sz + line_index * part_sz; + + do { + HeapTupleHeaderData *htup = NULL; + ItemIdData *curr_lpp = NULL; + cl_uint slot_index; + cl_bool rc = false; + + kcxt->vlpos = vlbuf_base; /* rewind */ + if (thread_is_valid && part_id < kds_src->nitems) + { + pg_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_src, part_id); + n_lines = PageGetMaxOffsetNumber(pg_page); + block_nr = KERN_DATA_STORE_BLOCK_BLCKNR(kds_src, part_id); + t_self.ip_blkid.bi_hi = block_nr >> 16; + t_self.ip_blkid.bi_lo = block_nr & 0xffff; + t_self.ip_posid = line_no + 1; + + if (line_no < n_lines) + { + curr_lpp = PageGetItemId(pg_page, line_no + 1); + if (ItemIdIsNormal(curr_lpp)) + htup = PageGetItem(pg_page, curr_lpp); + } + } + else + { + pg_page = NULL; + n_lines = 0; + } + + /* evaluation of the qualifier */ + if (htup) + { + rc = gpupreagg_quals_eval(kcxt, kds_src, &t_self, htup); + kcxt->vlpos = vlbuf_base; /* rewind */ + } + /* bailout if any errors on gpupreagg_quals_eval */ + if (__syncthreads_count(kcxt->errcode) > 0) + goto out; + /* allocation of the kds_slot buffer */ + slot_index = pgstromStairlikeBinaryCount(rc, &nvalids); + if 
(nvalids > 0) + { + if (rc) + { + gpupreagg_projection_row(kcxt, + kds_src, + htup, + tup_dclass, + tup_values); + } + /* bailout if any errors */ + if (__syncthreads_count(kcxt->errcode) > 0) + goto out; + + if (!gpupreagg_setup_common(kcxt, + kgpreagg, + kds_src, + kds_slot, + nvalids, + rc ? slot_index : UINT_MAX, + tup_dclass, + tup_values, + tup_extra)) + goto out; + } + /* update statistics */ + count = __syncthreads_count(htup != NULL); + if (get_local_id() == 0) + { + atomicAdd(&kgpreagg->nitems_real, count); + atomicAdd(&kgpreagg->nitems_filtered, count - nvalids); + } + + /* + * Move to the next window of the line items, if any. + * If no threads in CUDA block wants to continue, exit the loop. + */ + line_index++; + line_no += part_sz; + } while (__syncthreads_count(thread_is_valid && + line_no < n_lines) > 0); + /* move to the next window */ + part_index++; + line_index = 0; + } +out: + if (get_local_id() == 0) + { + my_suspend->b.part_index = part_index; + my_suspend->b.line_index = line_index; + } +} + +/* + * gpupreagg_setup_arrow + */ +DEVICE_FUNCTION(void) +gpupreagg_setup_arrow(kern_context *kcxt, + kern_gpupreagg *kgpreagg, + kern_data_store *kds_src, /* in: KDS_FORMAT_ARROW */ + kern_data_store *kds_slot) /* out: KDS_FORMAT_SLOT */ +{ + cl_uint src_nitems = __ldg(&kds_src->nitems); + cl_uint src_base; + cl_uint src_index; + cl_uint slot_index; + cl_uint count; + cl_uint nvalids; + cl_char *vlbuf_base; + cl_char *tup_dclass; + Datum *tup_values; + cl_int *tup_extra; + gpupreaggSuspendContext *my_suspend; + cl_bool rc; + + assert(kds_src->format == KDS_FORMAT_ARROW && + kds_slot->format == KDS_FORMAT_SLOT); + + /* resume kernel from the point where suspended, if any */ + my_suspend = KERN_GPUPREAGG_SUSPEND_CONTEXT(kgpreagg, get_group_id()); + if (kgpreagg->resume_context) + src_base = my_suspend->c.src_base; + else + src_base = get_global_base(); + __syncthreads(); + + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_slot->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_slot->ncols); + tup_extra = (cl_int *) + kern_context_alloc(kcxt, sizeof(cl_int) * kds_slot->ncols); + if (!tup_dclass || !tup_values || !tup_extra) + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); + /* bailout if any errors */ + if (__syncthreads_count(kcxt->errcode) > 0) + goto skip; + vlbuf_base = kcxt->vlpos; + + while (src_base < src_nitems) + { + kcxt->vlpos = vlbuf_base; /* rewind */ + src_index = src_base + get_local_id(); + if (src_index < src_nitems) + { + rc = gpupreagg_quals_eval_arrow(kcxt, kds_src, src_index); + kcxt->vlpos = vlbuf_base; /* rewind */ + } + else + { + rc = false; + } + /* Bailout if any error on gpupreagg_quals_eval_arrow */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* allocation of kds_slot buffer, if any */ + slot_index = pgstromStairlikeBinaryCount(rc ? 1 : 0, &nvalids); + if (nvalids > 0) + { + if (rc) + { + gpupreagg_projection_arrow(kcxt, + kds_src, + src_index, + tup_dclass, + tup_values); + } + /* Bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* common portion */ + if (!gpupreagg_setup_common(kcxt, + kgpreagg, + kds_src, + kds_slot, + nvalids, + rc ? 
slot_index : UINT_MAX, + tup_dclass, + tup_values, + tup_extra)) + break; + } + /* update statistics */ + count = __syncthreads_count(src_index < src_nitems); + if (get_local_id() == 0) + { + atomicAdd(&kgpreagg->nitems_real, count); + atomicAdd(&kgpreagg->nitems_filtered, count - nvalids); + } + /* move to the next window */ + src_base += get_global_size(); + } +skip: + /* save the current execution context */ + if (get_local_id() == 0) + my_suspend->c.src_base = src_base; +} + +/* + * gpupreagg_setup_column + */ +DEVICE_FUNCTION(void) +gpupreagg_setup_column(kern_context *kcxt, + kern_gpupreagg *kgpreagg, + kern_data_store *kds_src, /* in: KDS_FORMAT_COLUMN */ + kern_data_extra *kds_extra, + kern_data_store *kds_slot) +{ + cl_uint src_base; + cl_char *tup_dclass; + Datum *tup_values; + cl_int *tup_extra; /* !!not related to extra buffer of column format!! */ + cl_char *vlbuf_base; + gpupreaggSuspendContext *my_suspend; + + assert(kds_src->format == KDS_FORMAT_COLUMN && + kds_slot->format == KDS_FORMAT_SLOT); + /* resume kernel from the point where suspended, if any */ + my_suspend = KERN_GPUPREAGG_SUSPEND_CONTEXT(kgpreagg, get_group_id()); + if (kgpreagg->resume_context) + src_base = my_suspend->c.src_base; + else + src_base = get_global_base(); + + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_slot->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_slot->ncols); + tup_extra = (cl_int *) + kern_context_alloc(kcxt, sizeof(cl_int) * kds_slot->ncols); + if (!tup_dclass || !tup_values || !tup_extra) + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); + /* bailout if any errors */ + if (__syncthreads_count(kcxt->errcode) > 0) + goto skip; + vlbuf_base = kcxt->vlpos; + + while (src_base < kds_src->nitems) + { + cl_uint src_index = src_base + get_local_id(); + cl_uint slot_index; + cl_uint nvalids; + cl_uint count; + cl_bool visible = false; + cl_bool rc = false; + + kcxt->vlpos = vlbuf_base; /* rewind */ + if (src_index < kds_src->nitems) + { + visible = kern_check_visibility_column(kcxt, + kds_src, + src_index); + if (visible) + { + rc = gpupreagg_quals_eval_column(kcxt, + kds_src, + kds_extra, + src_index); + } + kcxt->vlpos = vlbuf_base; /* rewind */ + } + /* bailout if any errors */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* allocation of kds_slot buffer, if any */ + slot_index = pgstromStairlikeBinaryCount(rc ? 1 : 0, &nvalids); + if (nvalids > 0) + { + if (rc) + { + gpupreagg_projection_column(kcxt, + kds_src, + kds_extra, + src_index, + tup_dclass, + tup_values); + } + /* bailout if any errors */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* common portion */ + if (!gpupreagg_setup_common(kcxt, + kgpreagg, + kds_src, + kds_slot, + nvalids, + rc ? 
slot_index : UINT_MAX, + tup_dclass, + tup_values, + tup_extra)) + break; + } + /* update statistics */ + count = __syncthreads_count(visible); + if (get_local_id() == 0) + { + atomicAdd(&kgpreagg->nitems_real, count); + atomicAdd(&kgpreagg->nitems_filtered, count - nvalids); + } + /* move to the next window */ + src_base += get_global_size(); + } +skip: + /* save the current execution context */ + if (get_local_id() == 0) + my_suspend->c.src_base = src_base; +} + +/* + * gpupreagg_nogroup_reduction + */ +DEVICE_FUNCTION(void) +gpupreagg_nogroup_reduction(kern_context *kcxt, + kern_gpupreagg *kgpreagg, /* in/out */ + kern_errorbuf *kgjoin_errorbuf, /* in */ + kern_data_store *kds_slot, /* in */ + kern_data_store *kds_final, /* global out */ + cl_char *p_dclass, /* __private__ */ + Datum *p_values, /* __private__ */ + char *p_extras) /* __private__ */ +{ + cl_bool is_last_reduction = false; + cl_bool try_final_merge = true; + cl_uint lane_id = (get_local_id() & warpSize - 1); + + /* init local/private buffer */ + assert(MAXWARPS_PER_BLOCK <= get_local_size() && + MAXWARPS_PER_BLOCK == warpSize); + gpupreagg_init_local_slot(p_dclass, p_values, p_extras); + + /* skip if previous stage reported an error */ + if (kgjoin_errorbuf && + __syncthreads_count(kgjoin_errorbuf->errcode) != 0) + return; + if (__syncthreads_count(kgpreagg->kerror.errcode) != 0) + return; + + assert(kgpreagg->num_group_keys == 0); + assert(kds_slot->format == KDS_FORMAT_SLOT); + assert(kds_final->format == KDS_FORMAT_SLOT); + assert(kds_slot->ncols == kds_final->ncols); + if (get_global_id() == 0) + kgpreagg->setup_slot_done = true; + + /* start private reduction */ + is_last_reduction = false; + do { + cl_uint index; + + if (lane_id == 0) + index = atomicAdd(&kgpreagg->read_slot_pos, warpSize); + index = __shfl_sync(__activemask(), index, 0); + if (index + warpSize >= kds_slot->nitems) + is_last_reduction = true; + index += lane_id; + + /* accumulate to the private buffer */ + if (index < kds_slot->nitems) + { + gpupreagg_update_normal(p_dclass, + p_values, + GPUPREAGG_ACCUM_MAP_LOCAL, + KERN_DATA_STORE_DCLASS(kds_slot, index), + KERN_DATA_STORE_VALUES(kds_slot, index), + GPUPREAGG_ACCUM_MAP_GLOBAL); + } + } while (!is_last_reduction); + + __syncthreads(); + + /* + * inter-warp reduction using shuffle operations + */ + for (cl_uint mask = 1; mask < warpSize; mask += mask) + { + cl_uint buddy_id = ((get_local_id() ^ mask) & (warpSize-1)); + + gpupreagg_merge_shuffle(p_dclass, + p_values, + GPUPREAGG_ACCUM_MAP_LOCAL, + buddy_id); + } + + /* + * update the final buffer + */ + try_final_merge = ((get_local_id() & (warpSize - 1)) == 0); + do { + if (try_final_merge) + { + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + assert((get_local_id() & (warpSize - 1)) == 0); + + oldval.i.nitems = 0; + oldval.i.usage = kds_final->usage; + newval.i.nitems = 0xffffffffU; /* LOCKED */ + newval.i.usage = kds_final->usage + + __kds_packed(GPUPREAGG_ACCUM_EXTRA_BUFSZ); + + curval.v64 = atomicCAS((cl_ulong *)&kds_final->nitems, + oldval.v64, + newval.v64); + if (curval.i.nitems <= 1) + { + cl_char *f_dclass = KERN_DATA_STORE_DCLASS(kds_final, 0); + Datum *f_values = KERN_DATA_STORE_VALUES(kds_final, 0); + char *f_extras; + + if (curval.i.nitems == 0) + { + f_extras = ((char *)kds_final + + kds_final->length - + __kds_unpack(curval.i.usage) - + GPUPREAGG_ACCUM_EXTRA_BUFSZ); + gpupreagg_init_final_slot(f_dclass, f_values, f_extras); + atomicAdd(&kgpreagg->num_groups, 1); + 
__threadfence(); + atomicExch(&kds_final->nitems, 1); /* UNLOCK */ + } + gpupreagg_merge_atomic(f_dclass, + f_values, + GPUPREAGG_ACCUM_MAP_GLOBAL, + p_dclass, + p_values, + GPUPREAGG_ACCUM_MAP_LOCAL); + try_final_merge = false; + kgpreagg->final_buffer_modified = true; + } + else + { + assert(curval.i.nitems == 0xffffffffU); + } + } + } while (__syncthreads_count(try_final_merge) > 0); +} + +#define HASHITEM_EMPTY (0xffffffffU) +#define HASHITEM_LOCKED (0xfffffffeU) + +static __shared__ cl_bool l_final_buffer_modified; + +/* + * gpupreagg_init_final_hash + */ +KERNEL_FUNCTION(void) +gpupreagg_init_final_hash(kern_global_hashslot *f_hash, + size_t f_hash_nslots, + size_t f_hash_length) +{ + if (get_global_id() == 0) + { + f_hash->length = f_hash_length; + f_hash->lock = 0; + f_hash->usage = 0; + f_hash->nslots = f_hash_nslots; + } + + for (size_t i = get_global_id(); i < f_hash_nslots; i += get_global_size()) + f_hash->slots[i] = HASHITEM_EMPTY; +} + +/* + * gpupreagg_create_final_slot + * + * + */ +STATIC_FUNCTION(cl_uint) +gpupreagg_create_final_slot(kern_context *kcxt, + kern_data_store *kds_final, + kern_data_store *kds_src, + cl_uint src_index, + cl_char *l_dclass, + Datum *l_values) +{ + cl_char *src_dclass = KERN_DATA_STORE_DCLASS(kds_src, src_index); + Datum *src_values = KERN_DATA_STORE_VALUES(kds_src, src_index); + cl_char *dst_dclass; + Datum *dst_values; + cl_uint dst_index; + cl_uint alloc_sz; + char *extra = NULL; + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + /* sanity checks */ + assert(kds_final->format == KDS_FORMAT_SLOT && + kds_src->format == KDS_FORMAT_SLOT); + assert(kds_final->ncols == kds_src->ncols); + assert(src_index < kds_src->nitems); + + /* size for extra allocation */ + alloc_sz = GPUPREAGG_ACCUM_EXTRA_BUFSZ; + for (int j=0; j < kds_src->ncols; j++) + { + kern_colmeta *cmeta = &kds_src->colmeta[j]; + cl_char dclass = src_dclass[j]; + cl_uint len; + + if (GPUPREAGG_ATTR_IS_ACCUM_VALUES[j]) + continue; + if (dclass == DATUM_CLASS__NULL) + continue; + + if (cmeta->attbyval) + { + assert(dclass == DATUM_CLASS__NORMAL); + } + else if (cmeta->attlen > 0) + { + assert(dclass == DATUM_CLASS__NORMAL); + alloc_sz += MAXALIGN(cmeta->attlen); + } + else + { + assert(cmeta->attlen == -1); + switch (dclass) + { + case DATUM_CLASS__NORMAL: + len = VARSIZE_ANY(DatumGetPointer(src_values[j])); + break; + case DATUM_CLASS__VARLENA: + len = pg_varlena_datum_length(kcxt, src_values[j]); + break; + case DATUM_CLASS__ARRAY: + len = pg_array_datum_length(kcxt, src_values[j]); + break; + case DATUM_CLASS__COMPOSITE: + len = pg_composite_datum_length(kcxt, src_values[j]); + break; + case DATUM_CLASS__GEOMETRY: + len = pg_geometry_datum_length(kcxt, src_values[j]); + break; + default: + STROM_ELOG(kcxt, "unexpected internal format code"); + return UINT_MAX; + } + alloc_sz += MAXALIGN(len); + } + } + + /* + * allocation of a new slot and extra buffer + */ + curval.i.nitems = __volatileRead(&kds_final->nitems); + curval.i.usage = __volatileRead(&kds_final->usage); + do { + newval = oldval = curval; + newval.i.nitems += 1; + newval.i.usage += __kds_packed(alloc_sz); + if (KERN_DATA_STORE_SLOT_LENGTH(kds_final, newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_final->length) + { + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, + "out of memory (kds_final)"); + return UINT_MAX; + } + } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_final->nitems, + oldval.v64, + newval.v64)) != oldval.v64); + /* + * Move the initial values 
to kds_final + */ + dst_index = oldval.i.nitems; + dst_dclass = KERN_DATA_STORE_DCLASS(kds_final, dst_index); + dst_values = KERN_DATA_STORE_VALUES(kds_final, dst_index); + if (alloc_sz > 0) + extra = (char *)kds_final + kds_final->length - __kds_unpack(newval.i.usage); + l_final_buffer_modified = true; + + /* init final slot */ + gpupreagg_init_final_slot(dst_dclass, dst_values, extra); + extra += GPUPREAGG_ACCUM_EXTRA_BUFSZ; + + /* copy the grouping keys */ + for (int j=0; j < kds_src->ncols; j++) + { + kern_colmeta *cmeta = &kds_src->colmeta[j]; + cl_char dclass = src_dclass[j]; + Datum datum = src_values[j]; + cl_uint len; + + if (GPUPREAGG_ATTR_IS_ACCUM_VALUES[j]) + continue; + + if (dclass == DATUM_CLASS__NULL || cmeta->attbyval) + { + dst_dclass[j] = dclass; + dst_values[j] = datum; + } + else if (cmeta->attlen > 0) + { + assert(dclass == DATUM_CLASS__NORMAL); + memcpy(extra, DatumGetPointer(datum), cmeta->attlen); + dst_dclass[j] = DATUM_CLASS__NORMAL; + dst_values[j] = PointerGetDatum(extra); + extra += MAXALIGN(cmeta->attlen); + } + else + { + assert(cmeta->attlen == -1); + switch (dclass) + { + case DATUM_CLASS__NORMAL: + len = VARSIZE_ANY(datum); + memcpy(extra, DatumGetPointer(datum), len); + break; + case DATUM_CLASS__VARLENA: + len = pg_varlena_datum_write(kcxt, extra, datum); + break; + case DATUM_CLASS__ARRAY: + len = pg_array_datum_write(kcxt, extra, datum); + break; + case DATUM_CLASS__COMPOSITE: + len = pg_composite_datum_write(kcxt, extra, datum); + break; + case DATUM_CLASS__GEOMETRY: + len = pg_geometry_datum_write(kcxt, extra, datum); + break; + default: + STROM_ELOG(kcxt, "unexpected internal format code"); + return UINT_MAX; + } + dst_dclass[j] = DATUM_CLASS__NORMAL; + dst_values[j] = PointerGetDatum(extra); + extra += MAXALIGN(len); + } + } + /* copy the accum values */ + if (l_dclass && l_values) + gpupreagg_merge_atomic(dst_dclass, + dst_values, + GPUPREAGG_ACCUM_MAP_GLOBAL, + l_dclass, + l_values, + GPUPREAGG_ACCUM_MAP_LOCAL); + else + gpupreagg_update_atomic(dst_dclass, + dst_values, + GPUPREAGG_ACCUM_MAP_GLOBAL, + src_dclass, + src_values, + GPUPREAGG_ACCUM_MAP_GLOBAL); + __threadfence(); + + return dst_index; +} + +/* + * gpupreagg_expand_global_hash - expand size of the global hash slot on demand. + * up to the f_hashlimit. It internally acquires shared lock of the final + * hash-slot, if it returns true. So, caller MUST release it when a series of + * operations get completed. Elsewhere, it returns false. caller MUST retry. 
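+ *
+ * Typical calling pattern (an illustrative sketch; see the group-by
+ * reduction below for the real usage):
+ *
+ *   while ((count = __syncthreads_count(status == 0)) > 0)
+ *   {
+ *       if (gpupreagg_expand_global_hash(kcxt, f_hash, count))
+ *       {
+ *           ... run the global reduction under the shared lock ...
+ *           __syncthreads();
+ *           if (get_local_id() == 0)
+ *               atomicSub(&f_hash->lock, 2);   <- release the shared lock
+ *       }
+ *   }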
+ */
+STATIC_FUNCTION(cl_bool)
+__expand_global_hash(kern_context *kcxt, kern_global_hashslot *f_hash)
+{
+    cl_bool    expanded = false;
+    cl_uint    i, j;
+
+    /*
+     * Expand the global hash-slot
+     */
+    if (get_local_id() == 0)
+    {
+        cl_uint    __nslots = 2 * f_hash->nslots + 2000;
+        cl_uint    __usage = 2 * f_hash->usage + 2000;
+        size_t     consumed;
+
+        /* expand to (roughly) twice the current size */
+        consumed = (MAXALIGN(offsetof(kern_global_hashslot, slots[__nslots])) +
+                    MAXALIGN(sizeof(preagg_hash_item) * __usage));
+        if (consumed <= f_hash->length)
+        {
+            f_hash->nslots = __nslots;
+            expanded = true;
+        }
+        else
+        {
+            STROM_EREPORT(kcxt, ERRCODE_STROM_DATASTORE_NOSPACE,
+                          "f_hash has no more space");
+        }
+    }
+    if (__syncthreads_count(expanded) == 0)
+        return false;    /* failed */
+
+    /* fix up the global hash-slot */
+    for (i = get_local_id(); i < f_hash->nslots; i += get_local_size())
+    {
+        f_hash->slots[i] = HASHITEM_EMPTY;
+    }
+    __syncthreads();
+
+    for (i = 0; i < f_hash->usage; i += get_local_size())
+    {
+        preagg_hash_item *hitem = NULL;
+        cl_uint    hindex = UINT_MAX;
+        cl_uint    next;
+
+        j = i + get_local_id();
+        if (j < f_hash->usage)
+        {
+            hitem = GLOBAL_HASHSLOT_GETITEM(f_hash, j);
+            hindex = hitem->hash % f_hash->nslots;
+        }
+
+        do {
+            if (hitem)
+            {
+                next = __volatileRead(&f_hash->slots[hindex]);
+                assert(next == HASHITEM_EMPTY || next < f_hash->usage);
+                hitem->next = next;
+                if (atomicCAS(&f_hash->slots[hindex], next, j) == next)
+                    hitem = NULL;
+            }
+        } while(__syncthreads_count(hitem != NULL) > 0);
+    }
+    return true;
+}
+
+STATIC_INLINE(cl_bool)
+gpupreagg_expand_global_hash(kern_context *kcxt,
+                             kern_global_hashslot *f_hash,
+                             cl_uint required)
+{
+    cl_bool    lock_wait = false;
+    cl_bool    expand_hash = false;
+    cl_uint    old_lock;
+    cl_uint    new_lock;
+    cl_uint    curr_usage;
+
+    /* get a shared/exclusive lock on the final hash slot */
+    do {
+        if (get_local_id() == 0)
+        {
+            curr_usage = __volatileRead(&f_hash->usage);
+            expand_hash = (curr_usage + required > f_hash->nslots);
+
+            old_lock = __volatileRead(&f_hash->lock);
+            if ((old_lock & 0x0001) != 0)
+                lock_wait = true;    /* someone holds the exclusive lock */
+            else
+            {
+                if (expand_hash)
+                    new_lock = old_lock + 3;    /* shared + exclusive lock */
+                else
+                    new_lock = old_lock + 2;    /* shared lock */
+
+                if (atomicCAS(&f_hash->lock,
+                              old_lock,
+                              new_lock) == old_lock)
+                    lock_wait = false;    /* OK, the lock is acquired */
+                else
+                    lock_wait = true;     /* Oops, conflict. Retry again. */
+            }
+        }
+    } while (__syncthreads_count(lock_wait) > 0);
+
+    if (__syncthreads_count(expand_hash) > 0)
+    {
+        /* wait while other threads are running in the critical section */
+        lock_wait = false;
+        do {
+            if (get_local_id() == 0)
+            {
+                old_lock = __volatileRead(&f_hash->lock);
+                assert((old_lock & 1) == 1);
+                lock_wait = (old_lock != 3);
+            }
+        } while(__syncthreads_count(lock_wait) > 0);
+
+        /*
+         * Expand the global hash table
+         */
+        if (!__expand_global_hash(kcxt, f_hash))
+        {
+            /* Error!
release exclusive lock */ + __syncthreads(); + if (get_local_id() == 0) + { + old_lock = atomicSub(&f_hash->lock, 3); + assert((old_lock & 0x0001) != 0); + } + return false; + } + /* Ensure the updates of f_hash visible to others */ + __threadfence(); + /* Downgrade the lock */ + __syncthreads(); + if (get_local_id() == 0) + { + old_lock = atomicSub(&f_hash->lock, 1); + assert((old_lock & 0x0001) != 0); + } + } + return true; +} + +/* + * gpupreagg_global_reduction + */ +STATIC_FUNCTION(cl_bool) +gpupreagg_global_reduction(kern_context *kcxt, + kern_data_store *kds_slot, + cl_uint kds_index, + cl_uint hash, + kern_data_store *kds_final, + kern_global_hashslot *f_hash, + cl_char *l_dclass, /* can be NULL */ + Datum *l_values) /* can be NULL */ +{ + preagg_hash_item *hitem = NULL; + cl_uint hindex = hash % f_hash->nslots; + cl_uint next; + cl_uint curr; + cl_uint dst_index; + cl_char *dst_dclass; + Datum *dst_values; + cl_bool is_locked = false; + + /* + * Step-1: Lookup hash slot without locking + */ + curr = next = __volatileRead(&f_hash->slots[hindex]); + __threadfence(); + if (curr == HASHITEM_LOCKED) + return false; /* locked, try again */ +restart: + while (curr != HASHITEM_EMPTY) + { + assert(curr < __volatileRead(&f_hash->usage)); + + hitem = GLOBAL_HASHSLOT_GETITEM(f_hash, curr); + if (hitem->hash == hash && + gpupreagg_keymatch(kcxt, + kds_slot, kds_index, + kds_final, hitem->index)) + { + dst_dclass = KERN_DATA_STORE_DCLASS(kds_final, hitem->index); + dst_values = KERN_DATA_STORE_VALUES(kds_final, hitem->index); + + if (l_dclass && l_values) + gpupreagg_merge_atomic(dst_dclass, + dst_values, + GPUPREAGG_ACCUM_MAP_GLOBAL, + l_dclass, + l_values, + GPUPREAGG_ACCUM_MAP_LOCAL); + else + gpupreagg_update_atomic(dst_dclass, + dst_values, + GPUPREAGG_ACCUM_MAP_GLOBAL, + KERN_DATA_STORE_DCLASS(kds_slot, kds_index), + KERN_DATA_STORE_VALUES(kds_slot, kds_index), + GPUPREAGG_ACCUM_MAP_GLOBAL); + if (is_locked) + atomicExch(&f_hash->slots[hindex], next); //UNLOCK + l_final_buffer_modified = true; + return true; + } + curr = hitem->next; + } + + /* + * Step-2: Ensure that f_hash has no entry under the lock + */ + if (!is_locked) + { + curr = next = __volatileRead(&f_hash->slots[hindex]); + __threadfence(); + if (curr == HASHITEM_LOCKED || + atomicCAS(&f_hash->slots[hindex], + curr, + HASHITEM_LOCKED) != curr) + return false; /* already locked, try again */ + is_locked = true; + goto restart; + } + + /* + * Step-3: create a slot on kds_final + */ + dst_index = gpupreagg_create_final_slot(kcxt, + kds_final, + kds_slot, + kds_index, + l_dclass, + l_values); + if (dst_index == UINT_MAX) + { + /* likely, out of memory */ + atomicExch(&f_hash->slots[hindex], next); //UNLOCK + return false; + } + + /* + * Step-4: allocation of hash entry + */ + curr = atomicAdd(&f_hash->usage, 1); + if (offsetof(kern_global_hashslot, slots[f_hash->nslots]) + + sizeof(preagg_hash_item) * (curr + 1) >= f_hash->length) + { + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); + atomicExch(&f_hash->slots[hindex], next); //UNLOCK + return false; + } + hitem = GLOBAL_HASHSLOT_GETITEM(f_hash, curr); + hitem->index = dst_index; + hitem->hash = hash; + hitem->next = next; + + /* + * NOTE: Above modification to kds_final/f_hash are weakly-ordered memory + * writes, thus, updates on the hitem and kds_final may not be visible to + * other threads in the device. + * __threadfence() ensures any writes prior to the invocation are visible + * to other threads. Don't eliminate this. 
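+     *
+     * The required ordering is, roughly:
+     *
+     *   hitem->index = dst_index;    <- weakly-ordered plain stores
+     *   hitem->hash  = hash;
+     *   hitem->next  = next;
+     *   __threadfence();             <- make them visible device-wide
+     *   atomicExch(&f_hash->slots[hindex], curr);   <- then publish/unlock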
+     */
+    __threadfence();
+
+    atomicExch(&f_hash->slots[hindex], curr);    //UNLOCK
+
+    return true;
+}
+
+/*
+ * gpupreagg_local_reduction
+ */
+STATIC_INLINE(int)
+gpupreagg_local_reduction(kern_context *kcxt,
+                          kern_data_store *kds_slot,
+                          cl_uint index,
+                          cl_uint hash,
+                          preagg_local_hashtable *l_htable,
+                          preagg_hash_item *l_hitems,
+                          cl_char *l_dclass,    /* __shared__ */
+                          Datum *l_values,      /* __shared__ */
+                          char *l_extras)       /* __shared__ */
+{
+    cl_uint    hindex = hash % GPUPREAGG_LOCAL_HASH_NSLOTS;
+    cl_uint    curr;
+    cl_uint    next;
+    cl_bool    is_locked = false;
+
+    curr = next = __volatileRead(&l_htable->l_hslots[hindex]);
+    __threadfence_block();
+    if (curr == HASHITEM_LOCKED)
+        return -1;    /* locked */
+restart:
+    while (curr < GPUPREAGG_LOCAL_HASH_NROOMS)
+    {
+        preagg_hash_item *hitem = &l_hitems[curr];
+
+        if (hitem->hash == hash &&
+            gpupreagg_keymatch(kcxt,
+                               kds_slot, index,
+                               kds_slot, hitem->index))
+        {
+            if (is_locked)
+                atomicExch(&l_htable->l_hslots[hindex], next);    //UNLOCK
+            goto found;
+        }
+        curr = hitem->next;
+    }
+    assert(curr == HASHITEM_EMPTY);
+
+    if (__volatileRead(&l_htable->nitems) >= GPUPREAGG_LOCAL_HASH_NROOMS)
+    {
+        /*
+         * The entry was not found on the local hash-table, and the table
+         * obviously has no more space either. In this case, the thread
+         * goes to the second path for the global-to-global reduction.
+         */
+        if (is_locked)
+            atomicExch(&l_htable->l_hslots[hindex], next);    //UNLOCK
+        return 0;    /* not found */
+    }
+    assert(l_hitems && l_dclass && l_values);
+
+    /*
+     * Begin the critical section
+     */
+    if (!is_locked)
+    {
+        curr = next = __volatileRead(&l_htable->l_hslots[hindex]);
+        __threadfence_block();
+        if (curr == HASHITEM_LOCKED ||
+            atomicCAS(&l_htable->l_hslots[hindex],
+                      next,
+                      HASHITEM_LOCKED) != next)
+            return -1;    /* lock contention, retry again. */
+        is_locked = true;
+        goto restart;
+    }
+    curr = atomicAdd(&l_htable->nitems, 1);
+    if (curr >= GPUPREAGG_LOCAL_HASH_NROOMS)
+    {
+        /*
+         * Oops, the local hash-table has no space to save a new entry
+         * any more. So, unlock the slot, then return to the caller to
+         * go to the second path for the global-to-global reduction.
+         */
+        atomicExch(&l_htable->l_hslots[hindex], next);    //UNLOCK
+        return 0;    /* not found */
+    }
+
+    /*
+     * initialize the hash-item allocated above
+     */
+    l_hitems[curr].index = index;
+    l_hitems[curr].hash  = hash;
+    l_hitems[curr].next  = next;
+
+    if (l_extras)
+        l_extras += GPUPREAGG_ACCUM_EXTRA_BUFSZ * curr;
+    gpupreagg_init_local_slot(l_dclass + GPUPREAGG_NUM_ACCUM_VALUES * curr,
+                              l_values + GPUPREAGG_NUM_ACCUM_VALUES * curr,
+                              l_extras);
+    /*
+     * __threadfence_block() makes the above updates visible to other
+     * concurrent threads within this block.
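+     *
+     * Unlike gpupreagg_global_reduction(), a device-wide __threadfence()
+     * is not needed here, because the local hash-table and its items all
+     * live in the shared memory of this block; the publish sequence is
+     * just:
+     *
+     *   l_hitems[curr].index = index;        <- shared memory stores
+     *   __threadfence_block();               <- block-wide visibility
+     *   atomicExch(&l_htable->l_hslots[hindex], curr);   <- unlock/publish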
+ */ + __threadfence_block(); + /* UNLOCK */ + atomicExch(&l_htable->l_hslots[hindex], curr); +found: + /* Runs global-to-local reduction */ + gpupreagg_update_atomic(l_dclass + GPUPREAGG_NUM_ACCUM_VALUES * curr, + l_values + GPUPREAGG_NUM_ACCUM_VALUES * curr, + GPUPREAGG_ACCUM_MAP_LOCAL, + KERN_DATA_STORE_DCLASS(kds_slot, index), + KERN_DATA_STORE_VALUES(kds_slot, index), + GPUPREAGG_ACCUM_MAP_GLOBAL); + return 1; /* ok, merged */ +} + +/* + * gpupreagg_group_reduction + */ +DEVICE_FUNCTION(void) +gpupreagg_groupby_reduction(kern_context *kcxt, + kern_gpupreagg *kgpreagg, /* in/out */ + kern_errorbuf *kgjoin_errorbuf, /* in */ + kern_data_store *kds_slot, /* in */ + kern_data_store *kds_final, /* out */ + kern_global_hashslot *f_hash, /* out */ + preagg_hash_item *l_hitems, /* __shared__ */ + cl_char *l_dclass, /* __shared__ */ + Datum *l_values, /* __shared__ */ + char *l_extras) /* __shared__ */ +{ + cl_bool is_last_reduction = false; + cl_uint l_nitems; + __shared__ preagg_local_hashtable l_htable; + __shared__ cl_uint base; + + /* skip if previous stage reported an error */ + if (kgjoin_errorbuf && + __syncthreads_count(kgjoin_errorbuf->errcode) != 0) + return; + if (__syncthreads_count(kgpreagg->kerror.errcode) != 0) + return; + + assert(kgpreagg->num_group_keys > 0); + assert(kds_slot->format == KDS_FORMAT_SLOT); + assert(kds_final->format == KDS_FORMAT_SLOT); + if (get_global_id() == 0) + kgpreagg->setup_slot_done = true; + + /* + * setup local hash-table + */ + if (get_local_id() == 0) + { + l_final_buffer_modified = false; + l_htable.nitems = 0; + } + for (int i = get_local_id(); i < GPUPREAGG_LOCAL_HASH_NSLOTS; i += get_local_size()) + l_htable.l_hslots[i] = HASHITEM_EMPTY; + __syncthreads(); + + /* + * main loop for the local/global hybrid reduction + */ + do { + cl_uint hash = UINT_MAX; + int status; + int index; + int count; + + /* fetch next items from the kds_slot */ + if (get_local_id() == 0) + base = atomicAdd(&kgpreagg->read_slot_pos, get_local_size()); + __syncthreads(); + if (base >= kds_slot->nitems) + break; + if (base + get_local_size() >= kds_slot->nitems) + is_last_reduction = true; + + /* calculation of the hash-value of the item */ + index = base + get_local_id(); + if (index < kds_slot->nitems) + { + cl_char *__dclass = KERN_DATA_STORE_DCLASS(kds_slot, index); + Datum *__values = KERN_DATA_STORE_VALUES(kds_slot, index); + + hash = gpupreagg_hashvalue(kcxt, __dclass, __values); + } + if (__syncthreads_count(kcxt->errcode) > 0) + return; /* error */ + + /* + * 1st path - try local reduction + */ + status = -1; + do { + if (status < 0 && index < kds_slot->nitems) + status = gpupreagg_local_reduction(kcxt, + kds_slot, + index, + hash, + &l_htable, + l_hitems, + l_dclass, + l_values, + l_extras); + else + status = 1; + + if (__syncthreads_count(kcxt->errcode) > 0) + return; /* error */ + } while (__syncthreads_count(status < 0) > 0); + + /* + * 2nd path - try global reduction + */ + assert(status >= 0); + while ((count = __syncthreads_count(status == 0)) > 0) + { + if (gpupreagg_expand_global_hash(kcxt, f_hash, count)) + { + if (status == 0) + { + assert(index < kds_slot->nitems); + + if (gpupreagg_global_reduction(kcxt, + kds_slot, + index, + hash, + kds_final, + f_hash, + NULL, + NULL)) + status = 1; /* successfully, merged */ + } + /* unlock global hash slots */ + __syncthreads(); + if (get_local_id() == 0) + atomicSub(&f_hash->lock, 2); + } + /* quick bailout on error */ + if (__syncthreads_count(kcxt->errcode) > 0) + return; + } + } while (!is_last_reduction); 
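+
+    /*
+     * By here, all the kds_slot items have been consumed. The entries
+     * remaining in the local (shared memory) hash-table still hold
+     * partial aggregates, so the last path below flushes them into
+     * kds_final / f_hash using the same global reduction routine.
+     */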
+ + __syncthreads(); + + /* + * last path - flush pending local reductions + */ + l_nitems = Min(l_htable.nitems, GPUPREAGG_LOCAL_HASH_NROOMS); + for (cl_uint i = 0; i < l_nitems; i += get_local_size()) + { + cl_uint j = i + get_local_id(); + cl_int status = 0; + cl_int count; + + while ((count = __syncthreads_count(!status && j < l_nitems)) > 0) + { + if (gpupreagg_expand_global_hash(kcxt, f_hash, count)) + { + if (!status && j < l_nitems) + { + preagg_hash_item *hitem = &l_hitems[j]; + cl_char *my_dclass = l_dclass + GPUPREAGG_NUM_ACCUM_VALUES * j; + Datum *my_values = l_values + GPUPREAGG_NUM_ACCUM_VALUES * j; + + if (gpupreagg_global_reduction(kcxt, + kds_slot, + hitem->index, + hitem->hash, + kds_final, + f_hash, + my_dclass, + my_values)) + status = 1; /* merged */ + } + else + { + status = 1; + } + /* unlock global hash slots */ + __syncthreads(); + if (get_local_id() == 0) + atomicSub(&f_hash->lock, 2); + } + /* quick bailout on error */ + if (__syncthreads_count(kcxt->errcode) > 0) + return; + } + } + __syncthreads(); + if (get_local_id() == 0 && l_final_buffer_modified) + kgpreagg->final_buffer_modified = true; +} + +/* + * aggcalc operations for hyper-log-log + */ +DEVICE_FUNCTION(void) +aggcalc_init_hll_sketch(cl_char *p_accum_dclass, + Datum *p_accum_datum, + char *extra_pos) +{ + cl_uint sz = VARHDRSZ + (1U << GPUPREAGG_HLL_REGISTER_BITS); + + *p_accum_dclass = DATUM_CLASS__NULL; + memset(extra_pos, 0, sz); + SET_VARSIZE(extra_pos, sz); + *p_accum_datum = PointerGetDatum(extra_pos); +} + +DEVICE_FUNCTION(void) +aggcalc_shuffle_hll_sketch(cl_char *p_accum_dclass, + Datum *p_accum_datum, + int lane_id) +{ + cl_char my_dclass; + cl_char buddy_dclass; + varlena *hll_state = (varlena *)DatumGetPointer(*p_accum_datum); + cl_uint *hll_regs = (cl_uint *)VARDATA(hll_state); + cl_uint nrooms = (1U << GPUPREAGG_HLL_REGISTER_BITS); + cl_uint index; + + assert(VARSIZE_EXHDR(hll_state) == nrooms); + assert(__activemask() == ~0U); + my_dclass = *p_accum_dclass; + buddy_dclass = __shfl_sync(__activemask(), my_dclass, lane_id); + + nrooms /= sizeof(cl_uint); + for (index=0; index < nrooms; index++) + { + union { + cl_uchar regs[4]; + cl_uint v32; + } myself, buddy; + + myself.v32 = hll_regs[index]; + buddy.v32 = __shfl_sync(__activemask(), myself.v32, lane_id); + if (my_dclass == DATUM_CLASS__NULL) + { + if (buddy_dclass != DATUM_CLASS__NULL) + { + hll_regs[index] = buddy.v32; + *p_accum_dclass = DATUM_CLASS__NORMAL; + } + } + else + { + assert(my_dclass == DATUM_CLASS__NORMAL); + if (buddy_dclass != DATUM_CLASS__NULL) + { + assert(buddy_dclass == DATUM_CLASS__NORMAL); + if (myself.regs[0] < buddy.regs[0]) + myself.regs[0] = buddy.regs[0]; + if (myself.regs[1] < buddy.regs[1]) + myself.regs[1] = buddy.regs[1]; + if (myself.regs[2] < buddy.regs[2]) + myself.regs[2] = buddy.regs[2]; + if (myself.regs[3] < buddy.regs[3]) + myself.regs[3] = buddy.regs[3]; + hll_regs[index] = myself.v32; + } + } + } +} + +DEVICE_FUNCTION(void) +aggcalc_normal_hll_sketch(cl_char *p_accum_dclass, + Datum *p_accum_datum, + cl_char newval_dclass, + Datum newval_datum) /* = int8 hash */ +{ + cl_uint nrooms = (1U << GPUPREAGG_HLL_REGISTER_BITS); + cl_uint index; + cl_uint count; + cl_char *hll_regs; + + if (newval_dclass != DATUM_CLASS__NULL) + { + assert(newval_dclass == DATUM_CLASS__NORMAL); + + + index = (newval_datum & (nrooms - 1)); + count = __clzll(__brevll(newval_datum >> GPUPREAGG_HLL_REGISTER_BITS)) + 1; + hll_regs = VARDATA(*p_accum_datum); + if (hll_regs[index] < count) + hll_regs[index] = count; + 
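+ /* + * NOTE: each HLL register keeps the maximum 'rho' observed for its + * bucket; the lowest GPUPREAGG_HLL_REGISTER_BITS bits of the hash + * select the register, and rho is the 1-based position of the least + * significant set bit of the remaining bits (__brevll reverses the + * 64bit value, so __clzll counts its trailing zeros). For example, + * if (hash >> GPUPREAGG_HLL_REGISTER_BITS) ends with ...1000 in + * binary, rho = 3 + 1 = 4. + */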
*p_accum_dclass = DATUM_CLASS__NORMAL; + } +} + +DEVICE_FUNCTION(void) +aggcalc_merge_hll_sketch(cl_char *p_accum_dclass, + Datum *p_accum_datum, + cl_char newval_dclass, + Datum newval_datum) /* =bytea sketch */ +{ + if (newval_dclass != DATUM_CLASS__NULL) + { + cl_uint *dst_regs = (cl_uint *)VARDATA(*p_accum_datum); + cl_uint *new_regs = (cl_uint *)VARDATA(newval_datum); + cl_uint nrooms = (1U << GPUPREAGG_HLL_REGISTER_BITS); + cl_uint index; + + assert(newval_dclass == DATUM_CLASS__NORMAL); + assert(VARSIZE_EXHDR(*p_accum_datum) == nrooms && + VARSIZE_EXHDR(newval_datum) == nrooms); + nrooms /= sizeof(cl_uint); + for (index=0; index < nrooms; index++) + { + union { + cl_uchar regs[4]; + cl_uint v32; + } oldval, curval, newval, tmpval; + + tmpval.v32 = __volatileRead(&new_regs[index]); + curval.v32 = __volatileRead(&dst_regs[index]); + do { + newval = oldval = curval; + if (newval.regs[0] < tmpval.regs[0]) + newval.regs[0] = tmpval.regs[0]; + if (newval.regs[1] < tmpval.regs[1]) + newval.regs[1] = tmpval.regs[1]; + if (newval.regs[2] < tmpval.regs[2]) + newval.regs[2] = tmpval.regs[2]; + if (newval.regs[3] < tmpval.regs[3]) + newval.regs[3] = tmpval.regs[3]; + if (newval.v32 == curval.v32) + break; + } while ((curval.v32 = atomicCAS(&dst_regs[index], + oldval.v32, + newval.v32)) != oldval.v32); + } + *p_accum_dclass = DATUM_CLASS__NORMAL; + } +} + +DEVICE_FUNCTION(void) +aggcalc_update_hll_sketch(cl_char *p_accum_dclass, + Datum *p_accum_datum, + cl_char newval_dclass, + Datum newval_datum) /* =int8 hash */ +{ + cl_uint nrooms = (1U << GPUPREAGG_HLL_REGISTER_BITS); + cl_uint index; + cl_uint count; + cl_uint *hll_regs; + + if (newval_dclass != DATUM_CLASS__NULL) + { + union { + cl_uchar regs[4]; + cl_uint v32; + } oldval, curval, newval; + + assert(newval_dclass == DATUM_CLASS__NORMAL); + + index = (newval_datum & (nrooms - 1)); + count = __clzll(__brevll(newval_datum >> GPUPREAGG_HLL_REGISTER_BITS)) + 1; + hll_regs = (cl_uint *)VARDATA(*p_accum_datum); + hll_regs += (index >> 2); + index &= 3; + + curval.v32 = __volatileRead(hll_regs); + do { + if (count <= curval.regs[index]) + break; + newval = oldval = curval; + newval.regs[index] = count; + } while ((curval.v32 = atomicCAS(hll_regs, + oldval.v32, + newval.v32)) != oldval.v32); + *p_accum_dclass = DATUM_CLASS__NORMAL; + } +} diff --git a/src/cuda_gpupreagg.h b/old/cuda_gpupreagg.h similarity index 100% rename from src/cuda_gpupreagg.h rename to old/cuda_gpupreagg.h diff --git a/old/cuda_gpuscan.cu b/old/cuda_gpuscan.cu new file mode 100644 index 000000000..90910c5df --- /dev/null +++ b/old/cuda_gpuscan.cu @@ -0,0 +1,750 @@ +/* + * libgpuscan.cu + * + * GPU implementation of GpuScan + * ---- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. 
+ */ +#include "cuda_common.h" +#include "cuda_gpuscan.h" +#include "cuda_gcache.h" + +/* + * gpuscan_main_row - GpuScan logic for KDS_FORMAT_ROW + */ +DEVICE_FUNCTION(void) +gpuscan_main_row(kern_context *kcxt, + kern_gpuscan *kgpuscan, + kern_data_store *kds_src, + kern_data_store *kds_dst, + bool has_device_projection) +{ + gpuscanSuspendContext *my_suspend + = KERN_GPUSCAN_SUSPEND_CONTEXT(kgpuscan, get_group_id()); + cl_uint part_index = 0; + cl_uint src_index; + cl_uint src_base; + cl_uint total_nitems_in = 0; /* stat */ + cl_uint total_nitems_out = 0; /* stat */ + cl_uint total_extra_size = 0; /* stat */ + __shared__ cl_uint dst_nitems_base; + __shared__ cl_ulong dst_usage_base; + + assert(kds_src->format == KDS_FORMAT_ROW); + assert(kds_dst->format == KDS_FORMAT_SLOT); + /* quick bailout if any error happens on the prior kernel */ + if (__syncthreads_count(kgpuscan->kerror.errcode) != 0) + return; + /* resume kernel from the point where suspended, if any */ + if (kgpuscan->resume_context) + { + assert(my_suspend != NULL); + part_index = my_suspend->part_index; + } + + for (src_base = get_global_base() + part_index * get_global_size(); + src_base < kds_src->nitems; + src_base += get_global_size(), part_index++) + { + kern_tupitem *tupitem = NULL; + cl_bool rc = false; + cl_uint nvalids; + cl_uint required = 0; + cl_uint nitems_offset; + cl_uint usage_offset = 0; + cl_uint usage_length = 0; + cl_uint suspend_kernel = 0; + cl_char *tup_dclass = NULL; + Datum *tup_values = NULL; + + /* rewind the varlena buffer */ + kcxt->vlpos = kcxt->vlbuf; + /* Evaluation of the rows by WHERE-clause */ + src_index = src_base + get_local_id(); + if (src_index < kds_src->nitems) + { + tupitem = KERN_DATA_STORE_TUPITEM(kds_src, src_index); + rc = gpuscan_quals_eval(kcxt, kds_src, + &tupitem->htup.t_ctid, + &tupitem->htup); + } + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* how many rows survived WHERE-clause evaluation?
*/ + nitems_offset = pgstromStairlikeBinaryCount(rc, &nvalids); + if (nvalids > 0) + { + /* extract the source tuple to the private slot, if any */ + if (rc) + { + kcxt->vlpos = kcxt->vlbuf; /* rewind */ + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); + + if (!tup_dclass || !tup_values) + { + STROM_CPU_FALLBACK(kcxt, ERRCODE_OUT_OF_MEMORY, + "out of memory"); + } + else + { + gpuscan_projection_tuple(kcxt, + kds_src, + &tupitem->htup, + &tupitem->htup.t_ctid, + tup_dclass, + tup_values); + required = kds_slot_compute_extra(kcxt, + kds_dst, + tup_dclass, + tup_values); + } + } + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* allocation of the destination buffer */ + usage_offset = pgstromStairlikeSum(__kds_packed(required), + &usage_length); + if (get_local_id() == 0) + { + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + curval.i.nitems = kds_dst->nitems; + curval.i.usage = kds_dst->usage; + do { + newval = oldval = curval; + newval.i.nitems += nvalids; + newval.i.usage += usage_length; + + if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_dst->length) + { + atomicAdd(&kgpuscan->suspend_count, 1); + suspend_kernel = 1; + break; + } + } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, + oldval.v64, + newval.v64)) != oldval.v64); + dst_nitems_base = oldval.i.nitems; + dst_usage_base = oldval.i.usage; + } + if (__syncthreads_count(suspend_kernel) > 0) + break; + /* store the result tuple on the destination buffer */ + if (rc) + { + cl_uint dst_index = dst_nitems_base + nitems_offset; + char *dst_extra = ((char *)kds_dst + kds_dst->length - + __kds_unpack(dst_usage_base + + usage_offset) - required); + kds_slot_store_values(kcxt, + kds_dst, + dst_index, + dst_extra, + tup_dclass, + tup_values); + } + } + /* update statistics */ + if (get_local_id() == 0) + { + total_nitems_in += Min(kds_src->nitems - src_base, + get_local_size()); + total_nitems_out += nvalids; + total_extra_size += __kds_unpack(usage_length); + } + } + /* write back statistics */ + if (get_local_id() == 0) + { + atomicAdd(&kgpuscan->nitems_in, total_nitems_in); + atomicAdd(&kgpuscan->nitems_out, total_nitems_out); + atomicAdd(&kgpuscan->extra_size, total_extra_size); + } + /* suspend the current position (even if normal exit) */ + if (my_suspend && get_local_id() == 0) + { + my_suspend->part_index = part_index; + my_suspend->line_index = 0; + } +} + +/* + * gpuscan_main_block - GpuScan logic for KDS_FORMAT_BLOCK + */ +DEVICE_FUNCTION(void) +gpuscan_main_block(kern_context *kcxt, + kern_gpuscan *kgpuscan, + kern_data_store *kds_src, + kern_data_store *kds_dst, + bool has_device_projection) +{ + gpuscanSuspendContext *my_suspend + = KERN_GPUSCAN_SUSPEND_CONTEXT(kgpuscan, get_group_id()); + cl_uint part_sz; + cl_uint n_parts; + cl_uint window_sz; + cl_uint part_base; + cl_uint part_index = 0; + cl_uint line_index = 0; + cl_uint total_nitems_in = 0; /* stat */ + cl_uint total_nitems_out = 0; /* stat */ + cl_uint total_extra_size = 0; /* stat */ + cl_bool thread_is_valid = false; + __shared__ cl_uint dst_nitems_base; + __shared__ cl_uint dst_usage_base; + + assert(kds_src->format == KDS_FORMAT_BLOCK); + assert(kds_dst->format == KDS_FORMAT_SLOT); + /* quick bailout if any error happens on the prior kernel */ + if
(__syncthreads_count(kgpuscan->kerror.errcode) != 0) + return; + + part_sz = KERN_DATA_STORE_PARTSZ(kds_src); + n_parts = get_local_size() / part_sz; + if (get_global_id() == 0) + kgpuscan->part_sz = part_sz; + if (get_local_id() < part_sz * n_parts) + thread_is_valid = true; + window_sz = n_parts * get_num_groups(); + + /* resume kernel from the point where suspended, if any */ + if (kgpuscan->resume_context) + { + part_index = my_suspend->part_index; + line_index = my_suspend->line_index; + } + __syncthreads(); + + for (;;) + { + cl_uint part_id; + cl_uint line_no; + cl_uint n_lines = 0; + + part_base = part_index * window_sz + get_group_id() * n_parts; + if (part_base >= kds_src->nitems) + break; + part_id = get_local_id() / part_sz + part_base; + line_no = get_local_id() % part_sz + line_index * part_sz; + + do { + HeapTupleHeaderData *htup = NULL; + ItemPointerData t_self; + PageHeaderData *pg_page; + BlockNumber block_nr; + cl_ushort t_len __attribute__((unused)); + cl_uint nvalids; + cl_uint required = 0; + cl_uint nitems_real; + cl_uint nitems_offset; + cl_uint usage_offset = 0; + cl_uint usage_length = 0; + cl_uint suspend_kernel = 0; + cl_bool rc = false; + cl_char *tup_dclass = NULL; + Datum *tup_values = NULL; + + /* rewind the varlena buffer */ + kcxt->vlpos = kcxt->vlbuf; + + /* identify the block */ + if (thread_is_valid && part_id < kds_src->nitems) + { + pg_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_src, part_id); + n_lines = PageGetMaxOffsetNumber(pg_page); + block_nr = KERN_DATA_STORE_BLOCK_BLCKNR(kds_src, part_id); + t_self.ip_blkid.bi_hi = block_nr >> 16; + t_self.ip_blkid.bi_lo = block_nr & 0xffff; + t_self.ip_posid = line_no + 1; + + if (line_no < n_lines) + { + ItemIdData *lpp = PageGetItemId(pg_page, line_no+1); + if (ItemIdIsNormal(lpp)) + htup = PageGetItem(pg_page, lpp); + t_len = ItemIdGetLength(lpp); + } + } + + /* evaluation of the qualifiers */ + if (htup) + { + rc = gpuscan_quals_eval(kcxt, + kds_src, + &t_self, + htup); + } + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + goto out_nostat; + + /* how many rows survived WHERE-clause evaluations?
*/ + nitems_offset = pgstromStairlikeBinaryCount(rc, &nvalids); + if (nvalids > 0) + { + /* store the result heap-tuple to the destination buffer */ + if (rc) + { + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); + + if (!tup_dclass || !tup_values) + { + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, + "out of memory"); + } + else + { + gpuscan_projection_tuple(kcxt, + kds_src, + htup, + &t_self, + tup_dclass, + tup_values); + required = kds_slot_compute_extra(kcxt, + kds_dst, + tup_dclass, + tup_values); + } + } + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + goto out; + /* allocation of the destination buffer */ + usage_offset = pgstromStairlikeSum(__kds_packed(required), + &usage_length); + if (get_local_id() == 0) + { + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + curval.i.nitems = kds_dst->nitems; + curval.i.usage = kds_dst->usage; + do { + newval = oldval = curval; + newval.i.nitems += nvalids; + newval.i.usage += usage_length; + + if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_dst->length) + { + atomicAdd(&kgpuscan->suspend_count, 1); + suspend_kernel = 1; + break; + } + } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, + oldval.v64, + newval.v64)) != oldval.v64); + dst_nitems_base = oldval.i.nitems; + dst_usage_base = oldval.i.usage; + } + if (__syncthreads_count(suspend_kernel) > 0) + goto out; + /* store the result heap tuple */ + if (rc) + { + cl_uint dst_index = dst_nitems_base + nitems_offset; + char *dst_extra = ((char *)kds_dst + kds_dst->length - + __kds_unpack(dst_usage_base + + usage_offset) - required); + kds_slot_store_values(kcxt, + kds_dst, + dst_index, + dst_extra, + tup_dclass, + tup_values); + } + } + /* update statistics */ + nitems_real = __syncthreads_count(htup != NULL); + if (get_local_id() == 0) + { + total_nitems_in += nitems_real; + total_nitems_out += nvalids; + total_extra_size += __kds_unpack(usage_length); + } + + /* + * Move to the next window of the line items, if any. + * If no thread in the CUDA block wants to continue, exit the loop.
+ */ + line_index++; + line_no += part_sz; + } while (__syncthreads_count(thread_is_valid && + line_no < n_lines) > 0); + /* move to the next window */ + part_index++; + line_index = 0; + } +out: + /* update statistics */ + if (get_local_id() == 0) + { + atomicAdd(&kgpuscan->nitems_in, total_nitems_in); + atomicAdd(&kgpuscan->nitems_out, total_nitems_out); + atomicAdd(&kgpuscan->extra_size, total_extra_size); + } +out_nostat: + if (get_local_id() == 0) + { + my_suspend->part_index = part_index; + my_suspend->line_index = line_index; + } +} + +/* + * gpuscan_main_arrow - GpuScan logic for KDS_FORMAT_ARROW + */ +DEVICE_FUNCTION(void) +gpuscan_main_arrow(kern_context *kcxt, + kern_gpuscan *kgpuscan, + kern_data_store *kds_src, + kern_data_store *kds_dst, + bool has_device_projection) +{ + gpuscanSuspendContext *my_suspend + = KERN_GPUSCAN_SUSPEND_CONTEXT(kgpuscan, get_group_id()); + cl_uint part_index = 0; + cl_uint src_base; + cl_uint src_index; + cl_uint total_nitems_in = 0; /* stat */ + cl_uint total_nitems_out = 0; /* stat */ + cl_uint total_extra_size = 0; /* stat */ + __shared__ cl_uint dst_nitems_base; + __shared__ cl_uint dst_usage_base; + + assert(kds_src->format == KDS_FORMAT_ARROW); + assert(kds_dst->format == KDS_FORMAT_SLOT); + /* quick bailout if any error happens on the prior kernel */ + if (__syncthreads_count(kgpuscan->kerror.errcode) != 0) + return; + /* resume kernel from the point where suspended, if any */ + if (kgpuscan->resume_context) + { + assert(my_suspend != NULL); + part_index = my_suspend->part_index; + } + + for (src_base = get_global_base() + part_index * get_global_size(); + src_base < kds_src->nitems; + src_base += get_global_size(), part_index++) + { + kern_tupitem *tupitem __attribute__((unused)); + cl_bool rc; + cl_uint nvalids; + cl_uint required = 0; + cl_uint nitems_offset; + cl_uint usage_offset = 0; + cl_uint usage_length = 0; + cl_uint suspend_kernel = 0; + cl_char *tup_dclass = NULL; + Datum *tup_values = NULL; + + /* rewind the varlena buffer */ + kcxt->vlpos = kcxt->vlbuf; + + /* Evaluation of the rows by WHERE-clause */ + src_index = src_base + get_local_id(); + if (src_index < kds_src->nitems) + rc = gpuscan_quals_eval_arrow(kcxt, kds_src, src_index); + else + rc = false; + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + + /* how many rows survived WHERE-clause evaluation?
*/ + nitems_offset = pgstromStairlikeBinaryCount(rc, &nvalids); + if (nvalids > 0) + { + if (rc) + { + kcxt->vlpos = kcxt->vlbuf; /* rewind */ + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); + + if (!tup_dclass || !tup_values) + { + STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, + "out of memory"); + } + else + { + gpuscan_projection_arrow(kcxt, + kds_src, + src_index, + tup_dclass, + tup_values); + required = kds_slot_compute_extra(kcxt, + kds_dst, + tup_dclass, + tup_values); + } + } + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* allocation of the destination buffer */ + usage_offset = pgstromStairlikeSum(__kds_packed(required), + &usage_length); + if (get_local_id() == 0) + { + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + curval.i.nitems = kds_dst->nitems; + curval.i.usage = kds_dst->usage; + do { + newval = oldval = curval; + newval.i.nitems += nvalids; + newval.i.usage += usage_length; + + if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_dst->length) + { + atomicAdd(&kgpuscan->suspend_count, 1); + suspend_kernel = 1; + break; + } + } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, + oldval.v64, + newval.v64)) != oldval.v64); + dst_nitems_base = oldval.i.nitems; + dst_usage_base = oldval.i.usage; + } + if (__syncthreads_count(suspend_kernel) > 0) + break; + /* store the result virtual-tuple on the destination buffer */ + if (rc) + { + cl_uint dst_index = dst_nitems_base + nitems_offset; + char *dst_extra = ((char *)kds_dst + kds_dst->length - + __kds_unpack(dst_usage_base + + usage_offset) - required); + kds_slot_store_values(kcxt, + kds_dst, + dst_index, + dst_extra, + tup_dclass, + tup_values); + } + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + } + /* update statistics */ + if (get_local_id() == 0) + { + total_nitems_in += Min(kds_src->nitems - src_base, + get_local_size()); + total_nitems_out += nvalids; + total_extra_size += __kds_unpack(usage_length); + } + } + /* write back statistics */ + if (get_local_id() == 0) + { + atomicAdd(&kgpuscan->nitems_in, total_nitems_in); + atomicAdd(&kgpuscan->nitems_out, total_nitems_out); + atomicAdd(&kgpuscan->extra_size, total_extra_size); + } + /* suspend the current position (even if normal exit) */ + if (my_suspend && get_local_id() == 0) + { + my_suspend->part_index = part_index; + my_suspend->line_index = 0; + } +} + +/* + * gpuscan_main_column - GpuScan logic for KDS_FORMAT_COLUMN + */ +DEVICE_FUNCTION(void) +gpuscan_main_column(kern_context *kcxt, + kern_gpuscan *kgpuscan, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + kern_data_store *kds_dst) +{ + gpuscanSuspendContext *my_suspend + = KERN_GPUSCAN_SUSPEND_CONTEXT(kgpuscan, get_group_id()); + cl_uint part_index = 0; + cl_uint src_base; + cl_uint total_nitems_in = 0; + cl_uint total_nitems_out = 0; + cl_uint total_extra_size = 0; + __shared__ cl_uint dst_nitems_base; + __shared__ cl_uint dst_usage_base; + + assert(kds_src->format == KDS_FORMAT_COLUMN && + kds_dst->format == KDS_FORMAT_SLOT); + /* quick bailout if any error happens on the prior kernel */ + if (__syncthreads_count(kgpuscan->kerror.errcode) != 0) + return; + /* resume kernel from the point where suspended, if any */ + if (kgpuscan->resume_context) + { + assert(my_suspend != NULL); +
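+ /* + * NOTE: gpuscan_main_xxxx kernels suspend themselves once kds_dst + * has no more room (suspend_count is incremented on overflow); when + * the kernel is relaunched with resume_context=true, each block + * restarts the scan from the partition index it saved on suspend. + */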
part_index = my_suspend->part_index; + } + + for (src_base = get_global_base() + part_index * get_global_size(); + src_base < kds_src->nitems; + src_base += get_global_size(), part_index++) + { + cl_uint src_index = src_base + get_local_id(); + cl_bool rc = false; + cl_uint nvalids; + cl_uint required = 0; + cl_uint nitems_offset; + cl_uint usage_offset = 0; + cl_uint usage_length = 0; + cl_uint suspend_kernel = 0; + cl_char *tup_dclass = NULL; + Datum *tup_values = NULL; + + /* rewind the varlena buffer */ + kcxt->vlpos = kcxt->vlbuf; + /* evaluation of the row using WHERE-clause */ + if (src_index < kds_src->nitems) + { + if (kern_check_visibility_column(kcxt, kds_src, src_index)) + { + rc = gpuscan_quals_eval_column(kcxt, + kds_src, + kds_extra, + src_index); + } + } + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* how many rows survived the evaluation above? */ + nitems_offset = pgstromStairlikeBinaryCount(rc, &nvalids); + if (nvalids > 0) + { + /* Ok, extract the source columns to form a result row */ + kcxt->vlpos = kcxt->vlbuf; /* rewind */ + if (rc) + { + tup_dclass = (cl_char *) + kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); + tup_values = (Datum *) + kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); + gpuscan_projection_column(kcxt, + kds_src, + kds_extra, + src_index, + tup_dclass, + tup_values); + required = kds_slot_compute_extra(kcxt, + kds_dst, + tup_dclass, + tup_values); + } + /* bailout if any error */ + if (__syncthreads_count(kcxt->errcode) > 0) + break; + /* allocation of the destination buffer */ + usage_offset = pgstromStairlikeSum(__kds_packed(required), + &usage_length); + if (get_local_id() == 0) + { + union { + struct { + cl_uint nitems; + cl_uint usage; + } i; + cl_ulong v64; + } oldval, curval, newval; + + curval.i.nitems = kds_dst->nitems; + curval.i.usage = kds_dst->usage; + do { + newval = oldval = curval; + newval.i.nitems += nvalids; + newval.i.usage += usage_length; + + if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_dst->length) + { + atomicAdd(&kgpuscan->suspend_count, 1); + suspend_kernel = 1; + break; + } + } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, + oldval.v64, + newval.v64)) != oldval.v64); + dst_nitems_base = oldval.i.nitems; + dst_usage_base = oldval.i.usage; + } + if (__syncthreads_count(suspend_kernel) > 0) + break; + /* store the result tuple on the destination buffer */ + if (rc) + { + cl_uint dst_index = dst_nitems_base + nitems_offset; + char *dst_extra = ((char *)kds_dst + kds_dst->length - + __kds_unpack(dst_usage_base + + usage_offset) - required); + kds_slot_store_values(kcxt, + kds_dst, + dst_index, + dst_extra, + tup_dclass, + tup_values); + } + } + /* update statistics */ + if (get_local_id() == 0) + { + total_nitems_in += Min(kds_src->nitems - src_base, + get_local_size()); + total_nitems_out += nvalids; + total_extra_size += __kds_unpack(usage_length); + } + } + /* write back statistics */ + if (get_local_id() == 0) + { + atomicAdd(&kgpuscan->nitems_in, total_nitems_in); + atomicAdd(&kgpuscan->nitems_out, total_nitems_out); + atomicAdd(&kgpuscan->extra_size, total_extra_size); + } + /* suspend the current position (even if normal exit) */ + if (my_suspend && get_local_id() == 0) + { + my_suspend->part_index = part_index; + my_suspend->line_index = 0; + } +} diff --git a/src/cuda_gpuscan.h b/old/cuda_gpuscan.h similarity index 100% rename from src/cuda_gpuscan.h rename to old/cuda_gpuscan.h diff --git
a/src/cuda_gpusort.cu b/old/cuda_gpusort.cu similarity index 100% rename from src/cuda_gpusort.cu rename to old/cuda_gpusort.cu diff --git a/src/cuda_gpusort.h b/old/cuda_gpusort.h similarity index 100% rename from src/cuda_gpusort.h rename to old/cuda_gpusort.h diff --git a/src/cuda_jsonlib.cu b/old/cuda_jsonlib.cu similarity index 100% rename from src/cuda_jsonlib.cu rename to old/cuda_jsonlib.cu diff --git a/src/cuda_jsonlib.h b/old/cuda_jsonlib.h similarity index 100% rename from src/cuda_jsonlib.h rename to old/cuda_jsonlib.h diff --git a/src/cuda_misclib.cu b/old/cuda_misclib.cu similarity index 100% rename from src/cuda_misclib.cu rename to old/cuda_misclib.cu diff --git a/src/cuda_misclib.h b/old/cuda_misclib.h similarity index 100% rename from src/cuda_misclib.h rename to old/cuda_misclib.h diff --git a/src/cuda_numeric.cu b/old/cuda_numeric.cu similarity index 100% rename from src/cuda_numeric.cu rename to old/cuda_numeric.cu diff --git a/src/cuda_numeric.h b/old/cuda_numeric.h similarity index 100% rename from src/cuda_numeric.h rename to old/cuda_numeric.h diff --git a/src/cuda_postgis.cu b/old/cuda_postgis.cu similarity index 100% rename from src/cuda_postgis.cu rename to old/cuda_postgis.cu diff --git a/src/cuda_postgis.h b/old/cuda_postgis.h similarity index 100% rename from src/cuda_postgis.h rename to old/cuda_postgis.h diff --git a/src/cuda_primitive.cu b/old/cuda_primitive.cu similarity index 100% rename from src/cuda_primitive.cu rename to old/cuda_primitive.cu diff --git a/src/cuda_primitive.h b/old/cuda_primitive.h similarity index 100% rename from src/cuda_primitive.h rename to old/cuda_primitive.h diff --git a/src/cuda_program.c b/old/cuda_program.c similarity index 100% rename from src/cuda_program.c rename to old/cuda_program.c diff --git a/src/cuda_rangetype.cu b/old/cuda_rangetype.cu similarity index 100% rename from src/cuda_rangetype.cu rename to old/cuda_rangetype.cu diff --git a/src/cuda_rangetype.h b/old/cuda_rangetype.h similarity index 100% rename from src/cuda_rangetype.h rename to old/cuda_rangetype.h diff --git a/src/cuda_textlib.cu b/old/cuda_textlib.cu similarity index 100% rename from src/cuda_textlib.cu rename to old/cuda_textlib.cu diff --git a/src/cuda_textlib.h b/old/cuda_textlib.h similarity index 100% rename from src/cuda_textlib.h rename to old/cuda_textlib.h diff --git a/src/cuda_timelib.cu b/old/cuda_timelib.cu similarity index 100% rename from src/cuda_timelib.cu rename to old/cuda_timelib.cu diff --git a/src/cuda_timelib.h b/old/cuda_timelib.h similarity index 100% rename from src/cuda_timelib.h rename to old/cuda_timelib.h diff --git a/src/cuda_utils.h b/old/cuda_utils.h similarity index 100% rename from src/cuda_utils.h rename to old/cuda_utils.h diff --git a/src/datastore.c b/old/datastore.c similarity index 100% rename from src/datastore.c rename to old/datastore.c diff --git a/src/device_attrs.h b/old/device_attrs.h similarity index 100% rename from src/device_attrs.h rename to old/device_attrs.h diff --git a/old/extra.c b/old/extra.c new file mode 100644 index 000000000..38b1383df --- /dev/null +++ b/old/extra.c @@ -0,0 +1,611 @@ +/* + * extra.c + * + * Stuff related to invoking the HeteroDB Extra Module + * ---- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License.
+ */ +#include <dlfcn.h> +#include "pg_strom.h" + +/* pg_strom.gpudirect_driver */ +#define GPUDIRECT_DRIVER_TYPE__NONE 1 +#define GPUDIRECT_DRIVER_TYPE__CUFILE 2 +#define GPUDIRECT_DRIVER_TYPE__NVME_STROM 3 + +static struct config_enum_entry pgstrom_gpudirect_driver_options[4]; +static int __pgstrom_gpudirect_driver; /* GUC */ + +PG_FUNCTION_INFO_V1(pgstrom_license_query); + +/* + * heterodbExtraModuleInit + */ +static char *(*p_heterodb_extra_module_init)(unsigned int pg_version_num) = NULL; + +static char * +heterodbExtraModuleInit(void) +{ + char *res; + + if (!p_heterodb_extra_module_init) + elog(ERROR, "HeteroDB Extra module is not loaded yet"); + res = p_heterodb_extra_module_init(PG_VERSION_NUM); + if (!res) + elog(ERROR, "out of memory"); + return res; +} + +/* + * heterodbExtraEreport + */ +static heterodb_extra_error_info *p_heterodb_extra_error_data = NULL; + +static void +heterodbExtraEreport(int elevel) +{ + /* see ereport_domain definition */ +#if PG_VERSION_NUM >= 130000 + pg_prevent_errno_in_scope(); + if (errstart(elevel, TEXTDOMAIN)) + { + errcode(ERRCODE_INTERNAL_ERROR); + errmsg("[extra] %s", p_heterodb_extra_error_data->message); + errfinish(p_heterodb_extra_error_data->filename, + p_heterodb_extra_error_data->lineno, + p_heterodb_extra_error_data->funcname); + } +#else +#if PG_VERSION_NUM >= 120000 + pg_prevent_errno_in_scope(); +#endif + if (errstart(elevel, + p_heterodb_extra_error_data->filename, + p_heterodb_extra_error_data->lineno, + p_heterodb_extra_error_data->funcname, + TEXTDOMAIN)) + { + errcode(ERRCODE_INTERNAL_ERROR); + errmsg("%s", p_heterodb_extra_error_data->message); + errfinish(0); + } +#endif +} + +/* + * heterodbLicenseReload + */ +static int (*p_heterodb_license_reload)(void) = NULL; +static int +heterodbLicenseReload(void) +{ + if (!p_heterodb_license_reload) + return -1; + return p_heterodb_license_reload(); +} + +/* + * heterodbLicenseQuery + */ +static ssize_t (*p_heterodb_license_query)( + char *buf, + size_t bufsz) = NULL; + +static ssize_t +heterodbLicenseQuery(char *buf, size_t bufsz) +{ + if (!p_heterodb_license_query) + return -1; + return p_heterodb_license_query(buf, bufsz); +} + +/* + * pgstrom_license_query + */ +static char * +__heterodb_license_query(void) +{ + char *buf; + size_t bufsz; + ssize_t nbytes; + + if (heterodbLicenseReload() <= 0) + return NULL; + + bufsz = 2048; +retry: + buf = alloca(bufsz); + nbytes = heterodbLicenseQuery(buf, bufsz); + if (nbytes < 0) + return NULL; + if (nbytes < bufsz) + return pstrdup(buf); + bufsz += bufsz; + goto retry; +} + +Datum +pgstrom_license_query(PG_FUNCTION_ARGS) +{ + char *license; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("only superuser can query commercial license")))); + license = __heterodb_license_query(); + if (!license) + PG_RETURN_NULL(); + + PG_RETURN_POINTER(DirectFunctionCall1(json_in, PointerGetDatum(license))); +} + +/* + * gpuDirectInitDriver + */ +static int (*p_gpudirect_init_driver)() = NULL; + +int +gpuDirectInitDriver(void) +{ + int rv = -1; + + if (p_gpudirect_init_driver) + { + rv = p_gpudirect_init_driver(); + if (rv) + heterodbExtraEreport(LOG); + } + return rv; +} + +/* + * gpuDirectOpenDriver + */ +static int (*p_gpudirect_open_driver)() = NULL; +static void +gpuDirectOpenDriver(void) +{ + if (p_gpudirect_open_driver) + { + if (p_gpudirect_open_driver()) + heterodbExtraEreport(ERROR); + } +} + +/* + * gpuDirectCloseDriver + */ +static int (*p_gpudirect_close_driver)() = NULL; +static bool
gpudirect_close_driver_is_registered = false; + +static void +gpuDirectCloseDriverOnExit(int code, Datum arg) +{ + if (p_gpudirect_close_driver) + { + if (p_gpudirect_close_driver()) + heterodbExtraEreport(LOG); + } +} + +/* + * gpuDirectFileDescOpen + */ +static int (*p_gpudirect_file_desc_open)( + GPUDirectFileDesc *gds_fdesc, + int rawfd, const char *pathname) = NULL; + +void +gpuDirectFileDescOpen(GPUDirectFileDesc *gds_fdesc, File pg_fdesc) +{ + int rawfd = FileGetRawDesc(pg_fdesc); + char *pathname = FilePathName(pg_fdesc); + + if (!gpudirect_close_driver_is_registered) + { + gpuDirectOpenDriver(); + on_proc_exit(gpuDirectCloseDriverOnExit, 0); + gpudirect_close_driver_is_registered = true; + } + if (p_gpudirect_file_desc_open(gds_fdesc, rawfd, pathname)) + heterodbExtraEreport(ERROR); +} + +/* + * gpuDirectFileDescOpenByPath + */ +static int (*p_gpudirect_file_desc_open_by_path)( + GPUDirectFileDesc *gds_fdesc, + const char *pathname) = NULL; + +void +gpuDirectFileDescOpenByPath(GPUDirectFileDesc *gds_fdesc, + const char *pathname) +{ + if (!gpudirect_close_driver_is_registered) + { + gpuDirectOpenDriver(); + on_proc_exit(gpuDirectCloseDriverOnExit, 0); + gpudirect_close_driver_is_registered = true; + } + if (p_gpudirect_file_desc_open_by_path(gds_fdesc, pathname)) + heterodbExtraEreport(ERROR); +} + +/* + * gpuDirectFileDescClose + */ +static void (*p_gpudirect_file_desc_close)( + const GPUDirectFileDesc *gds_fdesc) = NULL; + +void +gpuDirectFileDescClose(const GPUDirectFileDesc *gds_fdesc) +{ + Assert(p_gpudirect_file_desc_close != NULL); + p_gpudirect_file_desc_close(gds_fdesc); +} + +/* + * gpuDirectMapGpuMemory + */ +static CUresult (*p_gpudirect_map_gpu_memory)( + CUdeviceptr m_segment, + size_t m_segment_sz, + unsigned long *p_iomap_handle) = NULL; + +CUresult +gpuDirectMapGpuMemory(CUdeviceptr m_segment, + size_t m_segment_sz, + unsigned long *p_iomap_handle) +{ + Assert(p_gpudirect_map_gpu_memory != NULL); + return p_gpudirect_map_gpu_memory(m_segment, m_segment_sz, p_iomap_handle); +} + +/* + * gpuDirectUnmapGpuMemory + */ +static CUresult (*p_gpudirect_unmap_gpu_memory)( + CUdeviceptr m_segment, + unsigned long iomap_handle) = NULL; + +CUresult +gpuDirectUnmapGpuMemory(CUdeviceptr m_segment, + unsigned long iomap_handle) +{ + Assert(p_gpudirect_unmap_gpu_memory != NULL); + return p_gpudirect_unmap_gpu_memory(m_segment, iomap_handle); +} + +/* + * gpuDirectFileReadIOV + */ +static int (*p_gpudirect_file_read_iov)( + const GPUDirectFileDesc *gds_fdesc, + CUdeviceptr m_segment, + unsigned long iomap_handle, + off_t m_offset, + strom_io_vector *iovec) = NULL; + +void +gpuDirectFileReadIOV(const GPUDirectFileDesc *gds_fdesc, + CUdeviceptr m_segment, + unsigned long iomap_handle, + off_t m_offset, + strom_io_vector *iovec) +{ + Assert(p_gpudirect_file_read_iov != NULL); + if (p_gpudirect_file_read_iov(gds_fdesc, + m_segment, + iomap_handle, + m_offset, + iovec)) + werror("failed on gpuDirectFileReadIOV"); +} + +/* + * extraSysfsSetupDistanceMap + */ +static int (*p_sysfs_setup_distance_map)( + int gpu_count, + GpuPciDevItem *gpu_array, + const char *manual_config) = NULL; + +void +extraSysfsSetupDistanceMap(const char *manual_config) +{ + GpuPciDevItem *gpu_array; + int i; + + if (!p_sysfs_setup_distance_map) + return; /* nothing to do */ + + gpu_array = alloca(numDevAttrs * sizeof(GpuPciDevItem)); + memset(gpu_array, 0, numDevAttrs * sizeof(GpuPciDevItem)); + for (i=0; i < numDevAttrs; i++) + { + DevAttributes *dattr = &devAttrs[i]; + GpuPciDevItem *gpu = &gpu_array[i]; + + 
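+ /* + * Pack the PCI-e location of each GPU, so that the extra module + * can build the GPU-to-NVME distance map from the sysfs device + * topology (optionally adjusted by the manual_config parameter). + */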
gpu->device_id = dattr->DEV_ID; + strncpy(gpu->device_name, dattr->DEV_NAME, + sizeof(gpu->device_name)); + gpu->pci_domain = dattr->PCI_DOMAIN_ID; + gpu->pci_bus_id = dattr->PCI_BUS_ID; + gpu->pci_dev_id = dattr->PCI_DEVICE_ID; + if (dattr->MULTI_GPU_BOARD) + gpu->pci_func_id = dattr->MULTI_GPU_BOARD_GROUP_ID; + } + if (p_sysfs_setup_distance_map(numDevAttrs, + gpu_array, + manual_config) < 0) + heterodbExtraEreport(ERROR); +} + +/* + * extraSysfsLookupOptimalGpus + */ +static int (*p_sysfs_lookup_optimal_gpus)(int fdesc, + int nrooms, + int *optimal_gpus) = NULL; +Bitmapset * +extraSysfsLookupOptimalGpus(File filp) +{ + Bitmapset *optimal_gpus = NULL; + int fdesc = FileGetRawDesc(filp); + int i, nitems; + int *__gpus; + + if (!p_sysfs_lookup_optimal_gpus || numDevAttrs == 0) + return NULL; + __gpus = alloca(sizeof(int) * numDevAttrs); + nitems = p_sysfs_lookup_optimal_gpus(fdesc, numDevAttrs, __gpus); + if (nitems < 0) + heterodbExtraEreport(ERROR); + for (i=0; i < nitems; i++) + { + Assert(__gpus[i] >= 0 && __gpus[i] < numDevAttrs); + optimal_gpus = bms_add_member(optimal_gpus, __gpus[i]); + } + return optimal_gpus; +} + +/* + * extraSysfsPrintNvmeInfo + */ +static ssize_t (*p_sysfs_print_nvme_info)( + int index, + char *buffer, + ssize_t buffer_sz) = NULL; + +ssize_t +extraSysfsPrintNvmeInfo(int index, char *buffer, ssize_t buffer_sz) +{ + if (!p_sysfs_print_nvme_info) + return -1; + return p_sysfs_print_nvme_info(index, buffer, buffer_sz); +} + +/* lookup_heterodb_extra_function */ +static void * +lookup_heterodb_extra_function(void *handle, const char *symbol) +{ + void *fn_addr; + + fn_addr = dlsym(handle, symbol); + if (!fn_addr) + elog(ERROR, "could not find extra symbol \"%s\" - %s", + symbol, dlerror()); + return fn_addr; +} +#define LOOKUP_HETERODB_EXTRA_FUNCTION(symbol) \ + p_##symbol = lookup_heterodb_extra_function(handle, #symbol) + +/* lookup_gpudirect_function */ +static void * +lookup_gpudirect_function(void *handle, const char *prefix, const char *func_name) +{ + char symbol[128]; + + snprintf(symbol, sizeof(symbol), "%s__%s", prefix, func_name); + return lookup_heterodb_extra_function(handle, symbol); +} + +#define LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix,func_name) \ + p_gpudirect_##func_name = lookup_gpudirect_function(handle, prefix, #func_name) + +/* + * parse_heterodb_extra_module_info + */ +static void +parse_heterodb_extra_module_info(const char *extra_module_info, + uint32 *p_api_version, + bool *p_has_cufile, + bool *p_has_nvme_strom, + int *p_default_gpudirect_driver) +{ + char *buffer; + long api_version = 0; + bool has_cufile = false; + bool has_nvme_strom = false; + int default_gpudirect_driver = GPUDIRECT_DRIVER_TYPE__NONE; + char *tok, *pos, *end; + struct config_enum_entry *entry; + + buffer = alloca(strlen(extra_module_info) + 1); + strcpy(buffer, extra_module_info); + for (tok = strtok_r(buffer, ",", &pos); + tok != NULL; + tok = strtok_r(NULL, ",", &pos)) + { + if (strncmp(tok, "api_version=", 12) == 0) + { + api_version = strtol(tok+12, &end, 10); + if (api_version < 0 || *end != '\0') + elog(ERROR, "invalid extra module token [%s]", tok); + } + else if (strncmp(tok, "cufile=", 7) == 0) + { + if (strcmp(tok+7, "on") == 0) + has_cufile = true; + else if (strcmp(tok+7, "off") == 0) + has_cufile = false; + else + elog(ERROR, "invalid extra module token [%s]", tok); + } + else if (strncmp(tok, "nvme_strom=", 11) == 0) + { + if (strcmp(tok+11, "on") == 0) + has_nvme_strom = true; + else if (strcmp(tok+11, "off") == 0) + has_nvme_strom = false; + else
elog(ERROR, "invalid extra module token [%s]", tok); + } + } + + if (api_version < HETERODB_EXTRA_API_VERSION) + elog(ERROR, "HeteroDB Extra Module has Unsupported API version [%08lu]", + api_version); + + /* setup pgstrom.gpudirect_driver options */ + entry = pgstrom_gpudirect_driver_options; + entry->name = "none"; + entry->val = GPUDIRECT_DRIVER_TYPE__NONE; + entry->hidden = false; + entry++; + + if (has_nvme_strom) + { + default_gpudirect_driver = GPUDIRECT_DRIVER_TYPE__NVME_STROM; + entry->name = "nvme_strom"; + entry->val = GPUDIRECT_DRIVER_TYPE__NVME_STROM; + entry->hidden = false; + entry++; + } + if (has_cufile) + { + default_gpudirect_driver = GPUDIRECT_DRIVER_TYPE__CUFILE; + entry->name = "cufile"; + entry->val = GPUDIRECT_DRIVER_TYPE__CUFILE; + entry->hidden = false; + entry++; + } + memset(entry, 0, sizeof(struct config_enum_entry)); + + *p_api_version = api_version; + *p_has_cufile = has_cufile; + *p_has_nvme_strom = has_nvme_strom; + *p_default_gpudirect_driver = default_gpudirect_driver; +} + +/* + * pgstrom_init_extra + */ +void +pgstrom_init_extra(void) +{ + const char *prefix = NULL; + void *handle; + char *license; + char *extra_module_info; + + /* load the extra module */ + handle = dlopen(HETERODB_EXTRA_FILENAME, + RTLD_NOW | RTLD_LOCAL); + if (!handle) + { + handle = dlopen(HETERODB_EXTRA_PATHNAME, RTLD_NOW | RTLD_LOCAL); + if (!handle) + { + elog(LOG, "HeteroDB Extra module is not available"); + return; + } + } + + PG_TRY(); + { + uint32 api_version = 0; + bool has_cufile = false; + bool has_nvme_strom = false; + int default_gpudirect_driver; + + LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_extra_error_data); + LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_extra_module_init); + extra_module_info = heterodbExtraModuleInit(); + parse_heterodb_extra_module_info(extra_module_info, + &api_version, + &has_cufile, + &has_nvme_strom, + &default_gpudirect_driver); + if (api_version < HETERODB_EXTRA_API_VERSION) + elog(ERROR, "HeteroDB Extra module is too old [API version=%u]", + api_version); + /* pg_strom.gpudirect_driver */ + DefineCustomEnumVariable("pg_strom.gpudirect_driver", + "Selection of the GPUDirectSQL Driver", + NULL, + &__pgstrom_gpudirect_driver, + default_gpudirect_driver, + pgstrom_gpudirect_driver_options, + PGC_POSTMASTER, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + if (__pgstrom_gpudirect_driver == GPUDIRECT_DRIVER_TYPE__CUFILE) + prefix = "cufile"; + else if (__pgstrom_gpudirect_driver == GPUDIRECT_DRIVER_TYPE__NVME_STROM) + prefix = "nvme_strom"; + + if (prefix) + { + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, init_driver); + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, open_driver); + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, close_driver); + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, file_desc_open); + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, file_desc_open_by_path); + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, file_desc_close); + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, map_gpu_memory); + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, unmap_gpu_memory); + LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, file_read_iov); + } + LOOKUP_HETERODB_EXTRA_FUNCTION(sysfs_setup_distance_map); + LOOKUP_HETERODB_EXTRA_FUNCTION(sysfs_lookup_optimal_gpus); + LOOKUP_HETERODB_EXTRA_FUNCTION(sysfs_print_nvme_info); + LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_license_reload); + LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_license_query); + } + PG_CATCH(); + { + p_heterodb_extra_error_data = NULL; + p_heterodb_extra_module_init = NULL; + p_gpudirect_init_driver = NULL; + p_gpudirect_open_driver = NULL; + 
p_gpudirect_close_driver = NULL; + p_gpudirect_file_desc_open = NULL; + p_gpudirect_file_desc_open_by_path = NULL; + p_gpudirect_file_desc_close = NULL; + p_gpudirect_map_gpu_memory = NULL; + p_gpudirect_unmap_gpu_memory = NULL; + p_gpudirect_file_read_iov = NULL; + p_sysfs_setup_distance_map = NULL; + p_sysfs_lookup_optimal_gpus = NULL; + p_sysfs_print_nvme_info = NULL; + p_heterodb_license_reload = NULL; + p_heterodb_license_query = NULL; + PG_RE_THROW(); + } + PG_END_TRY(); + elog(LOG, "HeteroDB Extra module loaded [%s]", extra_module_info); + + license = __heterodb_license_query(); + if (license) + { + elog(LOG, "HeteroDB License: %s", license); + pfree(license); + } +} diff --git a/next/float2.c b/old/float2.c similarity index 50% rename from next/float2.c rename to old/float2.c index 3e591b48c..4d3d1bfa4 100644 --- a/next/float2.c +++ b/old/float2.c @@ -3,8 +3,8 @@ * * half-precision floating point data type support * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. @@ -12,27 +12,16 @@ #include "pg_strom.h" #include "float2.h" -#ifndef EMULATE_FLOAT2 -#define PG_GETARG_FP16(x) __short_as_half__(PG_GETARG_UINT16(x)) -#define PG_GETARG_FP16_AS_FP32(x) ((float)PG_GETARG_FP16(x)) -#define PG_GETARG_FP16_AS_FP64(x) ((double)PG_GETARG_FP16(x)) -#define PG_RETURN_FP16(x) PG_RETURN_UINT16(__half_as_short__(x)) -#define PG_RETURN_FP32_AS_FP16(x) PG_RETURN_FP16((float2_t)(x)) -#define PG_RETURN_FP64_AS_FP16(x) PG_RETURN_FP16((float2_t)(x)) -#else -#define PG_GETARG_FP16(x) PG_GETARG_UINT16(x) -#define PG_GETARG_FP16_AS_FP32(x) fp16_to_fp32(PG_GETARG_FP16(x)) -#define PG_GETARG_FP16_AS_FP64(x) fp16_to_fp64(PG_GETARG_FP16(x)) -#define PG_RETURN_FP16(x) PG_RETURN_UINT16(x) -#define PG_RETURN_FP32_AS_FP16(x) PG_RETURN_FP16(fp32_to_fp16(x)) -#define PG_RETURN_FP64_AS_FP16(x) PG_RETURN_FP16(fp64_to_fp16(x)) -#endif +#define PG_GETARG_FLOAT2(x) PG_GETARG_INT16(x) +#define PG_RETURN_FLOAT2(x) PG_RETURN_INT16(x) +#define DatumGetFloat2(x) DatumGetInt16(x) +#define Float2GetDatum(x) Int16GetDatum(x) /* type i/o handler */ -PG_FUNCTION_INFO_V1(pgstrom_float2in); -PG_FUNCTION_INFO_V1(pgstrom_float2out); -PG_FUNCTION_INFO_V1(pgstrom_float2recv); -PG_FUNCTION_INFO_V1(pgstrom_float2send); +PG_FUNCTION_INFO_V1(pgstrom_float2_in); +PG_FUNCTION_INFO_V1(pgstrom_float2_out); +PG_FUNCTION_INFO_V1(pgstrom_float2_recv); +PG_FUNCTION_INFO_V1(pgstrom_float2_send); /* type cast */ PG_FUNCTION_INFO_V1(pgstrom_float2_to_float4); PG_FUNCTION_INFO_V1(pgstrom_float2_to_float8); @@ -49,88 +38,95 @@ PG_FUNCTION_INFO_V1(pgstrom_int4_to_float2); PG_FUNCTION_INFO_V1(pgstrom_int8_to_float2); PG_FUNCTION_INFO_V1(pgstrom_numeric_to_float2); /* type comparison */ -PG_FUNCTION_INFO_V1(pgstrom_float2eq); -PG_FUNCTION_INFO_V1(pgstrom_float2ne); -PG_FUNCTION_INFO_V1(pgstrom_float2lt); -PG_FUNCTION_INFO_V1(pgstrom_float2le); -PG_FUNCTION_INFO_V1(pgstrom_float2gt); -PG_FUNCTION_INFO_V1(pgstrom_float2ge); -PG_FUNCTION_INFO_V1(pgstrom_float2cmp); -PG_FUNCTION_INFO_V1(pgstrom_float2larger); -PG_FUNCTION_INFO_V1(pgstrom_float2smaller); -PG_FUNCTION_INFO_V1(pgstrom_float2hash); - -PG_FUNCTION_INFO_V1(pgstrom_float42eq); -PG_FUNCTION_INFO_V1(pgstrom_float42ne); -PG_FUNCTION_INFO_V1(pgstrom_float42lt); -PG_FUNCTION_INFO_V1(pgstrom_float42le); -PG_FUNCTION_INFO_V1(pgstrom_float42gt); 
-PG_FUNCTION_INFO_V1(pgstrom_float42ge); -PG_FUNCTION_INFO_V1(pgstrom_float42cmp); - -PG_FUNCTION_INFO_V1(pgstrom_float82eq); -PG_FUNCTION_INFO_V1(pgstrom_float82ne); -PG_FUNCTION_INFO_V1(pgstrom_float82lt); -PG_FUNCTION_INFO_V1(pgstrom_float82le); -PG_FUNCTION_INFO_V1(pgstrom_float82gt); -PG_FUNCTION_INFO_V1(pgstrom_float82ge); -PG_FUNCTION_INFO_V1(pgstrom_float82cmp); - -PG_FUNCTION_INFO_V1(pgstrom_float24eq); -PG_FUNCTION_INFO_V1(pgstrom_float24ne); -PG_FUNCTION_INFO_V1(pgstrom_float24lt); -PG_FUNCTION_INFO_V1(pgstrom_float24le); -PG_FUNCTION_INFO_V1(pgstrom_float24gt); -PG_FUNCTION_INFO_V1(pgstrom_float24ge); -PG_FUNCTION_INFO_V1(pgstrom_float24cmp); - -PG_FUNCTION_INFO_V1(pgstrom_float28eq); -PG_FUNCTION_INFO_V1(pgstrom_float28ne); -PG_FUNCTION_INFO_V1(pgstrom_float28lt); -PG_FUNCTION_INFO_V1(pgstrom_float28le); -PG_FUNCTION_INFO_V1(pgstrom_float28gt); -PG_FUNCTION_INFO_V1(pgstrom_float28ge); -PG_FUNCTION_INFO_V1(pgstrom_float28cmp); +PG_FUNCTION_INFO_V1(pgstrom_float2_eq); +PG_FUNCTION_INFO_V1(pgstrom_float2_ne); +PG_FUNCTION_INFO_V1(pgstrom_float2_lt); +PG_FUNCTION_INFO_V1(pgstrom_float2_le); +PG_FUNCTION_INFO_V1(pgstrom_float2_gt); +PG_FUNCTION_INFO_V1(pgstrom_float2_ge); +PG_FUNCTION_INFO_V1(pgstrom_float2_cmp); +PG_FUNCTION_INFO_V1(pgstrom_float2_larger); +PG_FUNCTION_INFO_V1(pgstrom_float2_smaller); +PG_FUNCTION_INFO_V1(pgstrom_float2_hash); + +PG_FUNCTION_INFO_V1(pgstrom_float42_eq); +PG_FUNCTION_INFO_V1(pgstrom_float42_ne); +PG_FUNCTION_INFO_V1(pgstrom_float42_lt); +PG_FUNCTION_INFO_V1(pgstrom_float42_le); +PG_FUNCTION_INFO_V1(pgstrom_float42_gt); +PG_FUNCTION_INFO_V1(pgstrom_float42_ge); +PG_FUNCTION_INFO_V1(pgstrom_float42_cmp); + +PG_FUNCTION_INFO_V1(pgstrom_float82_eq); +PG_FUNCTION_INFO_V1(pgstrom_float82_ne); +PG_FUNCTION_INFO_V1(pgstrom_float82_lt); +PG_FUNCTION_INFO_V1(pgstrom_float82_le); +PG_FUNCTION_INFO_V1(pgstrom_float82_gt); +PG_FUNCTION_INFO_V1(pgstrom_float82_ge); +PG_FUNCTION_INFO_V1(pgstrom_float82_cmp); + +PG_FUNCTION_INFO_V1(pgstrom_float24_eq); +PG_FUNCTION_INFO_V1(pgstrom_float24_ne); +PG_FUNCTION_INFO_V1(pgstrom_float24_lt); +PG_FUNCTION_INFO_V1(pgstrom_float24_le); +PG_FUNCTION_INFO_V1(pgstrom_float24_gt); +PG_FUNCTION_INFO_V1(pgstrom_float24_ge); +PG_FUNCTION_INFO_V1(pgstrom_float24_cmp); + +PG_FUNCTION_INFO_V1(pgstrom_float28_eq); +PG_FUNCTION_INFO_V1(pgstrom_float28_ne); +PG_FUNCTION_INFO_V1(pgstrom_float28_lt); +PG_FUNCTION_INFO_V1(pgstrom_float28_le); +PG_FUNCTION_INFO_V1(pgstrom_float28_gt); +PG_FUNCTION_INFO_V1(pgstrom_float28_ge); +PG_FUNCTION_INFO_V1(pgstrom_float28_cmp); /* unary operators */ -PG_FUNCTION_INFO_V1(pgstrom_float2up); -PG_FUNCTION_INFO_V1(pgstrom_float2um); -PG_FUNCTION_INFO_V1(pgstrom_float2abs); +PG_FUNCTION_INFO_V1(pgstrom_float2_up); +PG_FUNCTION_INFO_V1(pgstrom_float2_um); +PG_FUNCTION_INFO_V1(pgstrom_float2_abs); /* arithmetric operators */ -PG_FUNCTION_INFO_V1(pgstrom_float2pl); -PG_FUNCTION_INFO_V1(pgstrom_float2mi); -PG_FUNCTION_INFO_V1(pgstrom_float2mul); -PG_FUNCTION_INFO_V1(pgstrom_float2div); - -PG_FUNCTION_INFO_V1(pgstrom_float24pl); -PG_FUNCTION_INFO_V1(pgstrom_float24mi); -PG_FUNCTION_INFO_V1(pgstrom_float24mul); -PG_FUNCTION_INFO_V1(pgstrom_float24div); - -PG_FUNCTION_INFO_V1(pgstrom_float28pl); -PG_FUNCTION_INFO_V1(pgstrom_float28mi); -PG_FUNCTION_INFO_V1(pgstrom_float28mul); -PG_FUNCTION_INFO_V1(pgstrom_float28div); - -PG_FUNCTION_INFO_V1(pgstrom_float42pl); -PG_FUNCTION_INFO_V1(pgstrom_float42mi); -PG_FUNCTION_INFO_V1(pgstrom_float42mul); -PG_FUNCTION_INFO_V1(pgstrom_float42div); - 
-PG_FUNCTION_INFO_V1(pgstrom_float82pl); -PG_FUNCTION_INFO_V1(pgstrom_float82mi); -PG_FUNCTION_INFO_V1(pgstrom_float82mul); -PG_FUNCTION_INFO_V1(pgstrom_float82div); +PG_FUNCTION_INFO_V1(pgstrom_float2_pl); +PG_FUNCTION_INFO_V1(pgstrom_float2_mi); +PG_FUNCTION_INFO_V1(pgstrom_float2_mul); +PG_FUNCTION_INFO_V1(pgstrom_float2_div); + +PG_FUNCTION_INFO_V1(pgstrom_float24_pl); +PG_FUNCTION_INFO_V1(pgstrom_float24_mi); +PG_FUNCTION_INFO_V1(pgstrom_float24_mul); +PG_FUNCTION_INFO_V1(pgstrom_float24_div); + +PG_FUNCTION_INFO_V1(pgstrom_float28_pl); +PG_FUNCTION_INFO_V1(pgstrom_float28_mi); +PG_FUNCTION_INFO_V1(pgstrom_float28_mul); +PG_FUNCTION_INFO_V1(pgstrom_float28_div); + +PG_FUNCTION_INFO_V1(pgstrom_float42_pl); +PG_FUNCTION_INFO_V1(pgstrom_float42_mi); +PG_FUNCTION_INFO_V1(pgstrom_float42_mul); +PG_FUNCTION_INFO_V1(pgstrom_float42_div); + +PG_FUNCTION_INFO_V1(pgstrom_float82_pl); +PG_FUNCTION_INFO_V1(pgstrom_float82_mi); +PG_FUNCTION_INFO_V1(pgstrom_float82_mul); +PG_FUNCTION_INFO_V1(pgstrom_float82_div); /* misc functions */ PG_FUNCTION_INFO_V1(pgstrom_cash_mul_flt2); PG_FUNCTION_INFO_V1(pgstrom_flt2_mul_cash); PG_FUNCTION_INFO_V1(pgstrom_cash_div_flt2); +PG_FUNCTION_INFO_V1(pgstrom_float8_as_int8); +PG_FUNCTION_INFO_V1(pgstrom_float4_as_int4); +PG_FUNCTION_INFO_V1(pgstrom_float2_as_int2); +PG_FUNCTION_INFO_V1(pgstrom_int8_as_float8); +PG_FUNCTION_INFO_V1(pgstrom_int4_as_float4); +PG_FUNCTION_INFO_V1(pgstrom_int2_as_float2); PG_FUNCTION_INFO_V1(pgstrom_float2_accum); PG_FUNCTION_INFO_V1(pgstrom_float2_sum); +PG_FUNCTION_INFO_V1(pgstrom_define_shell_type); static inline void -print_fp16(const char *prefix, uint32 value) +print_fp16(const char *prefix, cl_uint value) { elog(INFO, "%sFP16 0x%04x = %d + %d + 0x%04x", prefix ? prefix : "", @@ -141,7 +137,7 @@ print_fp16(const char *prefix, uint32 value) } static inline void -print_fp32(const char *prefix, uint32 value) +print_fp32(const char *prefix, cl_uint value) { elog(INFO, "%sFP32 0x%08x = %d + %d + 0x%08x", prefix ? prefix : "", @@ -152,7 +148,7 @@ print_fp32(const char *prefix, uint32 value) } static inline void -print_fp64(const char *prefix, uint64 value) +print_fp64(const char *prefix, cl_ulong value) { elog(INFO, "%sFP64 0x%016lx = %d + %ld + %014lx", prefix ? 
prefix : "", @@ -179,41 +175,46 @@ print_fp64(const char *prefix, uint64 value) } while(0) /* - * pgstrom_float2in + * pgstrom_float2_in */ Datum -pgstrom_float2in(PG_FUNCTION_ARGS) +pgstrom_float2_in(PG_FUNCTION_ARGS) { - float fval = DatumGetFloat4(float4in(fcinfo)); + Datum datum = float4in(fcinfo); + float fval; + + if (fcinfo->isnull) + PG_RETURN_NULL(); + fval = DatumGetFloat4(datum); - PG_RETURN_FP32_AS_FP16(fval); + PG_RETURN_FLOAT2(fp32_to_fp16(fval)); } /* - * pgstrom_float2out + * pgstrom_float2_out */ Datum -pgstrom_float2out(PG_FUNCTION_ARGS) +pgstrom_float2_out(PG_FUNCTION_ARGS) { - float fval = PG_GETARG_FP16_AS_FP32(0); + float fval = fp16_to_fp32((half_t)PG_GETARG_FLOAT2(0)); return DirectFunctionCall1(float4out, Float4GetDatum(fval)); } /* - * pgstrom_float2recv + * pgstrom_float2_recv */ Datum -pgstrom_float2recv(PG_FUNCTION_ARGS) +pgstrom_float2_recv(PG_FUNCTION_ARGS) { return int2recv(fcinfo); } /* - * pgstrom_float2send + * pgstrom_float2_send */ Datum -pgstrom_float2send(PG_FUNCTION_ARGS) +pgstrom_float2_send(PG_FUNCTION_ARGS) { return int2send(fcinfo); } @@ -224,9 +225,9 @@ pgstrom_float2send(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_float4(PG_FUNCTION_ARGS) { - float fval = PG_GETARG_FP16_AS_FP32(0); + half_t fval = PG_GETARG_FLOAT2(0); - PG_RETURN_FLOAT4(fval); + PG_RETURN_FLOAT4(fp16_to_fp32(fval)); } /* @@ -235,9 +236,9 @@ pgstrom_float2_to_float4(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_float8(PG_FUNCTION_ARGS) { - double fval = PG_GETARG_FP16_AS_FP64(0); + half_t fval = PG_GETARG_FLOAT2(0); - PG_RETURN_FLOAT8(fval); + PG_RETURN_FLOAT8(fp16_to_fp64(fval)); } /* @@ -246,7 +247,7 @@ pgstrom_float2_to_float8(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_int1(PG_FUNCTION_ARGS) { - float fval = PG_GETARG_FP16_AS_FP32(0); + float fval = fp16_to_fp32(PG_GETARG_FLOAT2(0)); Datum ival = DirectFunctionCall1(ftoi4, Float4GetDatum(fval)); if (DatumGetInt32(ival) < SCHAR_MIN || @@ -263,7 +264,7 @@ pgstrom_float2_to_int1(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_int2(PG_FUNCTION_ARGS) { - float fval = PG_GETARG_FP16_AS_FP32(0); + float fval = fp16_to_fp32(PG_GETARG_FLOAT2(0)); return DirectFunctionCall1(ftoi2, Float4GetDatum(fval)); } @@ -274,7 +275,7 @@ pgstrom_float2_to_int2(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_int4(PG_FUNCTION_ARGS) { - float fval = PG_GETARG_FP16_AS_FP32(0); + float fval = fp16_to_fp32(PG_GETARG_FLOAT2(0)); return DirectFunctionCall1(ftoi4, Float4GetDatum(fval)); } @@ -285,9 +286,9 @@ pgstrom_float2_to_int4(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_int8(PG_FUNCTION_ARGS) { - float fval = PG_GETARG_FP16_AS_FP32(0); + double fval = fp16_to_fp64(PG_GETARG_FLOAT2(0)); - return DirectFunctionCall1(ftoi8, Float4GetDatum(fval)); + return DirectFunctionCall1(dtoi8, Float8GetDatum(fval)); } /* @@ -296,7 +297,7 @@ pgstrom_float2_to_int8(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_numeric(PG_FUNCTION_ARGS) { - float fval = PG_GETARG_FP16_AS_FP32(0); + float fval = fp16_to_fp32(PG_GETARG_FLOAT2(0)); return DirectFunctionCall1(float4_numeric, Float4GetDatum(fval)); } @@ -309,7 +310,7 @@ pgstrom_float4_to_float2(PG_FUNCTION_ARGS) { float fval = PG_GETARG_FLOAT4(0); - PG_RETURN_FP32_AS_FP16(fval); + PG_RETURN_FLOAT2(fp32_to_fp16(fval)); } /* @@ -320,7 +321,7 @@ pgstrom_float8_to_float2(PG_FUNCTION_ARGS) { double fval = PG_GETARG_FLOAT8(0); - PG_RETURN_FP64_AS_FP16(fval); + PG_RETURN_FLOAT2(fp64_to_fp16(fval)); } /* @@ -329,9 +330,9 @@ pgstrom_float8_to_float2(PG_FUNCTION_ARGS) Datum pgstrom_int1_to_float2(PG_FUNCTION_ARGS) { - int32 ival = (int32)PG_GETARG_DATUM(0); + 
float fval = (float)((int32)PG_GETARG_DATUM(0)); - PG_RETURN_FP32_AS_FP16((float)ival); + PG_RETURN_FLOAT2(fp32_to_fp16(fval)); } /* @@ -342,7 +343,7 @@ pgstrom_int2_to_float2(PG_FUNCTION_ARGS) { float fval = (float) PG_GETARG_INT16(0); - PG_RETURN_FP32_AS_FP16(fval); + PG_RETURN_FLOAT2(fp32_to_fp16(fval)); } /* @@ -353,7 +354,7 @@ pgstrom_int4_to_float2(PG_FUNCTION_ARGS) { double fval = (double) PG_GETARG_INT32(0); - PG_RETURN_FP64_AS_FP16(fval); + PG_RETURN_FLOAT2(fp64_to_fp16(fval)); } /* @@ -364,7 +365,7 @@ pgstrom_int8_to_float2(PG_FUNCTION_ARGS) { double fval = (double) PG_GETARG_INT64(0); - PG_RETURN_FP64_AS_FP16(fval); + PG_RETURN_FLOAT2(fp64_to_fp16(fval)); } /* @@ -375,101 +376,101 @@ pgstrom_numeric_to_float2(PG_FUNCTION_ARGS) { float fval = DatumGetFloat4(numeric_float4(fcinfo)); - PG_RETURN_FP32_AS_FP16(fval); + PG_RETURN_FLOAT2(fp32_to_fp16(fval)); } /* * Comparison operators */ Datum -pgstrom_float2eq(PG_FUNCTION_ARGS) +pgstrom_float2_eq(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float2ne(PG_FUNCTION_ARGS) +pgstrom_float2_ne(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float2lt(PG_FUNCTION_ARGS) +pgstrom_float2_lt(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float2le(PG_FUNCTION_ARGS) +pgstrom_float2_le(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float2gt(PG_FUNCTION_ARGS) +pgstrom_float2_gt(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float2ge(PG_FUNCTION_ARGS) +pgstrom_float2_ge(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) >= 0); } Datum -pgstrom_float2cmp(PG_FUNCTION_ARGS) +pgstrom_float2_cmp(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); int comp = float4_cmp_internal(arg1, arg2); PG_RETURN_INT32(comp > 0 ? 1 : (comp < 0 ? -1 : 0)); } Datum -pgstrom_float2larger(PG_FUNCTION_ARGS) +pgstrom_float2_larger(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + half_t arg1 = PG_GETARG_FLOAT2(0); + half_t arg2 = PG_GETARG_FLOAT2(1); - PG_RETURN_DATUM(arg1 > arg2 ? 
PG_GETARG_DATUM(0) : PG_GETARG_DATUM(1)); + PG_RETURN_FLOAT2(fp16_to_fp32(arg1) > fp16_to_fp32(arg2) ? arg1 : arg2); } Datum -pgstrom_float2smaller(PG_FUNCTION_ARGS) +pgstrom_float2_smaller(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + half_t arg1 = PG_GETARG_FLOAT2(0); + half_t arg2 = PG_GETARG_FLOAT2(1); - PG_RETURN_DATUM(arg1 < arg2 ? PG_GETARG_DATUM(0) : PG_GETARG_DATUM(1)); + PG_RETURN_FLOAT2(fp16_to_fp32(arg1) < fp16_to_fp32(arg2) ? arg1 : arg2); } Datum -pgstrom_float2hash(PG_FUNCTION_ARGS) +pgstrom_float2_hash(PG_FUNCTION_ARGS) { - half_t fval = PG_GETARG_UINT16(0); - int32 sign = (fval & 0x8000); - int32 expo = (fval & 0x7c00) >> 10; - int32 frac = (fval & 0x03ff); + half_t fval = PG_GETARG_FLOAT2(0); + cl_int sign = (fval & 0x8000); + cl_int expo = (fval & 0x7c00) >> 10; + cl_int frac = (fval & 0x03ff); if (expo == 0x1f) { @@ -486,191 +487,191 @@ pgstrom_float2hash(PG_FUNCTION_ARGS) } Datum -pgstrom_float42eq(PG_FUNCTION_ARGS) +pgstrom_float42_eq(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float42ne(PG_FUNCTION_ARGS) +pgstrom_float42_ne(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float42lt(PG_FUNCTION_ARGS) +pgstrom_float42_lt(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float42le(PG_FUNCTION_ARGS) +pgstrom_float42_le(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float42gt(PG_FUNCTION_ARGS) +pgstrom_float42_gt(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float42ge(PG_FUNCTION_ARGS) +pgstrom_float42_ge(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) >= 0); } Datum -pgstrom_float42cmp(PG_FUNCTION_ARGS) +pgstrom_float42_cmp(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); int comp = float4_cmp_internal(arg1, arg2); PG_RETURN_INT32(comp > 0 ? 1 : (comp < 0 ? 
-1 : 0)); } Datum -pgstrom_float82eq(PG_FUNCTION_ARGS) +pgstrom_float82_eq(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float82ne(PG_FUNCTION_ARGS) +pgstrom_float82_ne(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float82lt(PG_FUNCTION_ARGS) +pgstrom_float82_lt(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float82le(PG_FUNCTION_ARGS) +pgstrom_float82_le(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float82gt(PG_FUNCTION_ARGS) +pgstrom_float82_gt(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float82ge(PG_FUNCTION_ARGS) +pgstrom_float82_ge(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) >= 0); } Datum -pgstrom_float82cmp(PG_FUNCTION_ARGS) +pgstrom_float82_cmp(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); int comp = float8_cmp_internal(arg1, arg2); PG_RETURN_INT32(comp > 0 ? 1 : (comp < 0 ? 
-1 : 0)); } Datum -pgstrom_float24eq(PG_FUNCTION_ARGS) +pgstrom_float24_eq(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float24ne(PG_FUNCTION_ARGS) +pgstrom_float24_ne(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float24lt(PG_FUNCTION_ARGS) +pgstrom_float24_lt(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float24le(PG_FUNCTION_ARGS) +pgstrom_float24_le(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float24gt(PG_FUNCTION_ARGS) +pgstrom_float24_gt(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float24ge(PG_FUNCTION_ARGS) +pgstrom_float24_ge(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) >= 0); } Datum -pgstrom_float24cmp(PG_FUNCTION_ARGS) +pgstrom_float24_cmp(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); int comp = float4_cmp_internal(arg1, arg2); @@ -678,63 +679,63 @@ pgstrom_float24cmp(PG_FUNCTION_ARGS) } Datum -pgstrom_float28eq(PG_FUNCTION_ARGS) +pgstrom_float28_eq(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float28ne(PG_FUNCTION_ARGS) +pgstrom_float28_ne(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float28lt(PG_FUNCTION_ARGS) +pgstrom_float28_lt(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float28le(PG_FUNCTION_ARGS) +pgstrom_float28_le(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float28gt(PG_FUNCTION_ARGS) +pgstrom_float28_gt(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float28ge(PG_FUNCTION_ARGS) +pgstrom_float28_ge(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) >= 0); } Datum 
-pgstrom_float28cmp(PG_FUNCTION_ARGS) +pgstrom_float28_cmp(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); int comp = float8_cmp_internal(arg1, arg2); @@ -745,45 +746,41 @@ pgstrom_float28cmp(PG_FUNCTION_ARGS) * unary operators */ Datum -pgstrom_float2up(PG_FUNCTION_ARGS) +pgstrom_float2_up(PG_FUNCTION_ARGS) { - float2_t fval = PG_GETARG_FP16(0); + half_t fval = PG_GETARG_FLOAT2(0); - PG_RETURN_FP16(fval); + PG_RETURN_FLOAT2(fval); } Datum -pgstrom_float2um(PG_FUNCTION_ARGS) +pgstrom_float2_um(PG_FUNCTION_ARGS) { - float2_t fval = PG_GETARG_FP16(0); -#ifndef EMULATE_FLOAT2 - fval = -fval; -#else - fval ^= ~0x8000; -#endif - PG_RETURN_FP16(fval); + half_t fval = PG_GETARG_FLOAT2(0); + + fval ^= 0x8000; + + PG_RETURN_FLOAT2(fval); } Datum -pgstrom_float2abs(PG_FUNCTION_ARGS) +pgstrom_float2_abs(PG_FUNCTION_ARGS) { - float2_t fval = PG_GETARG_FP16(0); -#ifndef EMULATE_FLOAT2 - fval = abs(fval); -#else + half_t fval = PG_GETARG_FLOAT2(0); + fval &= ~0x8000; -#endif - PG_RETURN_FP16(fval); + + PG_RETURN_FLOAT2(fval); } /* * arithmetic operations */ Datum -pgstrom_float2pl(PG_FUNCTION_ARGS) +pgstrom_float2_pl(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); float result; result = arg1 + arg2; @@ -793,10 +790,10 @@ pgstrom_float2pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float2mi(PG_FUNCTION_ARGS) +pgstrom_float2_mi(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); float result; result = arg1 - arg2; @@ -805,10 +802,10 @@ pgstrom_float2mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float2mul(PG_FUNCTION_ARGS) +pgstrom_float2_mul(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); float result; result = arg1 * arg2; @@ -820,10 +817,10 @@ pgstrom_float2mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float2div(PG_FUNCTION_ARGS) +pgstrom_float2_div(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); float result; if (arg2 == 0.0) @@ -837,9 +834,9 @@ pgstrom_float2div(PG_FUNCTION_ARGS) } Datum -pgstrom_float24pl(PG_FUNCTION_ARGS) +pgstrom_float24_pl(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); float result; @@ -850,9 +847,9 @@ pgstrom_float24pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float24mi(PG_FUNCTION_ARGS) +pgstrom_float24_mi(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); float result; @@ -863,9 +860,9 @@ pgstrom_float24mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float24mul(PG_FUNCTION_ARGS) +pgstrom_float24_mul(PG_FUNCTION_ARGS) { - float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); float result; @@ -877,9 +874,9 @@ pgstrom_float24mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float24div(PG_FUNCTION_ARGS) +pgstrom_float24_div(PG_FUNCTION_ARGS) { - float arg1 = 
PG_GETARG_FP16_AS_FP32(0); + float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); float arg2 = PG_GETARG_FLOAT4(1); float result; @@ -895,9 +892,9 @@ pgstrom_float24div(PG_FUNCTION_ARGS) } Datum -pgstrom_float28pl(PG_FUNCTION_ARGS) +pgstrom_float28_pl(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP32(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); double result; @@ -908,9 +905,9 @@ pgstrom_float28pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float28mi(PG_FUNCTION_ARGS) +pgstrom_float28_mi(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); double result; @@ -921,9 +918,9 @@ pgstrom_float28mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float28mul(PG_FUNCTION_ARGS) +pgstrom_float28_mul(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); double result; @@ -935,9 +932,9 @@ pgstrom_float28mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float28div(PG_FUNCTION_ARGS) +pgstrom_float28_div(PG_FUNCTION_ARGS) { - double arg1 = PG_GETARG_FP16_AS_FP64(0); + double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); double arg2 = PG_GETARG_FLOAT8(1); double result; @@ -953,10 +950,10 @@ pgstrom_float28div(PG_FUNCTION_ARGS) } Datum -pgstrom_float42pl(PG_FUNCTION_ARGS) +pgstrom_float42_pl(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); float result; result = arg1 + arg2; @@ -965,10 +962,10 @@ pgstrom_float42pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float42mi(PG_FUNCTION_ARGS) +pgstrom_float42_mi(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); float result; result = arg1 - arg2; @@ -977,10 +974,10 @@ pgstrom_float42mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float42mul(PG_FUNCTION_ARGS) +pgstrom_float42_mul(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); float result; result = arg1 * arg2; @@ -991,10 +988,10 @@ pgstrom_float42mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float42div(PG_FUNCTION_ARGS) +pgstrom_float42_div(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = PG_GETARG_FP16_AS_FP32(1); + float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); float result; if (arg2 == 0.0) @@ -1008,10 +1005,10 @@ pgstrom_float42div(PG_FUNCTION_ARGS) } Datum -pgstrom_float82pl(PG_FUNCTION_ARGS) +pgstrom_float82_pl(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); double result; result = arg1 + arg2; @@ -1020,10 +1017,10 @@ pgstrom_float82pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float82mi(PG_FUNCTION_ARGS) +pgstrom_float82_mi(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); double result; result = arg1 - arg2; @@ -1032,10 +1029,10 @@ pgstrom_float82mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float82mul(PG_FUNCTION_ARGS) +pgstrom_float82_mul(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); double result; result = arg1 * arg2; @@ -1047,10 +1044,10 @@ pgstrom_float82mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float82div(PG_FUNCTION_ARGS) 
+pgstrom_float82_div(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = PG_GETARG_FP16_AS_FP64(1); + double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); double result; if (arg2 == 0.0) @@ -1071,7 +1068,7 @@ Datum pgstrom_cash_mul_flt2(PG_FUNCTION_ARGS) { Cash c = PG_GETARG_CASH(0); - float8 f = PG_GETARG_FP16_AS_FP64(1); + float8 f = fp16_to_fp64(PG_GETARG_FLOAT2(1)); Cash result; result = rint(c * f); @@ -1081,7 +1078,7 @@ pgstrom_cash_mul_flt2(PG_FUNCTION_ARGS) Datum pgstrom_flt2_mul_cash(PG_FUNCTION_ARGS) { - float8 f = PG_GETARG_FP16_AS_FP64(0); + float8 f = fp16_to_fp64(PG_GETARG_FLOAT2(0)); Cash c = PG_GETARG_CASH(1); Cash result; @@ -1093,7 +1090,7 @@ Datum pgstrom_cash_div_flt2(PG_FUNCTION_ARGS) { Cash c = PG_GETARG_CASH(0); - float8 f = PG_GETARG_FP16_AS_FP64(1); + float8 f = fp16_to_fp64(PG_GETARG_FLOAT2(1)); Cash result; if (f == 0.0) @@ -1105,12 +1102,60 @@ pgstrom_cash_div_flt2(PG_FUNCTION_ARGS) PG_RETURN_CASH(result); } +Datum +pgstrom_float8_as_int8(PG_FUNCTION_ARGS) +{ + float8 fval = PG_GETARG_FLOAT8(0); + + PG_RETURN_INT64(double_as_long(fval)); +} + +Datum +pgstrom_float4_as_int4(PG_FUNCTION_ARGS) +{ + float4 fval = PG_GETARG_FLOAT4(0); + + PG_RETURN_INT32(float_as_int(fval)); +} + +Datum +pgstrom_float2_as_int2(PG_FUNCTION_ARGS) +{ + half_t fval = PG_GETARG_FLOAT2(0); + + PG_RETURN_INT16(fval); /* actually, half_t is unsigned short */ +} + +Datum +pgstrom_int8_as_float8(PG_FUNCTION_ARGS) +{ + int64 ival = PG_GETARG_INT64(0); + + PG_RETURN_FLOAT8(long_as_double(ival)); +} + +Datum +pgstrom_int4_as_float4(PG_FUNCTION_ARGS) +{ + int32 ival = PG_GETARG_INT32(0); + + PG_RETURN_FLOAT4(int_as_float(ival)); +} + +Datum +pgstrom_int2_as_float2(PG_FUNCTION_ARGS) +{ + int16 ival = PG_GETARG_INT16(0); + + PG_RETURN_FLOAT2(ival); /* actually, half_t is unsigned short */ +} + Datum pgstrom_float2_accum(PG_FUNCTION_ARGS) { ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); /* do computations as float8 */ - float8 newval = PG_GETARG_FP16_AS_FP64(1); + float8 newval = fp16_to_fp64(PG_GETARG_FLOAT2(1)); float8 *transvalues; float8 N, sumX, sumX2; @@ -1171,14 +1216,96 @@ pgstrom_float2_sum(PG_FUNCTION_ARGS) { if (PG_ARGISNULL(1)) PG_RETURN_NULL(); /* still no non-null */ - newval = PG_GETARG_FP16_AS_FP64(1); + newval = fp16_to_fp64(PG_GETARG_FLOAT2(1)); } else { newval = PG_GETARG_FLOAT8(0); if (!PG_ARGISNULL(1)) - newval += PG_GETARG_FP16_AS_FP64(1); + newval += fp16_to_fp64(PG_GETARG_FLOAT2(1)); } PG_RETURN_FLOAT8(newval); } + +Datum +pgstrom_define_shell_type(PG_FUNCTION_ARGS) +{ + Name type_name = PG_GETARG_NAME(0); + Oid type_oid = PG_GETARG_OID(1); + Oid type_namespace = PG_GETARG_OID(2); + Relation type_rel; + TupleDesc tupdesc; + HeapTuple tup; + Datum values[Natts_pg_type]; + bool isnull[Natts_pg_type]; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to create a shell type"))); + /* see TypeShellMake */ + type_rel = table_open(TypeRelationId, RowExclusiveLock); + tupdesc = RelationGetDescr(type_rel); + + memset(values, 0, sizeof(values)); + memset(isnull, 0, sizeof(isnull)); +#if PG_VERSION_NUM >= 120000 + values[Anum_pg_type_oid-1] = type_oid; +#endif + values[Anum_pg_type_typname-1] = NameGetDatum(type_name); + values[Anum_pg_type_typnamespace-1] = ObjectIdGetDatum(type_namespace); + values[Anum_pg_type_typowner-1] = ObjectIdGetDatum(GetUserId()); + values[Anum_pg_type_typlen-1] = Int16GetDatum(sizeof(int32)); + values[Anum_pg_type_typbyval-1] = BoolGetDatum(true); + values[Anum_pg_type_typtype-1] = 
CharGetDatum(TYPTYPE_PSEUDO); + values[Anum_pg_type_typcategory-1] = CharGetDatum(TYPCATEGORY_PSEUDOTYPE); + values[Anum_pg_type_typispreferred-1] = BoolGetDatum(false); + values[Anum_pg_type_typisdefined-1] = BoolGetDatum(false); + values[Anum_pg_type_typdelim-1] = CharGetDatum(DEFAULT_TYPDELIM); + values[Anum_pg_type_typrelid-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typelem-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typarray-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typinput-1] = ObjectIdGetDatum(F_SHELL_IN); + values[Anum_pg_type_typoutput-1] = ObjectIdGetDatum(F_SHELL_OUT); + values[Anum_pg_type_typreceive-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typsend-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typmodin-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typmodout-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typanalyze-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typalign-1] = CharGetDatum('i'); + values[Anum_pg_type_typstorage-1] = CharGetDatum('p'); + values[Anum_pg_type_typnotnull-1] = BoolGetDatum(false); + values[Anum_pg_type_typbasetype-1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_type_typtypmod-1] = Int32GetDatum(-1); + values[Anum_pg_type_typndims-1] = Int32GetDatum(0); + values[Anum_pg_type_typcollation-1] = ObjectIdGetDatum(InvalidOid); + isnull[Anum_pg_type_typdefaultbin-1] = true; + isnull[Anum_pg_type_typdefault-1] = true; + isnull[Anum_pg_type_typacl-1] = true; + + /* create a new type tuple, and insert */ + tup = heap_form_tuple(tupdesc, values, isnull); +#if PG_VERSION_NUM < 120000 + HeapTupleSetOid(tup, type_oid); +#endif + CatalogTupleInsert(type_rel, tup); + + /* create dependencies */ + GenerateTypeDependencies(tup, + type_rel, + NULL, + NULL, + 0, + false, + false, + true, + false); + /* Post creation hook for new shell type */ + InvokeObjectPostCreateHook(TypeRelationId, type_oid, 0); + + heap_freetuple(tup); + table_close(type_rel, RowExclusiveLock); + + PG_RETURN_OID(type_oid); +} diff --git a/next/float2.h b/old/float2.h similarity index 56% rename from next/float2.h rename to old/float2.h index daca297ce..00f445e7e 100644 --- a/next/float2.h +++ b/old/float2.h @@ -3,8 +3,8 @@ * * Definition of half-precision floating-point * -- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License.
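The hunk below drops the CUDA __half / _Float16 typedefs, so old/float2.h always carries a half-precision value as a plain uint16_t (half_t) and converts it in software. As a quick reference for the FP16_* constants it keeps, here is a minimal standalone sketch of the binary16 layout they describe (illustration only, not part of the patch; the sample bit pattern 0x3e00 is arbitrary):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t h = 0x3e00;              /* bits: 0 01111 1000000000 */
	int   sign = (h >> 15) & 1;       /* 1 sign bit */
	int   expo = (h >> 10) & 0x1f;    /* 5 exponent bits, bias 15 */
	int   frac =  h        & 0x03ff;  /* 10 fraction bits */
	/* a normal number decodes as (-1)^sign * 2^(expo - 15) * (1 + frac / 1024) */
	double val = (sign ? -1.0 : 1.0) * ldexp(1.0 + frac / 1024.0, expo - 15);

	printf("0x%04x -> %f\n", h, val); /* prints: 0x3e00 -> 1.500000 */
	return 0;
}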
@@ -15,18 +15,6 @@ typedef uint16_t half_t; -#if defined(__CUDACC__) -#include -typedef __half float2_t; -#elif defined(HAVE_FLOAT2) -typedef _Float16 float2_t; -#else -#define EMULATE_FLOAT2 1 -typedef half_t float2_t; -#endif -typedef float float4_t; -typedef double float8_t; - /* parameters of floating-point */ #define FP16_FRAC_BITS (10) #define FP16_EXPO_BITS (5) @@ -47,98 +35,57 @@ typedef double float8_t; #define FP64_EXPO_BIAS (1023) /* int/float reinterpret functions */ -INLINE_FUNCTION(double) -__longlong_as_double__(const uint64_t ival) +static inline double +long_as_double(uint64_t ival) { -#ifdef __CUDACC__ - return __longlong_as_double(ival); -#else union { uint64_t ival; double fval; } datum; datum.ival = ival; return datum.fval; -#endif } -INLINE_FUNCTION(uint64_t) -__double_as_longlong__(const double fval) +static inline uint64_t +double_as_long(double fval) { -#ifdef __CUDACC__ - return __double_as_longlong(fval); -#else union { uint64_t ival; double fval; } datum; datum.fval = fval; return datum.ival; -#endif } -INLINE_FUNCTION(float) -__int_as_float__(const uint32_t ival) +static inline float +int_as_float(uint32_t ival) { -#ifdef __CUDACC__ - return __uint_as_float(ival); -#else union { uint32_t ival; float fval; } datum; datum.ival = ival; return datum.fval; -#endif } -INLINE_FUNCTION(uint32_t) -__float_as_int__(const float fval) +static inline uint32_t +float_as_int(float fval) { -#ifdef __CUDACC__ - return __float_as_uint(fval); -#else union { uint32_t ival; float fval; } datum; datum.fval = fval; return datum.ival; -#endif -} - -INLINE_FUNCTION(float2_t) -__short_as_half__(const uint16_t ival) -{ - union { - uint16_t ival; - float2_t fval; - } datum; - datum.ival = ival; - return datum.fval; -} - -INLINE_FUNCTION(uint16_t) -__half_as_short__(const float2_t fval) -{ - union { - uint16_t ival; - float2_t fval; - } datum; - datum.fval = fval; - return datum.ival; } /* - * cast functions across floating point if emulation mode + * cast functions across floating point */ -INLINE_FUNCTION(float2_t) -fp32_to_fp16(const float value) +static inline half_t +fp32_to_fp16(float value) { -#ifndef EMULATE_FLOAT2 - return (float2_t)value; -#else - uint32_t x = __float_as_int__(value); + uint32_t x = float_as_int(value); uint32_t u = (x & 0x7fffffffU); uint32_t sign = ((x >> 16U) & 0x8000U); uint32_t remainder; @@ -185,21 +132,17 @@ fp32_to_fp16(const float value) result++; return result; -#endif } -INLINE_FUNCTION(float2_t) +static inline half_t fp64_to_fp16(double fval) { return fp32_to_fp16((float)fval); } -INLINE_FUNCTION(float4_t) -fp16_to_fp32(float2_t fp16val) +static inline float +fp16_to_fp32(half_t fp16val) { -#ifndef EMULATE_FLOAT2 - return (float4_t)fp16val; -#else uint32_t sign = ((uint32_t)(fp16val & 0x8000) << 16); int32_t expo = ((fp16val & 0x7c00) >> 10); int32_t frac = ((fp16val & 0x03ff)); @@ -233,16 +176,12 @@ fp16_to_fp32(float2_t fp16val) result = (sign | (expo << FP32_FRAC_BITS) | (frac << 13)); } - return __int_as_float__(result); -#endif + return int_as_float(result); } -INLINE_FUNCTION(float8_t) +static inline double fp16_to_fp64(half_t fp16val) { -#ifndef EMULATE_FLOAT2 - return (float8_t)fp16val; -#else uint64_t sign = ((uint64_t)(fp16val & 0x8000) << 48); int64_t expo = ((fp16val & 0x7c00) >> 10); int64_t frac = ((fp16val & 0x03ff)); @@ -275,38 +214,7 @@ fp16_to_fp64(half_t fp16val) expo += FP64_EXPO_BIAS; result = (sign | (expo << FP64_FRAC_BITS) | (frac << 42)); } - return __longlong_as_double__(result); -#endif -} - -#ifdef __cplusplus 
-INLINE_FUNCTION(float2_t) __to_fp16(float2_t fval) { return fval; } -INLINE_FUNCTION(float2_t) __to_fp16(float4_t fval) { return fp32_to_fp16(fval); } -INLINE_FUNCTION(float2_t) __to_fp16(float8_t fval) { return fp64_to_fp16(fval); } - -INLINE_FUNCTION(float4_t) __to_fp32(float2_t fval) { return fp16_to_fp32(fval); } -INLINE_FUNCTION(float4_t) __to_fp32(float4_t fval) { return fval; } -INLINE_FUNCTION(float4_t) __to_fp32(float8_t fval) { return (float)fval; } - -INLINE_FUNCTION(float8_t) __to_fp64(float2_t fval) { return fp16_to_fp64(fval); } -INLINE_FUNCTION(float8_t) __to_fp64(float4_t fval) { return (double)fval; } -INLINE_FUNCTION(float8_t) __to_fp64(float8_t fval) { return fval; } - -INLINE_FUNCTION(float2_t) -__fp16_unary_plus(float2_t fval) -{ - return fval; -} -INLINE_FUNCTION(float2_t) -__fp16_unary_minus(float2_t fval) -{ - return __short_as_half__(__half_as_short__(fval) ^ 0x8000U); -} -INLINE_FUNCTION(float2_t) -__fp16_unary_abs(float2_t fval) -{ - return __short_as_half__(__half_as_short__(fval) & 0x7fffU); + return long_as_double(result); } -#endif #endif /* FLOAT2_H */ diff --git a/src/gpu_cache.c b/old/gpu_cache.c similarity index 100% rename from src/gpu_cache.c rename to old/gpu_cache.c diff --git a/src/gpu_context.c b/old/gpu_context.c similarity index 100% rename from src/gpu_context.c rename to old/gpu_context.c diff --git a/old/gpu_device.c b/old/gpu_device.c new file mode 100644 index 000000000..a55b7995e --- /dev/null +++ b/old/gpu_device.c @@ -0,0 +1,685 @@ +/* + * gpu_device.c + * + * Routines to collect GPU device information. + * ---- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. 
+ */ +#include "pg_strom.h" + +/* variable declarations */ +DevAttributes *devAttrs = NULL; +cl_int numDevAttrs = 0; +cl_uint devBaselineMaxThreadsPerBlock = UINT_MAX; + +/* catalog of device attributes */ +typedef enum { + DEVATTRKIND__INT, + DEVATTRKIND__BYTES, + DEVATTRKIND__KB, + DEVATTRKIND__KHZ, + DEVATTRKIND__COMPUTEMODE, + DEVATTRKIND__BOOL, + DEVATTRKIND__BITS, +} DevAttrKind; + +static struct { + CUdevice_attribute attr_id; + DevAttrKind attr_kind; + size_t attr_offset; + const char *attr_desc; +} DevAttrCatalog[] = { +#define DEV_ATTR(LABEL,KIND,a,DESC) \ + { CU_DEVICE_ATTRIBUTE_##LABEL, \ + DEVATTRKIND__##KIND, \ + offsetof(struct DevAttributes, LABEL), \ + DESC }, +#include "device_attrs.h" +#undef DEV_ATTR +}; + +/* declaration */ +Datum pgstrom_device_info(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_device_name(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_global_memsize(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_max_blocksize(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_warp_size(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_max_shared_memory_perblock(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_num_registers_perblock(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_num_multiptocessors(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_num_cuda_cores(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_cc_major(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_cc_minor(PG_FUNCTION_ARGS); +Datum pgstrom_gpu_pci_id(PG_FUNCTION_ARGS); + +/* static variables */ +static bool gpudirect_driver_is_initialized = false; +static bool __pgstrom_gpudirect_enabled; /* GUC */ +static int __pgstrom_gpudirect_threshold; /* GUC */ + +/* + * pgstrom_gpudirect_enabled + */ +bool +pgstrom_gpudirect_enabled(void) +{ + return __pgstrom_gpudirect_enabled; +} + +/* + * pgstrom_gpudirect_enabled_checker + */ +static bool +pgstrom_gpudirect_enabled_checker(bool *p_newval, void **extra, GucSource source) +{ + bool newval = *p_newval; + + if (newval && !gpudirect_driver_is_initialized) + elog(ERROR, "cannot enable GPUDirectSQL without driver module loaded"); + return true; +} + +/* + * pgstrom_gpudirect_threshold + */ +Size +pgstrom_gpudirect_threshold(void) +{ + return (Size)__pgstrom_gpudirect_threshold << 10; +} + +/* + * pgstrom_collect_gpu_device + */ +static bool +pgstrom_collect_gpu_device(void) +{ + StringInfoData str; + const char *cmdline = (CMD_GPUINFO_PATH " -md"); + char linebuf[2048]; + FILE *filp; + char *tok_attr; + char *tok_val; + char *pos; + char *cuda_runtime_version = NULL; + char *nvidia_driver_version = NULL; + int num_devices = -1; /* total num of GPUs; incl legacy models */ + int i, cuda_dindex; + + Assert(numDevAttrs == 0); + filp = OpenPipeStream(cmdline, PG_BINARY_R); + if (!filp) + return false; + + initStringInfo(&str); + while (fgets(linebuf, sizeof(linebuf), filp) != NULL) + { + /* trim '\n' on the tail */ + pos = linebuf + strlen(linebuf); + while (pos > linebuf && isspace(*--pos)) + *pos = '\0'; + /* empty line? 
*/ + if (linebuf[0] == '\0') + continue; + + tok_attr = strchr(linebuf, ':'); + if (!tok_attr) + elog(ERROR, "unexpected gpuinfo -md format"); + *tok_attr++ = '\0'; + + tok_val = strchr(tok_attr, '='); + if (!tok_val) + elog(ERROR, "incorrect gpuinfo -md format"); + *tok_val++ = '\0'; + + if (strcmp(linebuf, "PLATFORM") == 0) + { + if (strcmp(tok_attr, "CUDA_RUNTIME_VERSION") == 0) + cuda_runtime_version = pstrdup(tok_val); + else if (strcmp(tok_attr, "NVIDIA_DRIVER_VERSION") == 0) + nvidia_driver_version = pstrdup(tok_val); + else if (strcmp(tok_attr, "NUMBER_OF_DEVICES") == 0) + { + num_devices = atoi(tok_val); + if (num_devices < 0) + elog(ERROR, "NUMBER_OF_DEVICES is not correct"); + } + else + elog(ERROR, "unknown PLATFORM attribute"); + } + else if (strncmp(linebuf, "DEVICE", 6) == 0) + { + int dindex = atoi(linebuf + 6); + + if (!devAttrs) + { + if (!cuda_runtime_version || + !nvidia_driver_version || + num_devices < 0) + elog(ERROR, "incorrect gpuinfo -md format"); + Assert(num_devices > 0); + devAttrs = MemoryContextAllocZero(TopMemoryContext, + sizeof(DevAttributes) * + num_devices); + } + + if (dindex < 0 || dindex >= num_devices) + elog(ERROR, "device index out of range"); + +#define DEV_ATTR(LABEL,a,b,c) \ + else if (strcmp(tok_attr, #LABEL) == 0) \ + devAttrs[dindex].LABEL = atoi(tok_val); + + if (strcmp(tok_attr, "DEVICE_ID") == 0) + { + devAttrs[dindex].DEV_ID = atoi(tok_val); + } + else if (strcmp(tok_attr, "DEVICE_NAME") == 0) + { + strncpy(devAttrs[dindex].DEV_NAME, tok_val, + sizeof(devAttrs[dindex].DEV_NAME)); + } + else if (strcmp(tok_attr, "DEVICE_BRAND") == 0) + { + strncpy(devAttrs[dindex].DEV_BRAND, tok_val, + sizeof(devAttrs[dindex].DEV_BRAND)); + } + else if (strcmp(tok_attr, "DEVICE_UUID") == 0) + { + strncpy(devAttrs[dindex].DEV_UUID, tok_val, + sizeof(devAttrs[dindex].DEV_UUID)); + } + else if (strcmp(tok_attr, "GLOBAL_MEMORY_SIZE") == 0) + devAttrs[dindex].DEV_TOTAL_MEMSZ = atol(tok_val); + else if (strcmp(tok_attr, "PCI_BAR1_MEMORY_SIZE") == 0) + devAttrs[dindex].DEV_BAR1_MEMSZ = atol(tok_val); +#include "device_attrs.h" + else + elog(ERROR, "incorrect gpuinfo -md format"); +#undef DEV_ATTR + } + else + elog(ERROR, "unexpected gpuinfo -md input:\n%s", linebuf); + } + ClosePipeStream(filp); + + for (i=0, cuda_dindex=0; i < num_devices; i++) + { + DevAttributes *dattrs = &devAttrs[i]; + char path[MAXPGPATH]; + char linebuf[2048]; + FILE *filp; + + /* Pascal or later generation is recommended */ + if (dattrs->COMPUTE_CAPABILITY_MAJOR < 6) + { + elog(LOG, "PG-Strom: GPU%d %s - CC %d.%d is not supported", + dattrs->DEV_ID, + dattrs->DEV_NAME, + dattrs->COMPUTE_CAPABILITY_MAJOR, + dattrs->COMPUTE_CAPABILITY_MINOR); + continue; + } + + /* Update the baseline device capability */ + devBaselineMaxThreadsPerBlock = Min(devBaselineMaxThreadsPerBlock, + dattrs->MAX_THREADS_PER_BLOCK); + + /* + * Only Tesla or Quadro devices that have more than 256MB of + * PCI Bar1 space support GPUDirectSQL + */ + dattrs->DEV_SUPPORT_GPUDIRECTSQL = false; + if (dattrs->DEV_BAR1_MEMSZ > (256UL << 20)) + { +#if CUDA_VERSION < 11030 + if (strcmp(dattrs->DEV_BRAND, "TESLA") == 0 || + strcmp(dattrs->DEV_BRAND, "QUADRO") == 0 || + strcmp(dattrs->DEV_BRAND, "NVIDIA") == 0) + dattrs->DEV_SUPPORT_GPUDIRECTSQL = true; +#else + if (dattrs->GPU_DIRECT_RDMA_SUPPORTED) + dattrs->DEV_SUPPORT_GPUDIRECTSQL = true; +#endif + } + + /* + * read the numa node-id from the sysfs entry + * + * Note that we assume device function-id is 0, because it is + * uncertain whether MULTI_GPU_BOARD_GROUP_ID is an adequate value + * to query,
and these sibling devices obviously belong to the same + numa-node, even if function-id is not identical. + */ + snprintf(path, sizeof(path), + "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node", + dattrs->PCI_DOMAIN_ID, + dattrs->PCI_BUS_ID, + dattrs->PCI_DEVICE_ID); + filp = fopen(path, "r"); + if (!filp) + dattrs->NUMA_NODE_ID = -1; /* unknown */ + else + { + if (!fgets(linebuf, sizeof(linebuf), filp)) + dattrs->NUMA_NODE_ID = -1; /* unknown */ + else + dattrs->NUMA_NODE_ID = atoi(linebuf); + fclose(filp); + } + + /* Log brief CUDA device properties */ + resetStringInfo(&str); + appendStringInfo(&str, "GPU%d %s (%d SMs; %dMHz, L2 %dkB)", + dattrs->DEV_ID, dattrs->DEV_NAME, + dattrs->MULTIPROCESSOR_COUNT, + dattrs->CLOCK_RATE / 1000, + dattrs->L2_CACHE_SIZE >> 10); + if (dattrs->DEV_TOTAL_MEMSZ > (4UL << 30)) + appendStringInfo(&str, ", RAM %.2fGB", + ((double)dattrs->DEV_TOTAL_MEMSZ / + (double)(1UL << 30))); + else + appendStringInfo(&str, ", RAM %zuMB", + dattrs->DEV_TOTAL_MEMSZ >> 20); + if (dattrs->MEMORY_CLOCK_RATE > (1UL << 20)) + appendStringInfo(&str, " (%dbits, %.2fGHz)", + dattrs->GLOBAL_MEMORY_BUS_WIDTH, + ((double)dattrs->MEMORY_CLOCK_RATE / + (double)(1UL << 20))); + else + appendStringInfo(&str, " (%dbits, %dMHz)", + dattrs->GLOBAL_MEMORY_BUS_WIDTH, + dattrs->MEMORY_CLOCK_RATE >> 10); + + if (dattrs->DEV_BAR1_MEMSZ > (1UL << 30)) + appendStringInfo(&str, ", PCI-E Bar1 %luGB", + dattrs->DEV_BAR1_MEMSZ >> 30); + else if (dattrs->DEV_BAR1_MEMSZ > (1UL << 20)) + appendStringInfo(&str, ", PCI-E Bar1 %luMB", + dattrs->DEV_BAR1_MEMSZ >> 20); + + appendStringInfo(&str, ", CC %d.%d", + dattrs->COMPUTE_CAPABILITY_MAJOR, + dattrs->COMPUTE_CAPABILITY_MINOR); + elog(LOG, "PG-Strom: %s", str.data); + + if (i != cuda_dindex) + memcpy(&devAttrs[cuda_dindex], + &devAttrs[i], sizeof(DevAttributes)); + cuda_dindex++; + } + + if (num_devices > 0) + { + if (cuda_dindex == 0) + elog(ERROR, "PG-Strom: no supported GPU devices found"); + numDevAttrs = cuda_dindex; + return true; + } + return false; +} + +/* + * pgstrom_init_gpu_device + */ +void +pgstrom_init_gpu_device(void) +{ + static char *cuda_visible_devices = NULL; + bool default_gpudirect_enabled = false; + size_t default_threshold = 0; + size_t shared_buffer_size = (size_t)NBuffers * (size_t)BLCKSZ; + int i; + + /* + * Set CUDA_VISIBLE_DEVICES environment variable prior to CUDA + * initialization + */ + DefineCustomStringVariable("pg_strom.cuda_visible_devices", + "CUDA_VISIBLE_DEVICES environment variable", + NULL, + &cuda_visible_devices, + NULL, + PGC_POSTMASTER, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + if (cuda_visible_devices) + { + if (setenv("CUDA_VISIBLE_DEVICES", cuda_visible_devices, 1) != 0) + elog(ERROR, "failed to set CUDA_VISIBLE_DEVICES"); + } + /* collect device properties by gpuinfo command */ + if (!pgstrom_collect_gpu_device()) + return; /* cpu_only_mode */ + + /* pgstrom.gpudirect_enabled */ + if (gpuDirectInitDriver() == 0) + { + for (i=0; i < numDevAttrs; i++) + { + if (devAttrs[i].DEV_SUPPORT_GPUDIRECTSQL) + default_gpudirect_enabled = true; + } + gpudirect_driver_is_initialized = true; + } + DefineCustomBoolVariable("pg_strom.gpudirect_enabled", + "enables GPUDirect SQL", + NULL, + &__pgstrom_gpudirect_enabled, + default_gpudirect_enabled, + PGC_SUSET, + GUC_NOT_IN_SAMPLE, + pgstrom_gpudirect_enabled_checker, NULL, NULL); + + /* + * MEMO: Threshold of table's physical size to use NVMe-Strom: + * ((System RAM size) - + * (shared_buffer size)) * 0.5 + (shared_buffer size) + * + * If table size is large enough to issue real i/o, NVMe-Strom will + * take advantage of its higher i/o performance. + */ + if (PAGE_SIZE * PHYS_PAGES > shared_buffer_size / 2) + default_threshold = (PAGE_SIZE * PHYS_PAGES - shared_buffer_size / 2); + default_threshold += shared_buffer_size; + + DefineCustomIntVariable("pg_strom.gpudirect_threshold", + "Tablesize threshold to use GPUDirect SQL", + NULL, + &__pgstrom_gpudirect_threshold, + default_threshold >> 10, + 262144, /* 256MB */ + INT_MAX, + PGC_SUSET, + GUC_NOT_IN_SAMPLE | GUC_UNIT_KB, + NULL, NULL, NULL); + +} + +/* + * optimal_workgroup_size - calculates the optimal block size + * according to the function and device attributes + */ +static __thread size_t __dynamic_shmem_per_block; +static __thread size_t __dynamic_shmem_per_thread; + +static size_t +blocksize_to_shmemsize_helper(int blocksize) +{ + return (__dynamic_shmem_per_block + + __dynamic_shmem_per_thread * (size_t)blocksize); +} + +/* + * gpuOccupancyMaxPotentialBlockSize + */ +CUresult +gpuOccupancyMaxPotentialBlockSize(int *p_min_grid_sz, + int *p_max_block_sz, + CUfunction kern_function, + size_t dynamic_shmem_per_block, + size_t dynamic_shmem_per_thread) +{ + cl_int min_grid_sz; + cl_int max_block_sz; + CUresult rc; + + if (dynamic_shmem_per_thread > 0) + { + __dynamic_shmem_per_block = dynamic_shmem_per_block; + __dynamic_shmem_per_thread = dynamic_shmem_per_thread; + rc = cuOccupancyMaxPotentialBlockSize(&min_grid_sz, + &max_block_sz, + kern_function, + blocksize_to_shmemsize_helper, + 0, + 0); + } + else + { + rc = cuOccupancyMaxPotentialBlockSize(&min_grid_sz, + &max_block_sz, + kern_function, + 0, + dynamic_shmem_per_block, + 0); + } + if (p_min_grid_sz) + *p_min_grid_sz = min_grid_sz; + if (p_max_block_sz) + *p_max_block_sz = max_block_sz; + return rc; +} + +CUresult +gpuOptimalBlockSize(int *p_grid_sz, + int *p_block_sz, + CUfunction kern_function, + CUdevice cuda_device, + size_t dynamic_shmem_per_block, + size_t dynamic_shmem_per_thread) +{ + cl_int mp_count; + cl_int min_grid_sz; + cl_int max_block_sz; + cl_int max_multiplicity; + size_t dynamic_shmem_sz; + CUresult rc; + + rc = cuDeviceGetAttribute(&mp_count, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + cuda_device); + if (rc != CUDA_SUCCESS) + return rc; + + rc = gpuOccupancyMaxPotentialBlockSize(&min_grid_sz, + &max_block_sz, + kern_function, + dynamic_shmem_per_block, + dynamic_shmem_per_thread); + if (rc != CUDA_SUCCESS) + return rc; + + dynamic_shmem_sz = (dynamic_shmem_per_block + + dynamic_shmem_per_thread * max_block_sz); + rc = cuOccupancyMaxActiveBlocksPerMultiprocessor(&max_multiplicity,
kern_function, + max_block_sz, + dynamic_shmem_sz); + if (rc != CUDA_SUCCESS) + return rc; + + *p_grid_sz = Min(GPUKERNEL_MAX_SM_MULTIPLICITY, + max_multiplicity) * mp_count; + *p_block_sz = max_block_sz; + + return CUDA_SUCCESS; +} + +/* + * pgstrom_device_info - SQL function to dump device info + */ +Datum +pgstrom_device_info(PG_FUNCTION_ARGS) +{ + FuncCallContext *fncxt; + DevAttributes *dattrs; + int dindex; + int aindex; + const char *att_name; + const char *att_value; + Datum values[4]; + bool isnull[4]; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext oldcxt; + + fncxt = SRF_FIRSTCALL_INIT(); + oldcxt = MemoryContextSwitchTo(fncxt->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(4); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "device_nr", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "aindex", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "attribute", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "value", + TEXTOID, -1, 0); + fncxt->tuple_desc = BlessTupleDesc(tupdesc); + + fncxt->user_fctx = 0; + + MemoryContextSwitchTo(oldcxt); + } + fncxt = SRF_PERCALL_SETUP(); + + dindex = fncxt->call_cntr / (lengthof(DevAttrCatalog) + 5); + aindex = fncxt->call_cntr % (lengthof(DevAttrCatalog) + 5); + + if (dindex >= numDevAttrs) + SRF_RETURN_DONE(fncxt); + dattrs = &devAttrs[dindex]; + + if (aindex == 0) + { + att_name = "GPU Device Name"; + att_value = dattrs->DEV_NAME; + } + else if (aindex == 1) + { + att_name = "GPU Device Brand"; + att_value = dattrs->DEV_BRAND; + } + else if (aindex == 2) + { + att_name = "GPU Device UUID"; + att_value = dattrs->DEV_UUID; + } + else if (aindex == 3) + { + att_name = "GPU Total RAM Size"; + att_value = format_bytesz(dattrs->DEV_TOTAL_MEMSZ); + } + else if (aindex == 4) + { + att_name = "GPU PCI Bar1 Size"; + att_value = format_bytesz(dattrs->DEV_BAR1_MEMSZ); + } + else + { + int i = aindex - 5; + int value = *((int *)((char *)dattrs + + DevAttrCatalog[i].attr_offset)); + + att_name = DevAttrCatalog[i].attr_desc; + switch (DevAttrCatalog[i].attr_kind) + { + case DEVATTRKIND__INT: + att_value = psprintf("%d", value); + break; + case DEVATTRKIND__BYTES: + att_value = format_bytesz((size_t)value); + break; + case DEVATTRKIND__KB: + att_value = format_bytesz((size_t)value * 1024); + break; + case DEVATTRKIND__KHZ: + if (value > 4000000) + att_value = psprintf("%.2f GHz", (double)value/1000000.0); + else if (value > 4000) + att_value = psprintf("%d MHz", value / 1000); + else + att_value = psprintf("%d kHz", value); + break; + case DEVATTRKIND__COMPUTEMODE: + switch (value) + { + case CU_COMPUTEMODE_DEFAULT: + att_value = "Default"; + break; +#if CUDA_VERSION < 8000 + case CU_COMPUTEMODE_EXCLUSIVE: + att_value = "Exclusive"; + break; +#endif + case CU_COMPUTEMODE_PROHIBITED: + att_value = "Prohibited"; + break; + case CU_COMPUTEMODE_EXCLUSIVE_PROCESS: + att_value = "Exclusive Process"; + break; + default: + att_value = "Unknown"; + break; + } + break; + case DEVATTRKIND__BOOL: + att_value = psprintf("%s", value != 0 ? "True" : "False"); + break; + case DEVATTRKIND__BITS: + att_value = psprintf("%dbits", value); + break; + default: + elog(ERROR, "Bug? 
unknown DevAttrKind: %d", + (int)DevAttrCatalog[i].attr_kind); + } + } + memset(isnull, 0, sizeof(isnull)); + values[0] = Int32GetDatum(dattrs->DEV_ID); + values[1] = Int32GetDatum(aindex); + values[2] = CStringGetTextDatum(att_name); + values[3] = CStringGetTextDatum(att_value); + + tuple = heap_form_tuple(fncxt->tuple_desc, values, isnull); + + SRF_RETURN_NEXT(fncxt, HeapTupleGetDatum(tuple)); +} +PG_FUNCTION_INFO_V1(pgstrom_device_info); diff --git a/src/gpu_mmgr.c b/old/gpu_mmgr.c similarity index 100% rename from src/gpu_mmgr.c rename to old/gpu_mmgr.c diff --git a/src/gpu_tasks.c b/old/gpu_tasks.c similarity index 100% rename from src/gpu_tasks.c rename to old/gpu_tasks.c diff --git a/utils/gpuinfo.c b/old/gpuinfo.c similarity index 99% rename from utils/gpuinfo.c rename to old/gpuinfo.c index 20a7675b6..80338bcbf 100644 --- a/utils/gpuinfo.c +++ b/old/gpuinfo.c @@ -162,7 +162,7 @@ static struct const char *attname_m; int attisminor; /* skip without -d option */ } attribute_catalog[] = { -#include "../src/device_attrs.h" +#include "device_attrs.h" }; #undef DEV_ATTR diff --git a/src/gpujoin.c b/old/gpujoin.c similarity index 100% rename from src/gpujoin.c rename to old/gpujoin.c diff --git a/src/gpupreagg.c b/old/gpupreagg.c similarity index 100% rename from src/gpupreagg.c rename to old/gpupreagg.c diff --git a/src/gpuscan.c b/old/gpuscan.c similarity index 100% rename from src/gpuscan.c rename to old/gpuscan.c diff --git a/next/heterodb_extra.h b/old/heterodb_extra.h similarity index 59% rename from next/heterodb_extra.h rename to old/heterodb_extra.h index bef3525fd..ee7464de6 100644 --- a/next/heterodb_extra.h +++ b/old/heterodb_extra.h @@ -3,8 +3,8 @@ * * Definitions of HeteroDB Extra Package * -- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2017-2023 (C) HeteroDB,Inc + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2017-2021 (C) HeteroDB,Inc * * This software is an extension of PostgreSQL; You can use, copy, * modify or distribute it under the terms of 'LICENSE' included @@ -16,7 +16,7 @@ #define HETERODB_EXTRA_FILENAME "heterodb_extra.so" #define HETERODB_EXTRA_PATHNAME "/usr/lib64/" HETERODB_EXTRA_FILENAME -#define HETERODB_EXTRA_API_VERSION 20221225 +#define HETERODB_EXTRA_API_VERSION 20211018 /* gpudirect.c */ typedef struct @@ -33,6 +33,26 @@ typedef struct strom_io_chunk ioc[1]; } strom_io_vector; +typedef struct GPUDirectFileDesc +{ + int rawfd; + void *fhandle; + size_t bytesize; + /* CUfileHandle_t is an alias of 'void *' defined at cufile.h */ +} GPUDirectFileDesc; + +/* sysfs.c */ +typedef struct +{ + int device_id; + char device_name[128]; + const char *cpu_affinity; /* __internal use__ */ + int pci_domain; /* PCI_DOMAIN_ID */ + int pci_bus_id; /* PCI_BUS_ID */ + int pci_dev_id; /* PCI_DEVICE_ID */ + int pci_func_id; /* MULTI_GPU_BOARD ? MULTI_GPU_BOARD_GROUP_ID : 0 */ +} GpuPciDevItem; + /* misc.c */ typedef struct { diff --git a/old/main.c b/old/main.c new file mode 100644 index 000000000..f5281bec8 --- /dev/null +++ b/old/main.c @@ -0,0 +1,628 @@ +/* + * main.c + * + * Entrypoint of PG-Strom extension, and misc uncategorized functions. + * ---- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License.
+ */ +#include "pg_strom.h" + +PG_MODULE_MAGIC; + +/* + * miscellaneous GUC parameters + */ +bool pgstrom_enabled; +bool pgstrom_cpu_fallback_enabled; +bool pgstrom_regression_test_mode; + +/* cost factors */ +double pgstrom_gpu_setup_cost; +double pgstrom_gpu_dma_cost; +double pgstrom_gpu_operator_cost; + +/* misc static variables */ +static HTAB *gpu_path_htable = NULL; +static planner_hook_type planner_hook_next = NULL; +static CustomPathMethods pgstrom_dummy_path_methods; +static CustomScanMethods pgstrom_dummy_plan_methods; + +/* for compatibility with shmem_request_hook on PG14 or former */ +#if PG_VERSION_NUM < 150000 +shmem_request_hook_type shmem_request_hook = NULL; +#endif + +/* misc variables */ +long PAGE_SIZE; +long PAGE_MASK; +int PAGE_SHIFT; +long PHYS_PAGES; +int pgstrom_num_users_extra = 0; +pgstromUsersExtraDescriptor pgstrom_users_extra_desc[8]; + +/* pg_strom.githash() */ +PG_FUNCTION_INFO_V1(pgstrom_githash); +Datum +pgstrom_githash(PG_FUNCTION_ARGS) +{ +#ifdef PGSTROM_GITHASH + PG_RETURN_TEXT_P(cstring_to_text(PGSTROM_GITHASH)); +#else + PG_RETURN_NULL(); +#endif +} + +static void +pgstrom_init_common_guc(void) +{ + if (cpu_only_mode()) + { + /* Disables PG-Strom features by GPU */ + DefineCustomBoolVariable("pg_strom.enabled", + "Enables the planner's use of PG-Strom", + NULL, + &pgstrom_enabled, + false, + PGC_INTERNAL, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + return; + } + /* turn on/off PG-Strom feature */ + DefineCustomBoolVariable("pg_strom.enabled", + "Enables the planner's use of PG-Strom", + NULL, + &pgstrom_enabled, + true, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* turn on/off CPU fallback if GPU could not execute the query */ + DefineCustomBoolVariable("pg_strom.cpu_fallback", + "Enables CPU fallback if GPU requires re-run", + NULL, + &pgstrom_cpu_fallback_enabled, + false, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* cost factor for Gpu setup */ + DefineCustomRealVariable("pg_strom.gpu_setup_cost", + "Cost to setup GPU device to run", + NULL, + &pgstrom_gpu_setup_cost, + 4000 * DEFAULT_SEQ_PAGE_COST, + 0, + DBL_MAX, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* cost factor for each Gpu task */ + DefineCustomRealVariable("pg_strom.gpu_dma_cost", + "Cost to send/recv data via DMA", + NULL, + &pgstrom_gpu_dma_cost, + 10 * DEFAULT_SEQ_PAGE_COST, + 0, + DBL_MAX, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* cost factor for Gpu operator */ + DefineCustomRealVariable("pg_strom.gpu_operator_cost", + "Cost of processing each operator by GPU", + NULL, + &pgstrom_gpu_operator_cost, + DEFAULT_CPU_OPERATOR_COST / 16.0, + 0, + DBL_MAX, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* disables some platform specific EXPLAIN output */ + DefineCustomBoolVariable("pg_strom.regression_test_mode", + "Disables some platform specific output in EXPLAIN; it can otherwise lead to undesired test failures, but is harmless", + NULL, + &pgstrom_regression_test_mode, + false, + PGC_USERSET, + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); +} + +/* + * GPU-aware path tracker + * + * motivation: add_path() and add_partial_path() keep only the cheapest paths. + * Once some other path dominates a GpuXXX path, it shall be wiped out, even if + * it potentially has a chance for more optimization (e.g, GpuJoin outer + * pull-up, GpuPreAgg + GpuJoin combined mode). + * So, we preserve the PG-Strom related Path-nodes for later reference.
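+ *
+ * A sketch of the intended usage (illustration only; 'joinrel' and
+ * 'gpu_path' stand for whatever relation and GPU-aware path the caller
+ * is working on):
+ *
+ *   const Path *prev = gpu_path_find_cheapest(root, joinrel,
+ *                                             false, false);
+ *   if (!prev || gpu_path->total_cost < prev->total_cost)
+ *       gpu_path_remember(root, joinrel, false, false, gpu_path);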
+ */ +typedef struct +{ + PlannerInfo *root; + Relids relids; + bool outer_parallel; + bool inner_parallel; + const Path *cheapest_gpu_path; +} gpu_path_entry; + +static uint32 +gpu_path_entry_hashvalue(const void *key, Size keysize) +{ + gpu_path_entry *gent = (gpu_path_entry *)key; + uint32 hash; + uint32 flags = 0; + + hash = hash_uint32(((uintptr_t)gent->root & 0xffffffffUL) ^ + ((uintptr_t)gent->root >> 32)); + if (gent->relids != NULL) + { + Bitmapset *relids = gent->relids; + + hash ^= hash_any((unsigned char *)relids, + offsetof(Bitmapset, words[relids->nwords])); + } + if (gent->outer_parallel) + flags |= 0x01; + if (gent->inner_parallel) + flags |= 0x02; + hash ^= hash_uint32(flags); + + return hash; +} + +static int +gpu_path_entry_compare(const void *key1, const void *key2, Size keysize) +{ + gpu_path_entry *gent1 = (gpu_path_entry *)key1; + gpu_path_entry *gent2 = (gpu_path_entry *)key2; + + if (gent1->root == gent2->root && + bms_equal(gent1->relids, gent2->relids) && + gent1->outer_parallel == gent2->outer_parallel && + gent1->inner_parallel == gent2->inner_parallel) + return 0; + /* not equal */ + return 1; +} + +static void * +gpu_path_entry_keycopy(void *dest, const void *src, Size keysize) +{ + gpu_path_entry *dent = (gpu_path_entry *)dest; + const gpu_path_entry *sent = (const gpu_path_entry *)src; + + dent->root = sent->root; + dent->relids = bms_copy(sent->relids); + dent->outer_parallel = sent->outer_parallel; + dent->inner_parallel = sent->inner_parallel; + + return dest; +} + +const Path * +gpu_path_find_cheapest(PlannerInfo *root, RelOptInfo *rel, + bool outer_parallel, + bool inner_parallel) +{ + gpu_path_entry hkey; + gpu_path_entry *gent; + + memset(&hkey, 0, sizeof(gpu_path_entry)); + hkey.root = root; + hkey.relids = rel->relids; + hkey.outer_parallel = outer_parallel; + hkey.inner_parallel = inner_parallel; + + gent = hash_search(gpu_path_htable, &hkey, HASH_FIND, NULL); + if (!gent) + return NULL; + return gent->cheapest_gpu_path; +} + +bool +gpu_path_remember(PlannerInfo *root, RelOptInfo *rel, + bool outer_parallel, + bool inner_parallel, + const Path *gpu_path) +{ + gpu_path_entry hkey; + gpu_path_entry *gent; + bool found; + + memset(&hkey, 0, sizeof(gpu_path_entry)); + hkey.root = root; + hkey.relids = rel->relids; + hkey.outer_parallel = outer_parallel; + hkey.inner_parallel = inner_parallel; + + gent = hash_search(gpu_path_htable, &hkey, HASH_ENTER, &found); + if (found) + { + /* new path is more expensive than prior one! 
*/ + if (gent->cheapest_gpu_path->total_cost < gpu_path->total_cost) + return false; + } + Assert(gent->root == root && + bms_equal(gent->relids, rel->relids) && + gent->outer_parallel == outer_parallel && + gent->inner_parallel == inner_parallel); + gent->cheapest_gpu_path = pgstrom_copy_pathnode(gpu_path); + + return true; +} + +/* + * pgstrom_create_dummy_path + */ +Path * +pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath) +{ + CustomPath *cpath = makeNode(CustomPath); + PathTarget *final_target = root->upper_targets[UPPERREL_FINAL]; + ListCell *lc1; + ListCell *lc2; + + /* sanity checks */ + if (list_length(final_target->exprs) != list_length(subpath->pathtarget->exprs)) + elog(ERROR, "CustomScan(dummy): incompatible tlist is supplied"); + forboth (lc1, final_target->exprs, + lc2, subpath->pathtarget->exprs) + { + Node *node1 = lfirst(lc1); + Node *node2 = lfirst(lc2); + + if (exprType(node1) != exprType(node2)) + elog(ERROR, "CustomScan(dummy): incompatible tlist entry: [%s] <-> [%s]", + nodeToString(node1), + nodeToString(node2)); + } + + cpath->path.pathtype = T_CustomScan; + cpath->path.parent = subpath->parent; + cpath->path.pathtarget = final_target; + cpath->path.param_info = NULL; + cpath->path.parallel_aware = subpath->parallel_aware; + cpath->path.parallel_safe = subpath->parallel_safe; + cpath->path.parallel_workers = subpath->parallel_workers; + cpath->path.pathkeys = subpath->pathkeys; + cpath->path.rows = subpath->rows; + cpath->path.startup_cost = subpath->startup_cost; + cpath->path.total_cost = subpath->total_cost; + + cpath->custom_paths = list_make1(subpath); + cpath->methods = &pgstrom_dummy_path_methods; + + return &cpath->path; +} + +/* + * pgstrom_dummy_create_plan - PlanCustomPath callback + */ +static Plan * +pgstrom_dummy_create_plan(PlannerInfo *root, + RelOptInfo *rel, + CustomPath *best_path, + List *tlist, + List *clauses, + List *custom_plans) +{ + CustomScan *cscan = makeNode(CustomScan); + + Assert(list_length(custom_plans) == 1); + cscan->scan.plan.parallel_aware = best_path->path.parallel_aware; + cscan->scan.plan.targetlist = tlist; + cscan->scan.plan.qual = NIL; + cscan->scan.plan.lefttree = linitial(custom_plans); + cscan->scan.scanrelid = 0; + cscan->custom_scan_tlist = tlist; + cscan->methods = &pgstrom_dummy_plan_methods; + + return &cscan->scan.plan; +} + +/* + * pgstrom_dummy_create_scan_state - CreateCustomScanState callback + */ +static Node * +pgstrom_dummy_create_scan_state(CustomScan *cscan) +{ + elog(ERROR, "Bug? dummy custom scan should not remain at the executor stage"); +} + +/* + * pgstrom_removal_dummy_plans + * + * Due to the interface design of the create_upper_paths_hook, some other path + * nodes can be stacked on the GpuPreAgg node, with the original final target- + * list. Even if a pair of Agg + GpuPreAgg adopted its modified target-list, + * the stacked path nodes (like sorting, window functions, ...) still consider + * that it has the original target-list. + * It makes a problem at setrefs.c when the PostgreSQL optimizer tries to replace + * the expressions with var-nodes using OUTER_VAR, because the Agg + GpuPreAgg pair + * does not have the original expression, which then leads to a "variable not found" + * error. + */ +static void +pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) +{ + Plan *plan = *p_plan; + ListCell *lc; + + Assert(plan != NULL); + switch (nodeTag(plan)) + { +#if PG_VERSION_NUM < 140000 + /* + * PG14 changed ModifyTable to use lefttree to save its subplan.
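+ * Thus, on PG14 or later, the generic plan->lefttree recursion at the
+ * bottom of this function already covers it, and the T_ModifyTable case
+ * below is compiled only for the older versions.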
+ */ + case T_ModifyTable: + { + ModifyTable *splan = (ModifyTable *) plan; + + foreach (lc, splan->plans) + pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); + } + break; +#endif + case T_Append: + { + Append *splan = (Append *) plan; + + foreach (lc, splan->appendplans) + pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); + } + break; + + case T_MergeAppend: + { + MergeAppend *splan = (MergeAppend *) plan; + + foreach (lc, splan->mergeplans) + pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); + } + break; + + case T_BitmapAnd: + { + BitmapAnd *splan = (BitmapAnd *) plan; + + foreach (lc, splan->bitmapplans) + pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); + } + break; + + case T_BitmapOr: + { + BitmapOr *splan = (BitmapOr *) plan; + + foreach (lc, splan->bitmapplans) + pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); + } + break; + + case T_SubqueryScan: + { + SubqueryScan *sscan = (SubqueryScan *) plan; + + pgstrom_removal_dummy_plans(pstmt, &sscan->subplan); + } + break; + + case T_CustomScan: + { + CustomScan *cscan = (CustomScan *) plan; + + if (cscan->methods == &pgstrom_dummy_plan_methods) + { + Plan *subplan = outerPlan(cscan); + ListCell *lc1, *lc2; + + if (list_length(cscan->scan.plan.targetlist) != + list_length(subplan->targetlist)) + elog(ERROR, "Bug? dummy plan's targetlist length mismatch"); + forboth (lc1, cscan->scan.plan.targetlist, + lc2, subplan->targetlist) + { + TargetEntry *tle1 = lfirst(lc1); + TargetEntry *tle2 = lfirst(lc2); + + if (exprType((Node *)tle1->expr) != + exprType((Node *)tle2->expr)) + elog(ERROR, "Bug? dummy TLE type mismatch [%s] [%s]", + nodeToString(tle1), + nodeToString(tle2)); + /* assign resource name */ + tle2->resname = tle1->resname; + } + *p_plan = subplan; + pgstrom_removal_dummy_plans(pstmt, p_plan); + return; + } + foreach (lc, cscan->custom_plans) + pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); + } + break; + + default: + break; + } + if (plan->lefttree) + pgstrom_removal_dummy_plans(pstmt, &plan->lefttree); + if (plan->righttree) + pgstrom_removal_dummy_plans(pstmt, &plan->righttree); +} + +/* + * pgstrom_post_planner + */ +static PlannedStmt * +pgstrom_post_planner(Query *parse, +#if PG_VERSION_NUM >= 130000 + const char *query_string, +#endif + int cursorOptions, + ParamListInfo boundParams) +{ + HTAB *gpu_path_htable_saved = gpu_path_htable; + PlannedStmt *pstmt; + ListCell *lc; + + PG_TRY(); + { + HASHCTL hctl; + + /* make hash-table to preserve GPU-aware path-nodes */ + memset(&hctl, 0, sizeof(HASHCTL)); + hctl.hcxt = CurrentMemoryContext; + hctl.keysize = offsetof(gpu_path_entry, cheapest_gpu_path); + hctl.entrysize = sizeof(gpu_path_entry); + hctl.hash = gpu_path_entry_hashvalue; + hctl.match = gpu_path_entry_compare; + hctl.keycopy = gpu_path_entry_keycopy; + gpu_path_htable = hash_create("GPU-aware Path-nodes table", + 512, + &hctl, + HASH_CONTEXT | + HASH_ELEM | + HASH_FUNCTION | + HASH_COMPARE | + HASH_KEYCOPY); + pstmt = planner_hook_next(parse, +#if PG_VERSION_NUM >= 130000 + query_string, +#endif + cursorOptions, + boundParams); + } + PG_CATCH(); + { + hash_destroy(gpu_path_htable); + gpu_path_htable = gpu_path_htable_saved; + PG_RE_THROW(); + } + PG_END_TRY(); + hash_destroy(gpu_path_htable); + gpu_path_htable = gpu_path_htable_saved; + + pgstrom_removal_dummy_plans(pstmt, &pstmt->planTree); + foreach (lc, pstmt->subplans) + pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); + + return pstmt; +} + +/* + * Routines to support user's extra GPU logic + */ +uint32
+pgstrom_register_users_extra(const pgstromUsersExtraDescriptor *__desc) +{ + pgstromUsersExtraDescriptor *desc; + const char *extra_name; + uint32 extra_flags; + + if (pgstrom_num_users_extra >= 7) + elog(ERROR, "too many PG-Strom users' extra modules are registered"); + if (__desc->magic != PGSTROM_USERS_EXTRA_MAGIC_V1) + elog(ERROR, "magic number of pgstromUsersExtraDescriptor mismatch"); + if (__desc->pg_version / 100 != PG_MAJOR_VERSION) + elog(ERROR, "PG-Strom Users Extra is built for PostgreSQL %u", __desc->pg_version); + + extra_name = strdup(__desc->extra_name); + if (!extra_name) + elog(ERROR, "out of memory"); + extra_flags = (1U << (pgstrom_num_users_extra + 24)); + + desc = &pgstrom_users_extra_desc[pgstrom_num_users_extra++]; + memcpy(desc, __desc, sizeof(pgstromUsersExtraDescriptor)); + desc->extra_flags = extra_flags; + desc->extra_name = extra_name; + elog(LOG, "PG-Strom users' extra [%s] registered", extra_name); + + return extra_flags; +} + +/* + * _PG_init + * + * Main entrypoint of PG-Strom. It shall be invoked only once while the + * postmaster process is starting up; then it calls the initialization + * routines of each sub-system. + */ +void +_PG_init(void) +{ + /* + * PG-Strom has to be loaded using shared_preload_libraries option + */ + if (!process_shared_preload_libraries_in_progress) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("PG-Strom must be loaded via shared_preload_libraries"))); + + /* init misc variables */ + PAGE_SIZE = sysconf(_SC_PAGESIZE); + PAGE_MASK = PAGE_SIZE - 1; + PAGE_SHIFT = get_next_log2(PAGE_SIZE); + PHYS_PAGES = sysconf(_SC_PHYS_PAGES); + + /* load NVIDIA/HeteroDB related stuff, if any */ + pgstrom_init_nvrtc(); + pgstrom_init_extra(); + + /* dump version number */ + elog(LOG, "PG-Strom version %s built for PostgreSQL %s (git: %s)", + PGSTROM_VERSION, + PG_MAJORVERSION, + PGSTROM_GITHASH); + + /* init GPU/CUDA infrastructure */ + pgstrom_init_shmbuf(); + pgstrom_init_gpu_device(); + pgstrom_init_gpu_mmgr(); + pgstrom_init_gpu_context(); + pgstrom_init_cuda_program(); + pgstrom_init_codegen(); + + /* init custom-scan providers/FDWs */ + pgstrom_init_common_guc(); + pgstrom_init_gputasks(); + pgstrom_init_gpuscan(); + pgstrom_init_gpujoin(); + pgstrom_init_gpupreagg(); + pgstrom_init_relscan(); + pgstrom_init_arrow_fdw(); + pgstrom_init_gpu_cache(); + +#if PG_VERSION_NUM < 150000 + /* + * PG15 enforces that additional shared memory requests are made in the + * 'shmem_request_hook', but PG14 and former don't have such + * infrastructure. So, we provide our own hook with the same name and + * definition. + */ + if (shmem_request_hook) + shmem_request_hook(); +#endif + + /* dummy custom-scan node */ + memset(&pgstrom_dummy_path_methods, 0, sizeof(CustomPathMethods)); + pgstrom_dummy_path_methods.CustomName = "Dummy"; + pgstrom_dummy_path_methods.PlanCustomPath + = pgstrom_dummy_create_plan; + + memset(&pgstrom_dummy_plan_methods, 0, sizeof(CustomScanMethods)); + pgstrom_dummy_plan_methods.CustomName = "Dummy"; + pgstrom_dummy_plan_methods.CreateCustomScanState + = pgstrom_dummy_create_scan_state; + + /* planner hook registration */ + planner_hook_next = (planner_hook ? planner_hook : standard_planner); + planner_hook = pgstrom_post_planner; +} diff --git a/next/misc.c b/old/misc.c similarity index 68% rename from next/misc.c rename to old/misc.c index f7812de0d..c5e89bf7b 100644 --- a/next/misc.c +++ b/old/misc.c @@ -4,14 +4,44 @@ * miscellaneous and uncategorized routines but useful for multiple subsystems * of PG-Strom.
* ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. */ #include "pg_strom.h" +/* + * make_flat_ands_explicit - similar to make_ands_explicit, but it pulls up + * the underlying AND-clauses + */ +Expr * +make_flat_ands_explicit(List *andclauses) +{ + List *args = NIL; + ListCell *lc; + + if (andclauses == NIL) + return (Expr *) makeBoolConst(true, false); + else if (list_length(andclauses) == 1) + return (Expr *) linitial(andclauses); + + foreach (lc, andclauses) + { + Expr *expr = lfirst(lc); + + Assert(exprType((Node *)expr) == BOOLOID); + if (IsA(expr, BoolExpr) && + ((BoolExpr *)expr)->boolop == AND_EXPR) + args = list_concat(args, ((BoolExpr *) expr)->args); + else + args = lappend(args, expr); + } + Assert(list_length(args) > 1); + return make_andclause(args); +} + /* * fixup_varnode_to_origin */ @@ -39,7 +69,6 @@ fixup_varnode_to_origin(Node *node, List *cscan_tlist) (void *)cscan_tlist); } -#if 0 /* * find_appinfos_by_relids_nofail * @@ -151,40 +180,110 @@ get_parallel_divisor(Path *path) } return parallel_divisor; } -#endif /* - * append a binary chunk at the aligned block + * Useful wrapper routines like lsyscache.c */ -int -__appendBinaryStringInfo(StringInfo buf, const void *data, int datalen) +#if PG_VERSION_NUM < 110000 +char +get_func_prokind(Oid funcid) { - static uint64_t __zero = 0; - int padding = (MAXALIGN(buf->len) - buf->len); - int pos; - - if (padding > 0) - appendBinaryStringInfo(buf, (char *)&__zero, padding); - pos = buf->len; - appendBinaryStringInfo(buf, data, datalen); - return pos; + HeapTuple tup; + Form_pg_proc procForm; + char prokind; + + tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for function %u", funcid); + procForm = (Form_pg_proc) GETSTRUCT(tup); + if (procForm->proisagg) + { + Assert(!procForm->proiswindow); + prokind = PROKIND_AGGREGATE; + } + else if (procForm->proiswindow) + { + Assert(!procForm->proisagg); + prokind = PROKIND_WINDOW; + } + else + { + prokind = PROKIND_FUNCTION; + } + ReleaseSysCache(tup); + + return prokind; } +#endif /* < PG11 */ + +/* + * get_relnatts - it returns the number of attributes of the relation + */ +int16 +get_relnatts(Oid relid) +{ + HeapTuple tup; + int16 relnatts = -1; + + tup = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (HeapTupleIsValid(tup)) + { + relnatts = ((Form_pg_class) GETSTRUCT(tup))->relnatts; + ReleaseSysCache(tup); + } + return relnatts; +} + +/* + * get_function_oid + */ +Oid +get_function_oid(const char *func_name, + oidvector *func_args, + Oid namespace_oid, + bool missing_ok) { - static uint64_t __zero = 0; - int padding = (MAXALIGN(buf->len) - buf->len); - int pos; - - if (padding > 0) - appendBinaryStringInfo(buf, (char *)&__zero, padding); - pos = buf->len; - enlargeStringInfo(buf, nbytes); - memset(buf->data + pos, 0, nbytes); - buf->len += nbytes; - - return pos; + Oid func_oid; + + func_oid = GetSysCacheOid3(PROCNAMEARGSNSP, +#if PG_VERSION_NUM >= 120000 + Anum_pg_proc_oid, +#endif + CStringGetDatum(func_name), + PointerGetDatum(func_args), + ObjectIdGetDatum(namespace_oid)); + if (!missing_ok && !OidIsValid(func_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("function %s is not defined", + funcname_signature_string(func_name, + func_args->dim1, + NIL, + func_args->values)))); + return func_oid; +} + +/* + * get_type_oid + */ +Oid +get_type_oid(const char *type_name, + Oid namespace_oid, + bool missing_ok) +{ + Oid type_oid; + + type_oid = GetSysCacheOid2(TYPENAMENSP, +#if PG_VERSION_NUM >= 120000 + Anum_pg_type_oid, +#endif +
CStringGetDatum(type_name), + ObjectIdGetDatum(namespace_oid)); + if (!missing_ok && !OidIsValid(type_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("type %s is not defined", type_name))); + + return type_oid; } /* @@ -210,25 +309,102 @@ get_type_name(Oid type_oid, bool missing_ok) } /* - * get_relation_am + * get_proc_library + */ +char * +get_proc_library(HeapTuple protup) +{ + Form_pg_proc proc = (Form_pg_proc)GETSTRUCT(protup); + + if (proc->prolang == ClanguageId) + { + Datum datum; + bool isnull; + + datum = SysCacheGetAttr(PROCOID, protup, + Anum_pg_proc_probin, + &isnull); + if (!isnull) + return TextDatumGetCString(datum); + } + else if (proc->prolang != INTERNALlanguageId && + proc->prolang != SQLlanguageId) + { + return (void *)(~0UL); + } + return NULL; +} + +/* + * get_object_extension_oid */ Oid -get_relation_am(Oid rel_oid, bool missing_ok) +get_object_extension_oid(Oid class_id, + Oid object_id, + int32 objsub_id, + bool missing_ok) { + Relation drel; + ScanKeyData skeys[3]; + SysScanDesc sscan; HeapTuple tup; - Oid relam; - - tup = SearchSysCache1(RELOID, ObjectIdGetDatum(rel_oid)); - if (!HeapTupleIsValid(tup)) + Oid ext_oid = InvalidOid; + + drel = table_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&skeys[0], + Anum_pg_depend_classid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(class_id)); + ScanKeyInit(&skeys[1], + Anum_pg_depend_objid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(object_id)); + ScanKeyInit(&skeys[2], + Anum_pg_depend_objsubid, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(objsub_id)); + sscan = systable_beginscan(drel, DependDependerIndexId, true, + NULL, 3, skeys); + while (HeapTupleIsValid(tup = systable_getnext(sscan))) { - if (!missing_ok) - elog(ERROR, "cache lookup failed for relation %u", rel_oid); - return InvalidOid; + Form_pg_depend dep = (Form_pg_depend) GETSTRUCT(tup); + + if (dep->refclassid == ExtensionRelationId && + dep->refobjsubid == 0 && + (dep->deptype == DEPENDENCY_EXTENSION || + dep->deptype == DEPENDENCY_AUTO_EXTENSION)) + { + ext_oid = dep->refobjid; + break; + } } - relam = ((Form_pg_class) GETSTRUCT(tup))->relam; - ReleaseSysCache(tup); + systable_endscan(sscan); + table_close(drel, AccessShareLock); - return relam; + if (!missing_ok && !OidIsValid(ext_oid)) + elog(ERROR, "couldn't find out references (class:%u, objid:%u, subid:%d) by pg_extension at pg_depend", + class_id, object_id, objsub_id); + + return ext_oid; +} + +/* + * bms_to_cstring - human readable Bitmapset + */ +char * +bms_to_cstring(Bitmapset *bms) +{ + StringInfoData buf; + int bit = -1; + + initStringInfo(&buf); + appendStringInfo(&buf, "{"); + while ((bit = bms_next_member(bms, bit)) >= 0) + appendStringInfo(&buf, " %d", bit); + appendStringInfo(&buf, " }"); + + return buf.data; } /* @@ -262,33 +438,6 @@ bms_from_pglist(List *pglist) return bms; } -Float * -__makeFloat(double fval) -{ - return makeFloat(psprintf("%e", fval)); -} - -Const * -__makeByteaConst(bytea *data) -{ - return makeConst(BYTEAOID, - -1, - InvalidOid, - -1, - PointerGetDatum(data), - data == NULL, - false); -} - -bytea * -__getByteaConst(Const *con) -{ - Assert(IsA(con, Const) && con->consttype == BYTEAOID); - - return (con->constisnull ? 
NULL : DatumGetByteaP(con->constvalue)); -} - -#if 0 /* * pathnode_tree_walker */ @@ -368,18 +517,22 @@ pathnode_tree_walker(Path *node, if (walker(((GatherPath *)node)->subpath, context)) return true; break; +#if PG_VERSION_NUM >= 100000 case T_GatherMergePath: if (walker(((GatherMergePath *)node)->subpath, context)) return true; break; +#endif /* >= PG10 */ case T_ProjectionPath: if (walker(((ProjectionPath *)node)->subpath, context)) return true; break; +#if PG_VERSION_NUM >= 100000 case T_ProjectSetPath: if (walker(((ProjectSetPath *)node)->subpath, context)) return true; break; +#endif /* >= PG10 */ case T_SortPath: if (walker(((SortPath *)node)->subpath, context)) return true; @@ -475,7 +628,6 @@ pathtree_has_parallel_aware(Path *node) { return __pathtree_has_parallel_aware(node, NULL); } -#endif /* * pgstrom_copy_pathnode @@ -533,17 +685,21 @@ pgstrom_copy_pathnode(const Path *pathnode) return &b->path; } case T_CustomPath: + if (pgstrom_path_is_gpuscan(pathnode)) + return pgstrom_copy_gpuscan_path(pathnode); + else if (pgstrom_path_is_gpujoin(pathnode)) + return pgstrom_copy_gpujoin_path(pathnode); + else if (pgstrom_path_is_gpupreagg(pathnode)) + return pgstrom_copy_gpupreagg_path(pathnode); + else { CustomPath *a = (CustomPath *)pathnode; CustomPath *b = pmemdup(a, sizeof(CustomPath)); List *subpaths = NIL; ListCell *lc; - foreach (lc, a->custom_paths) - { - Path *sp = pgstrom_copy_pathnode(lfirst(lc)); - subpaths = lappend(subpaths, sp); - } + subpaths = lappend(subpaths, + pgstrom_copy_pathnode(lfirst(lc))); b->custom_paths = subpaths; return &b->path; } @@ -579,8 +735,13 @@ pgstrom_copy_pathnode(const Path *pathnode) b->subpaths = subpaths; return &b->path; } +#if PG_VERSION_NUM < 120000 + case T_ResultPath: + return pmemdup(pathnode, sizeof(ResultPath)); +#else case T_GroupResultPath: return pmemdup(pathnode, sizeof(GroupResultPath)); +#endif case T_MaterialPath: { MaterialPath *a = (MaterialPath *)pathnode; @@ -588,13 +749,6 @@ pgstrom_copy_pathnode(const Path *pathnode) b->subpath = pgstrom_copy_pathnode(a->subpath); return &b->path; } - case T_MemoizePath: - { - MemoizePath *a = (MemoizePath *)pathnode; - MemoizePath *b = pmemdup(a, sizeof(MemoizePath)); - b->subpath = pgstrom_copy_pathnode(a->subpath); - return &b->path; - } case T_UniquePath: { UniquePath *a = (UniquePath *)pathnode; @@ -700,7 +854,16 @@ pgstrom_copy_pathnode(const Path *pathnode) { ModifyTablePath *a = (ModifyTablePath *)pathnode; ModifyTablePath *b = pmemdup(a, sizeof(ModifyTablePath)); +#if PG_VERSION_NUM < 140000 + List *subpaths = NIL; + ListCell *lc; + foreach (lc, a->subpaths) + subpaths = lappend(subpaths, + pgstrom_copy_pathnode(lfirst(lc))); + b->subpaths = subpaths; +#else b->subpath = pgstrom_copy_pathnode(a->subpath); +#endif return &b->path; } case T_LimitPath: @@ -716,44 +879,30 @@ pgstrom_copy_pathnode(const Path *pathnode) return NULL; } -#if 0 /* - * pgstrom_define_shell_type - A wrapper for TypeShellMake with a particular OID + * errorText - string form of the error code */ -PG_FUNCTION_INFO_V1(pgstrom_define_shell_type); -Datum -pgstrom_define_shell_type(PG_FUNCTION_ARGS) +const char * +errorText(int errcode) { - char *type_name = text_to_cstring(PG_GETARG_TEXT_PP(0)); - Oid type_oid = PG_GETARG_OID(1); - Oid type_namespace = PG_GETARG_OID(2); - bool __IsBinaryUpgrade = IsBinaryUpgrade; - Oid __binary_upgrade_next_pg_type_oid = binary_upgrade_next_pg_type_oid; + static __thread char buffer[160]; + const char *error_name; + const char *error_desc; - if (!superuser()) - ereport(ERROR, - 
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to create a shell type"))); - PG_TRY(); + if (errcode >= 0 && errcode <= CUDA_ERROR_UNKNOWN) { - IsBinaryUpgrade = true; - binary_upgrade_next_pg_type_oid = type_oid; - - TypeShellMake(type_name, type_namespace, GetUserId()); - } - PG_CATCH(); - { - IsBinaryUpgrade = __IsBinaryUpgrade; - binary_upgrade_next_pg_type_oid = __binary_upgrade_next_pg_type_oid; - PG_RE_THROW(); + if (cuGetErrorName(errcode, &error_name) == CUDA_SUCCESS && + cuGetErrorString(errcode, &error_desc) == CUDA_SUCCESS) + { + snprintf(buffer, sizeof(buffer), "%s - %s", + error_name, error_desc); + return buffer; + } } - PG_END_TRY(); - IsBinaryUpgrade = __IsBinaryUpgrade; - binary_upgrade_next_pg_type_oid = __binary_upgrade_next_pg_type_oid; - - PG_RETURN_OID(type_oid); + snprintf(buffer, sizeof(buffer), + "%d - unknown", errcode); + return buffer; } -#endif /* * ---------------------------------------------------------------- @@ -797,15 +946,15 @@ pgstrom_random_setseed(PG_FUNCTION_ARGS) } PG_FUNCTION_INFO_V1(pgstrom_random_setseed); -static int64_t +static cl_long __random(void) { if (!pgstrom_random_seed_set) { - pgstrom_random_seed = (unsigned int)MyProcPid ^ 0xdeadbeafU; + pgstrom_random_seed = (unsigned int)MyProcPid ^ 0xdeadbeaf; pgstrom_random_seed_set = true; } - return (uint64_t)rand_r(&pgstrom_random_seed); + return (cl_ulong)rand_r(&pgstrom_random_seed); } static inline double @@ -830,7 +979,7 @@ pgstrom_random_int(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); int64 lower = (!PG_ARGISNULL(1) ? PG_GETARG_INT64(1) : 0); int64 upper = (!PG_ARGISNULL(2) ? PG_GETARG_INT64(2) : INT_MAX); - uint64_t v; + cl_ulong v; if (upper < lower) elog(ERROR, "%s: lower bound is larger than upper", __FUNCTION__); @@ -868,7 +1017,7 @@ pgstrom_random_date(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); DateADT lower; DateADT upper; - uint64_t v; + cl_ulong v; if (!PG_ARGISNULL(1)) lower = PG_GETARG_DATEADT(1); @@ -897,7 +1046,7 @@ pgstrom_random_time(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); TimeADT lower = 0; TimeADT upper = HOURS_PER_DAY * USECS_PER_HOUR - 1; - uint64_t v; + cl_ulong v; if (!PG_ARGISNULL(1)) lower = PG_GETARG_TIMEADT(1); @@ -922,7 +1071,7 @@ pgstrom_random_timetz(PG_FUNCTION_ARGS) TimeADT lower = 0; TimeADT upper = HOURS_PER_DAY * USECS_PER_HOUR - 1; TimeTzADT *temp; - uint64_t v; + cl_ulong v; if (!PG_ARGISNULL(1)) lower = PG_GETARG_TIMEADT(1); @@ -951,7 +1100,7 @@ pgstrom_random_timestamp(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); Timestamp lower; Timestamp upper; - uint64_t v; + cl_ulong v; struct pg_tm tm; if (!PG_ARGISNULL(1)) @@ -990,18 +1139,18 @@ pgstrom_random_macaddr(PG_FUNCTION_ARGS) { float8 ratio = (!PG_ARGISNULL(0) ? 
PG_GETARG_FLOAT8(0) : 0.0); macaddr *temp; - uint64_t lower; - uint64_t upper; - uint64_t v, x; + cl_ulong lower; + cl_ulong upper; + cl_ulong v, x; if (PG_ARGISNULL(1)) lower = 0xabcd00000000UL; else { temp = PG_GETARG_MACADDR_P(1); - lower = (((uint64_t)temp->a << 40) | ((uint64_t)temp->b << 32) | - ((uint64_t)temp->c << 24) | ((uint64_t)temp->d << 16) | - ((uint64_t)temp->e << 8) | ((uint64_t)temp->f)); + lower = (((cl_ulong)temp->a << 40) | ((cl_ulong)temp->b << 32) | + ((cl_ulong)temp->c << 24) | ((cl_ulong)temp->d << 16) | + ((cl_ulong)temp->e << 8) | ((cl_ulong)temp->f)); } if (PG_ARGISNULL(2)) @@ -1009,9 +1158,9 @@ pgstrom_random_macaddr(PG_FUNCTION_ARGS) else { temp = PG_GETARG_MACADDR_P(2); - upper = (((uint64_t)temp->a << 40) | ((uint64_t)temp->b << 32) | - ((uint64_t)temp->c << 24) | ((uint64_t)temp->d << 16) | - ((uint64_t)temp->e << 8) | ((uint64_t)temp->f)); + upper = (((cl_ulong)temp->a << 40) | ((cl_ulong)temp->b << 32) | + ((cl_ulong)temp->c << 24) | ((cl_ulong)temp->d << 16) | + ((cl_ulong)temp->e << 8) | ((cl_ulong)temp->f)); } if (upper < lower) @@ -1042,7 +1191,7 @@ pgstrom_random_inet(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); inet *temp; int i, j, bits; - uint64_t v; + cl_ulong v; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1075,7 +1224,7 @@ pgstrom_random_inet(PG_FUNCTION_ARGS) temp->inet_data.ipaddr[i--] = (v & 0xff); else { - uint32_t mask = (1 << bits) - 1; + cl_uint mask = (1 << bits) - 1; temp->inet_data.ipaddr[i] &= ~(mask); temp->inet_data.ipaddr[i] |= (v & mask); @@ -1097,7 +1246,7 @@ pgstrom_random_text(PG_FUNCTION_ARGS) text *temp; char *pos; int i, j, n; - uint64_t v; + cl_ulong v; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1135,11 +1284,11 @@ pgstrom_random_text_length(PG_FUNCTION_ARGS) "abcdefghijklmnopqrstuvwxyz" "0123456789+/"; float8 ratio = (!PG_ARGISNULL(0) ? 
PG_GETARG_FLOAT8(0) : 0.0); - int32_t maxlen; + cl_int maxlen; text *temp; char *pos; int i, j, n; - uint64_t v = 0; + cl_ulong v = 0; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1201,12 +1350,7 @@ pgstrom_random_int4range(PG_FUNCTION_ARGS) if (generate_null(ratio)) PG_RETURN_NULL(); - type_oid = GetSysCacheOid2(TYPENAMENSP, - Anum_pg_type_oid, - CStringGetDatum("int4range"), - ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); - if (!OidIsValid(type_oid)) - elog(ERROR, "type 'int4range' is not defined"); + type_oid = get_type_oid("int4range", PG_CATALOG_NAMESPACE, false); typcache = range_get_typcache(fcinfo, type_oid); x = lower + __random() % (upper - lower); y = lower + __random() % (upper - lower); @@ -1228,12 +1372,7 @@ pgstrom_random_int8range(PG_FUNCTION_ARGS) if (generate_null(ratio)) PG_RETURN_NULL(); - type_oid = GetSysCacheOid2(TYPENAMENSP, - Anum_pg_type_oid, - CStringGetDatum("int8range"), - ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); - if (!OidIsValid(type_oid)) - elog(ERROR, "type 'int8range' is not defined"); + type_oid = get_type_oid("int8range", PG_CATALOG_NAMESPACE, false); typcache = range_get_typcache(fcinfo, type_oid); v = (__random() << 31) | __random(); x = lower + v % (upper - lower); @@ -1255,7 +1394,7 @@ pgstrom_random_tsrange(PG_FUNCTION_ARGS) TypeCacheEntry *typcache; Oid type_oid; Timestamp x, y; - uint64_t v; + cl_ulong v; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1280,12 +1419,8 @@ pgstrom_random_tsrange(PG_FUNCTION_ARGS) } if (upper < lower) elog(ERROR, "%s: lower bound is larger than upper", __FUNCTION__); - type_oid = GetSysCacheOid2(TYPENAMENSP, - Anum_pg_type_oid, - CStringGetDatum("tsrange"), - ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); - if (!OidIsValid(type_oid)) - elog(ERROR, "type 'tsrange' is not defined"); + + type_oid = get_type_oid("tsrange", PG_CATALOG_NAMESPACE, false); typcache = range_get_typcache(fcinfo, type_oid); v = (__random() << 31) | __random(); x = lower + v % (upper - lower); @@ -1307,7 +1442,7 @@ pgstrom_random_tstzrange(PG_FUNCTION_ARGS) TypeCacheEntry *typcache; Oid type_oid; Timestamp x, y; - uint64_t v; + cl_ulong v; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1332,12 +1467,8 @@ pgstrom_random_tstzrange(PG_FUNCTION_ARGS) } if (upper < lower) elog(ERROR, "%s: lower bound is larger than upper", __FUNCTION__); - type_oid = GetSysCacheOid2(TYPENAMENSP, - Anum_pg_type_oid, - CStringGetDatum("tstzrange"), - ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); - if (!OidIsValid(type_oid)) - elog(ERROR, "type 'tstzrange' is not defined"); + + type_oid = get_type_oid("tstzrange", PG_CATALOG_NAMESPACE, false); typcache = range_get_typcache(fcinfo, type_oid); v = (__random() << 31) | __random(); x = lower + v % (upper - lower); @@ -1372,12 +1503,7 @@ pgstrom_random_daterange(PG_FUNCTION_ARGS) if (upper < lower) elog(ERROR, "%s: lower bound is larger than upper", __FUNCTION__); - type_oid = GetSysCacheOid2(TYPENAMENSP, - Anum_pg_type_oid, - CStringGetDatum("daterange"), - ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); - if (!OidIsValid(type_oid)) - elog(ERROR, "type 'daterange' is not defined"); + type_oid = get_type_oid("daterange", PG_CATALOG_NAMESPACE, false); typcache = range_get_typcache(fcinfo, type_oid); x = lower + __random() % (upper - lower); y = lower + __random() % (upper - lower); @@ -1483,21 +1609,10 @@ __pwriteFile(int fdesc, const void *buffer, size_t nbytes, off_t f_pos) return count; } -/* ---------------------------------------------------------------- - * - * shared memory and mmap/munmap routines - * - * 
---------------------------------------------------------------- +/* + * mmap/munmap wrappers; the mapped regions are automatically unmapped + * when the owning resource-owner is released. */ -#define IS_POSIX_SHMEM 0x80000000U -typedef struct -{ - uint32_t shmem_handle; - int shmem_fdesc; - char shmem_name[MAXPGPATH]; - ResourceOwner owner; -} shmemEntry; - typedef struct { void *mmap_addr; @@ -1506,54 +1621,18 @@ typedef struct int mmap_flags; ResourceOwner owner; } mmapEntry; - -static HTAB *shmem_tracker_htab = NULL; static HTAB *mmap_tracker_htab = NULL; -static void -cleanup_shmem_chunks(ResourceReleasePhase phase, - bool isCommit, - bool isTopLevel, - void *arg) -{ - if (phase == RESOURCE_RELEASE_AFTER_LOCKS && - shmem_tracker_htab && - hash_get_num_entries(shmem_tracker_htab) > 0) - { - HASH_SEQ_STATUS seq; - shmemEntry *entry; - - hash_seq_init(&seq, shmem_tracker_htab); - while ((entry = hash_seq_search(&seq)) != NULL) - { - if (entry->owner != CurrentResourceOwner) - continue; - if (isCommit) - elog(WARNING, "shared-memory '%s' leaks, and still alive", - entry->shmem_name); - if (unlink(entry->shmem_name) != 0) - elog(WARNING, "failed on unlink('%s'): %m", entry->shmem_name); - if (close(entry->shmem_fdesc) != 0) - elog(WARNING, "failed on close('%s'): %m", entry->shmem_name); - hash_search(shmem_tracker_htab, - &entry->shmem_handle, - HASH_REMOVE, - NULL); - } - } -} - static void cleanup_mmap_chunks(ResourceReleasePhase phase, bool isCommit, bool isTopLevel, void *arg) { - if (phase == RESOURCE_RELEASE_AFTER_LOCKS && - mmap_tracker_htab && + if (mmap_tracker_htab && hash_get_num_entries(mmap_tracker_htab) > 0) { - HASH_SEQ_STATUS seq; + HASH_SEQ_STATUS seq; mmapEntry *entry; hash_seq_init(&seq, mmap_tracker_htab); @@ -1577,125 +1656,15 @@ cleanup_mmap_chunks(ResourceReleasePhase phase, } } -uint32_t -__shmemCreate(const DpuStorageEntry *ds_entry) -{ - static uint my_random_seed = 0; - const char *shmem_dir = "/dev/shm"; - int fdesc; - uint32_t handle; - char namebuf[MAXPGPATH]; - size_t off = 0; - - if (!shmem_tracker_htab) - { - HASHCTL hctl; - - my_random_seed = (uint)MyProcPid ^ 0xcafebabeU; - - memset(&hctl, 0, sizeof(HASHCTL)); - hctl.keysize = sizeof(uint32_t); - hctl.entrysize = sizeof(shmemEntry); - shmem_tracker_htab = hash_create("shmem_tracker_htab", - 256, - &hctl, - HASH_ELEM | HASH_BLOBS); - RegisterResourceReleaseCallback(cleanup_shmem_chunks, 0); - } - - if (ds_entry) - shmem_dir = DpuStorageEntryBaseDir(ds_entry); - off = snprintf(namebuf, sizeof(namebuf), "%s/", shmem_dir); - do { - handle = rand_r(&my_random_seed); - if (handle == 0) - continue; - /* to avoid hash conflict */ - if (!shmem_dir) - handle |= IS_POSIX_SHMEM; - else - handle &= ~IS_POSIX_SHMEM; - - snprintf(namebuf + off, sizeof(namebuf) - off, - ".pgstrom_shmbuf_%u_%d", - PostPortNumber, handle); - fdesc = open(namebuf, O_RDWR | O_CREAT | O_EXCL, 0600); - if (fdesc < 0 && errno != EEXIST) - elog(ERROR, "failed on open('%s'): %m", namebuf); - } while (fdesc < 0); - - PG_TRY(); - { - shmemEntry *entry; - bool found; - - entry = hash_search(shmem_tracker_htab, - &handle, - HASH_ENTER, - &found); - if (found) - elog(ERROR, "Bug?
duplicated shmem entry"); - entry->shmem_handle = handle; - entry->shmem_fdesc = fdesc; - strcpy(entry->shmem_name, namebuf); - entry->owner = CurrentResourceOwner; - } - PG_CATCH(); - { - if (close(fdesc) != 0) - elog(WARNING, "failed on close('%s'): %m", namebuf); - if (unlink(namebuf) != 0) - elog(WARNING, "failed on unlink('%s'): %m", namebuf); - PG_RE_THROW(); - } - PG_END_TRY(); - - return handle; -} - -void -__shmemDrop(uint32_t shmem_handle) -{ - if (shmem_tracker_htab) - { - shmemEntry *entry; - - entry = hash_search(shmem_tracker_htab, - &shmem_handle, - HASH_REMOVE, - NULL); - if (entry) - { - if (unlink(entry->shmem_name) != 0) - elog(WARNING, "failed on unlink('%s'): %m", entry->shmem_name); - if (close(entry->shmem_fdesc) != 0) - elog(WARNING, "failed on close('%s'): %m", entry->shmem_name); - return; - } - } - elog(ERROR, "failed on __shmemDrop - no such segment (%u)", shmem_handle); -} - void * -__mmapShmem(uint32_t shmem_handle, - size_t shmem_length, - const DpuStorageEntry *ds_entry) +__mmapFile(void *addr, size_t length, + int prot, int flags, int fdesc, off_t offset) { - void *mmap_addr = MAP_FAILED; - size_t mmap_size = TYPEALIGN(PAGE_SIZE, shmem_length); - int mmap_prot = PROT_READ | PROT_WRITE; - int mmap_flags = MAP_SHARED; - mmapEntry *mmap_entry = NULL; - shmemEntry *shmem_entry = NULL; - int fdesc = -1; - const char *shmem_dir = "/dev/shm"; - const char *fname = NULL; - struct stat stat_buf; + void *mmap_addr; + size_t mmap_size = TYPEALIGN(PAGE_SIZE, length); + mmapEntry *entry; bool found; - char namebuf[MAXPGPATH]; - if (ds_entry) - shmem_dir = DpuStorageEntryBaseDir(ds_entry); if (!mmap_tracker_htab) { HASHCTL hctl; @@ -1703,83 +1672,35 @@ __mmapShmem(uint32_t shmem_handle, memset(&hctl, 0, sizeof(HASHCTL)); hctl.keysize = sizeof(void *); hctl.entrysize = sizeof(mmapEntry); + hctl.hcxt = CacheMemoryContext; mmap_tracker_htab = hash_create("mmap_tracker_htab", 256, &hctl, - HASH_ELEM | HASH_BLOBS); + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); RegisterResourceReleaseCallback(cleanup_mmap_chunks, 0); } - - if (shmem_tracker_htab) - { - shmem_entry = hash_search(shmem_tracker_htab, - &shmem_handle, - HASH_FIND, - NULL); - if (shmem_entry) - { - size_t len = strlen(shmem_dir); - - if (strncmp(shmem_entry->shmem_name, shmem_dir, len) != 0 || - shmem_entry->shmem_name[len] != '/') - elog(ERROR, "Bug? shmem_dir mismatch '%s'", shmem_dir); - fdesc = shmem_entry->shmem_fdesc; - fname = shmem_entry->shmem_name; - } - } - if (fdesc < 0) - { - snprintf(namebuf, sizeof(namebuf), - "%s/.pgstrom_shmbuf_%u_%d", - shmem_dir, PostPortNumber, shmem_handle); - fdesc = open(namebuf, O_RDWR, 0600); - if (fdesc < 0) - elog(ERROR, "failed on open('%s'): %m", namebuf); - fname = namebuf; - } - + mmap_addr = mmap(addr, mmap_size, prot, flags, fdesc, offset); + if (mmap_addr == MAP_FAILED) + return MAP_FAILED; PG_TRY(); { - if (fstat(fdesc, &stat_buf) != 0) - elog(ERROR, "failed on fstat('%s'): %m", fname); - if (stat_buf.st_size < mmap_size) - { - while (fallocate(fdesc, 0, 0, mmap_size) != 0) - { - if (errno != EINTR) - elog(ERROR, "failed on fallocate('%s', %lu): %m", - fname, mmap_size); - } - } - mmap_addr = mmap(NULL, mmap_size, mmap_prot, mmap_flags, fdesc, 0); - if (mmap_addr == MAP_FAILED) - elog(ERROR, "failed on mmap(2): %m"); - - mmap_entry = hash_search(mmap_tracker_htab, - &mmap_addr, - HASH_ENTER, - &found); + entry = hash_search(mmap_tracker_htab, + &mmap_addr, + HASH_ENTER, + &found); if (found) elog(ERROR, "Bug? 
duplicated mmap entry"); - Assert(mmap_entry->mmap_addr == mmap_addr); - mmap_entry->mmap_size = mmap_size; - mmap_entry->mmap_prot = mmap_prot; - mmap_entry->mmap_flags = mmap_flags; - mmap_entry->owner = CurrentResourceOwner; - - if (!shmem_entry) - close(fdesc); + Assert(entry->mmap_addr == mmap_addr); + entry->mmap_size = mmap_size; + entry->mmap_prot = prot; + entry->mmap_flags = flags; + entry->owner = CurrentResourceOwner; } PG_CATCH(); { - if (mmap_addr != MAP_FAILED) - { - if (munmap(mmap_addr, mmap_size) != 0) - elog(WARNING, "failed on munmap(%p, %zu) of '%s': %m", - mmap_addr, mmap_size, fname); - } - if (!shmem_entry && close(fdesc) != 0) - elog(WARNING, "failed on close('%s'): %m", fname); + if (munmap(mmap_addr, mmap_size) != 0) + elog(WARNING, "failed on munmap(%p, %zu): %m", + mmap_addr, mmap_size); PG_RE_THROW(); } PG_END_TRY(); @@ -1787,26 +1708,213 @@ __mmapShmem(uint32_t shmem_handle, return mmap_addr; } -bool -__munmapShmem(void *mmap_addr) +int +__munmapFile(void *mmap_addr) { + mmapEntry *entry; + int rv; + if (mmap_tracker_htab) { - mmapEntry *entry - = hash_search(mmap_tracker_htab, - &mmap_addr, - HASH_REMOVE, - NULL); + entry = hash_search(mmap_tracker_htab, + &mmap_addr, HASH_REMOVE, NULL); if (entry) { - if (munmap(entry->mmap_addr, - entry->mmap_size) != 0) + rv = munmap(entry->mmap_addr, + entry->mmap_size); + if (rv != 0) + { + int errno_saved = errno; + elog(WARNING, "failed on munmap(%p, %zu): %m", entry->mmap_addr, entry->mmap_size); - return true; + errno = errno_saved; + } + return rv; } } - elog(ERROR, "it looks addr=%p not memory-mapped", mmap_addr); - return false; + /* mmapEntry not found */ + errno = EINVAL; + return -1; } + +void * +__mremapFile(void *mmap_addr, size_t new_size) +{ + mmapEntry *entry = NULL; + void *addr; + + if (mmap_tracker_htab) + { + entry = hash_search(mmap_tracker_htab, + &mmap_addr, HASH_FIND, NULL); + } + if (!entry) + { + errno = EINVAL; + return MAP_FAILED; + } + /* nothing to do */ + if (new_size <= entry->mmap_size) + return entry->mmap_addr; + addr = mremap(entry->mmap_addr, + entry->mmap_size, + new_size, + MREMAP_MAYMOVE); + if (addr == MAP_FAILED) + return MAP_FAILED; + + entry->mmap_addr = addr; + entry->mmap_size = new_size; + return addr; +} + +/* + * dummy entry for deprecated functions + */ +static void +__pg_deprecated_function(PG_FUNCTION_ARGS, const char *cfunc_name) +{ + FmgrInfo *flinfo = fcinfo->flinfo; + + if (OidIsValid(flinfo->fn_oid)) + elog(ERROR, "'%s' on behalf of %s is already deprecated", + cfunc_name, format_procedure(flinfo->fn_oid)); + elog(ERROR, "'%s' is already deprecated", cfunc_name); +} + +#define PG_DEPRECATED_FUNCTION(cfunc_name) \ + Datum cfunc_name(PG_FUNCTION_ARGS); \ + Datum cfunc_name(PG_FUNCTION_ARGS) \ + { \ + __pg_deprecated_function(fcinfo, __FUNCTION__); \ + PG_RETURN_NULL(); \ + } \ + PG_FUNCTION_INFO_V1(cfunc_name) + +/* deprecated functions */ +/* + * SQL functions for GPU attributes (deprecated) + */ +PG_DEPRECATED_FUNCTION(pgstrom_gpu_device_name); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_global_memsize); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_max_blocksize); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_warp_size); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_max_shared_memory_perblock); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_num_registers_perblock); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_num_multiptocessors); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_num_cuda_cores); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_cc_major); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_cc_minor); +PG_DEPRECATED_FUNCTION(pgstrom_gpu_pci_id); + 
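+/*
+ * For example, PG_DEPRECATED_FUNCTION(pgstrom_gpu_device_name) above
+ * expands to the stub below; the C symbol remains resolvable for any
+ * legacy SQL definitions, but every call fails cleanly with ERROR:
+ *
+ *   Datum pgstrom_gpu_device_name(PG_FUNCTION_ARGS);
+ *   Datum pgstrom_gpu_device_name(PG_FUNCTION_ARGS)
+ *   {
+ *       __pg_deprecated_function(fcinfo, __FUNCTION__);
+ *       PG_RETURN_NULL();
+ *   }
+ *   PG_FUNCTION_INFO_V1(pgstrom_gpu_device_name);
+ */
+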
+/* deadcode/gstore_(fdw|buf).c */ +PG_DEPRECATED_FUNCTION(pgstrom_reggstore_in); +PG_DEPRECATED_FUNCTION(pgstrom_reggstore_out); +PG_DEPRECATED_FUNCTION(pgstrom_reggstore_recv); +PG_DEPRECATED_FUNCTION(pgstrom_reggstore_send); + +PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_chunk_info); +PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_format); +PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_nitems); +PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_nattrs); +PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_rawsize); +PG_DEPRECATED_FUNCTION(pgstrom_gstore_export_ipchandle); + +/* deadcode/largeobject.c */ +PG_DEPRECATED_FUNCTION(pgstrom_lo_import_gpu); +PG_DEPRECATED_FUNCTION(pgstrom_lo_export_gpu); + +/* deadcode/pl_cuda_v2.c */ +PG_DEPRECATED_FUNCTION(plcuda_function_validator); +PG_DEPRECATED_FUNCTION(plcuda_function_handler); +PG_DEPRECATED_FUNCTION(pgsql_table_attr_numbers_by_names); +PG_DEPRECATED_FUNCTION(pgsql_table_attr_number_by_name); +PG_DEPRECATED_FUNCTION(pgsql_table_attr_types_by_names); +PG_DEPRECATED_FUNCTION(pgsql_table_attr_type_by_name); +PG_DEPRECATED_FUNCTION(pgsql_check_attrs_of_types); +PG_DEPRECATED_FUNCTION(pgsql_check_attrs_of_type); +PG_DEPRECATED_FUNCTION(pgsql_check_attr_of_type); + +/* arrow_fdw.c */ +PG_DEPRECATED_FUNCTION(pgstrom_arrow_fdw_export_cupy); +PG_DEPRECATED_FUNCTION(pgstrom_arrow_fdw_export_cupy_pinned); +PG_DEPRECATED_FUNCTION(pgstrom_arrow_fdw_unpin_gpu_buffer); +PG_DEPRECATED_FUNCTION(pgstrom_arrow_fdw_put_gpu_buffer); + +/* deadcode/matrix.c */ +PG_DEPRECATED_FUNCTION(array_matrix_accum); +PG_DEPRECATED_FUNCTION(array_matrix_accum_varbit); +PG_DEPRECATED_FUNCTION(varbit_to_int4_array); +PG_DEPRECATED_FUNCTION(int4_array_to_varbit); +PG_DEPRECATED_FUNCTION(array_matrix_final_bool); +PG_DEPRECATED_FUNCTION(array_matrix_final_int2); +PG_DEPRECATED_FUNCTION(array_matrix_final_int4); +PG_DEPRECATED_FUNCTION(array_matrix_final_int8); +PG_DEPRECATED_FUNCTION(array_matrix_final_float4); +PG_DEPRECATED_FUNCTION(array_matrix_final_float8); +PG_DEPRECATED_FUNCTION(array_matrix_unnest); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_bool); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_int2); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_int4); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_int8); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_float4); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_float8); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_boolt); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_boolb); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int2t); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int2b); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int4t); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int4b); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int8t); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int8b); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_float4t); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_float4b); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_float8t); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_float8b); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_bool); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_int2); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_int4); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_int8); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_float4); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_float8); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_booll); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_boolr); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int2l); 
+PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int2r); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int4l); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int4r); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int8l); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int8r); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_float4l); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_float4r); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_float8l); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_float8r); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_accum); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_bool); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_int2); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_int4); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_int8); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_float4); +PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_float8); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_accum); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_bool); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_int2); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_int4); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_int8); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_float4); +PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_float8); +PG_DEPRECATED_FUNCTION(array_matrix_transpose_bool); +PG_DEPRECATED_FUNCTION(array_matrix_transpose_int2); +PG_DEPRECATED_FUNCTION(array_matrix_transpose_int4); +PG_DEPRECATED_FUNCTION(array_matrix_transpose_int8); +PG_DEPRECATED_FUNCTION(array_matrix_transpose_float4); +PG_DEPRECATED_FUNCTION(array_matrix_transpose_float8); +PG_DEPRECATED_FUNCTION(float4_as_int4); /* duplicated, see float2.c */ +PG_DEPRECATED_FUNCTION(int4_as_float4); /* duplicated, see float2.c */ +PG_DEPRECATED_FUNCTION(float8_as_int8); /* duplicated, see float2.c */ +PG_DEPRECATED_FUNCTION(int8_as_float8); /* duplicated, see float2.c */ +PG_DEPRECATED_FUNCTION(array_matrix_validation); +PG_DEPRECATED_FUNCTION(array_matrix_height); +PG_DEPRECATED_FUNCTION(array_matrix_width); diff --git a/src/nvrtc.c b/old/nvrtc.c similarity index 100% rename from src/nvrtc.c rename to old/nvrtc.c diff --git a/src/pg_compat.h b/old/pg_compat.h similarity index 100% rename from src/pg_compat.h rename to old/pg_compat.h diff --git a/pg_strom.control b/old/pg_strom.control similarity index 100% rename from pg_strom.control rename to old/pg_strom.control diff --git a/old/pg_strom.h b/old/pg_strom.h new file mode 100644 index 000000000..739bb3825 --- /dev/null +++ b/old/pg_strom.h @@ -0,0 +1,1938 @@ +/* + * pg_strom.h + * + * Header file of pg_strom module + * -- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. 
+ */ +#ifndef PG_STROM_H +#define PG_STROM_H + +#include "postgres.h" +#if PG_VERSION_NUM < 110000 +#error Base PostgreSQL version must be v11 or later +#endif +#define PG_MAJOR_VERSION (PG_VERSION_NUM / 100) +#define PG_MINOR_VERSION (PG_VERSION_NUM % 100) + +#include "access/brin.h" +#include "access/brin_revmap.h" +#include "access/generic_xlog.h" +#include "access/gist.h" +#include "access/hash.h" +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#if PG_VERSION_NUM >= 130000 +#include "access/heaptoast.h" +#endif +#include "access/htup_details.h" +#include "access/reloptions.h" +#include "access/relscan.h" +#if PG_VERSION_NUM >= 140000 +#include "access/syncscan.h" +#endif +#include "access/sysattr.h" +#if PG_VERSION_NUM < 130000 +#include "access/tuptoaster.h" +#endif +#include "access/twophase.h" +#include "access/visibilitymap.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "catalog/dependency.h" +#include "catalog/heap.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" +#include "catalog/objectaccess.h" +#include "catalog/objectaddress.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_am.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_attribute.h" +#include "catalog/pg_cast.h" +#include "catalog/pg_class.h" +#include "catalog/pg_database.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_extension.h" +#include "catalog/pg_foreign_data_wrapper.h" +#include "catalog/pg_foreign_server.h" +#include "catalog/pg_foreign_table.h" +#include "catalog/pg_language.h" +#include "catalog/pg_namespace.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_statistic.h" +#include "catalog/pg_tablespace.h" +#include "catalog/pg_trigger.h" +#include "catalog/pg_type.h" +#if PG_VERSION_NUM < 110000 +#include "catalog/pg_type_fn.h" +#else +#include "catalog/pg_type_d.h" +#endif +#include "catalog/pg_user_mapping.h" +#include "commands/dbcommands.h" +#include "commands/defrem.h" +#include "commands/event_trigger.h" +#include "commands/explain.h" +#include "commands/extension.h" +#include "commands/proclang.h" +#include "commands/tablecmds.h" +#include "commands/tablespace.h" +#include "commands/trigger.h" +#include "commands/typecmds.h" +#include "commands/variable.h" +#include "common/base64.h" +#if PG_VERSION_NUM >= 130000 +#include "common/hashfn.h" +#endif +#include "common/int.h" +#include "common/md5.h" +#include "executor/executor.h" +#include "executor/nodeAgg.h" +#include "executor/nodeIndexscan.h" +#include "executor/nodeCustom.h" +#include "executor/nodeSubplan.h" +#include "fmgr.h" +#include "foreign/fdwapi.h" +#include "foreign/foreign.h" +#include "funcapi.h" +#include "lib/ilist.h" +#include "lib/stringinfo.h" +#include "libpq/be-fsstubs.h" +#include "libpq/libpq-fs.h" +#include "libpq/pqformat.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "nodes/extensible.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "nodes/pg_list.h" +#include "nodes/plannodes.h" +#include "nodes/primnodes.h" +#include "nodes/readfuncs.h" +#if PG_VERSION_NUM < 120000 +#include "nodes/relation.h" +#endif +#if PG_VERSION_NUM >= 120000 +#include "nodes/supportnodes.h" +#endif +#if PG_VERSION_NUM >= 120000 +#include "optimizer/appendinfo.h" +#endif +#include "optimizer/clauses.h" +#include "optimizer/cost.h" +#if PG_VERSION_NUM >= 120000 +#include "optimizer/optimizer.h" +#endif +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" 
+#include "optimizer/plancat.h" +#include "optimizer/planmain.h" +#include "optimizer/planner.h" +#include "optimizer/prep.h" +#include "optimizer/restrictinfo.h" +#include "optimizer/tlist.h" +#if PG_VERSION_NUM < 120000 +#include "optimizer/var.h" +#endif +#include "parser/parse_coerce.h" +#include "parser/parsetree.h" +#include "parser/parse_func.h" +#include "parser/parse_oper.h" +#include "parser/scansup.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "postmaster/bgworker.h" +#include "postmaster/postmaster.h" +#include "storage/buf.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/itemptr.h" +#include "storage/fd.h" +#include "storage/large_object.h" +#include "storage/latch.h" +#include "storage/lmgr.h" +#include "storage/lock.h" +#include "storage/pg_shmem.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "utils/array.h" +#include "utils/arrayaccess.h" +#include "utils/builtins.h" +#include "utils/bytea.h" +#include "utils/cash.h" +#include "utils/catcache.h" +#include "utils/date.h" +#include "utils/datetime.h" +#if PG_VERSION_NUM >= 120000 +#include "utils/float.h" +#endif +#include "utils/fmgroids.h" +#include "utils/guc.h" +#include "utils/json.h" +#include "utils/jsonb.h" +#include "utils/inet.h" +#if PG_VERSION_NUM < 150000 +#include "utils/int8.h" +#endif +#include "utils/inval.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/numeric.h" +#include "utils/pg_crc.h" +#include "utils/pg_locale.h" +#include "utils/rangetypes.h" +#include "utils/regproc.h" +#include "utils/rel.h" +#include "utils/resowner.h" +#include "utils/ruleutils.h" +#include "utils/selfuncs.h" +#include "utils/snapmgr.h" +#include "utils/spccache.h" +#include "utils/syscache.h" +#if PG_VERSION_NUM < 120000 +#include "utils/tqual.h" +#endif +#include "utils/typcache.h" +#include "utils/uuid.h" +#include "utils/varbit.h" +#include "utils/varlena.h" + +#define CUDA_API_PER_THREAD_DEFAULT_STREAM 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "heterodb_extra.h" +#include "arrow_defs.h" + +/* + * -------------------------------------------------------------------- + * + * Configuration sections + * + * NOTE: We use the configuration of the host PostgreSQL system, instead + * of our own configure script, so that the prerequisites of the module + * build never mismatch. However, some host configurations would lead to + * unexpected behavior, so we put some checks here to reject them. + * + * -------------------------------------------------------------------- + */ +#if SIZEOF_DATUM != 8 +#error PG-Strom expects 64bit platform +#endif +#if PG_VERSION_NUM < 130000 +/* + * At PG13, 2e4db241bfd3206bad8286f8ffc2db6bbdaefcdf removed the + * '--disable-float4-byval' configure flag, thus float32 is + * always passed by value.
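+ *
+ * In other words, with USE_FLOAT4_BYVAL the Datum itself carries the
+ * float4 bit pattern, so fetching a value never dereferences a pointer;
+ * e.g. the usual accessor just reinterprets the Datum in place:
+ *
+ *   float4  fval = DatumGetFloat4(datum);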
 + */ +#ifndef USE_FLOAT4_BYVAL +#error PG-Strom expects float32 to be passed by value, not by reference +#endif +#endif /* VER < PG13 */ +#ifndef USE_FLOAT8_BYVAL +#error PG-Strom expects float64 to be passed by value, not by reference +#endif +#ifndef HAVE_INT64_TIMESTAMP +#error PG-Strom expects timestamps in the 64bit integer format +#endif +#include "cuda_common.h" +#include "pg_compat.h" + +#define RESTRACK_HASHSIZE 53 +typedef struct GpuContext +{ + dlist_node chain; + pg_atomic_uint32 refcnt; + ResourceOwner resowner; + /* cuda resources per GpuContext */ + cl_int cuda_dindex; + CUdevice cuda_device; + CUcontext cuda_context; + /* resource management */ + slock_t restrack_lock; + dlist_head restrack[RESTRACK_HASHSIZE]; + /* GPU device memory management */ + pthread_rwlock_t gm_rwlock; + dlist_head gm_normal_list; /* list of device memory segments */ + dlist_head gm_iomap_list; /* list of I/O map memory segments */ + dlist_head gm_managed_list; /* list of managed memory segments */ + dlist_head gm_hostmem_list; /* list of Host memory segments */ + /* error information buffer */ + pg_atomic_uint32 error_level; + int error_code; + const char *error_filename; + int error_lineno; + const char *error_funcname; + char error_message[200]; + /* debug counter */ + pg_atomic_uint64 debug_count1; + pg_atomic_uint64 debug_count2; + pg_atomic_uint64 debug_count3; + pg_atomic_uint64 debug_count4; + /* management of the work-queue */ + bool worker_is_running; + pthread_mutex_t worker_mutex; + pthread_cond_t worker_cond; + pg_atomic_uint32 terminate_workers; + dlist_head pending_tasks; /* list of GpuTask */ + cl_int num_workers; + pg_atomic_uint32 worker_index; + pthread_t worker_threads[FLEXIBLE_ARRAY_MEMBER]; +} GpuContext; + +/* Identifier of the GPU programs */ +typedef cl_long ProgramId; +#define INVALID_PROGRAM_ID (-1L) + +/* + * GpuTask and related + */ +typedef enum { + GpuTaskKind_GpuScan, + GpuTaskKind_GpuJoin, + GpuTaskKind_GpuPreAgg, + GpuTaskKind_GpuSort, + GpuTaskKind_PL_CUDA, +} GpuTaskKind; + +typedef struct GpuTask GpuTask; +typedef struct GpuTaskState GpuTaskState; +typedef struct GpuTaskSharedState GpuTaskSharedState; +typedef struct ArrowFdwState ArrowFdwState; +typedef struct GpuCacheState GpuCacheState; + +/* + * GpuTaskState + * + * A common structure of the state machine of GPU-related tasks.
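+ *
+ * As a rough sketch, a custom-scan provider wires up the callbacks
+ * declared below like this (the my_* names are hypothetical and for
+ * illustration only):
+ *
+ *   gts->cb_next_task    = my_next_task;     -- produce the next GpuTask
+ *   gts->cb_process_task = my_process_task;  -- launch the device kernel
+ *   gts->cb_next_tuple   = my_next_tuple;    -- fetch rows from the task
+ *   gts->cb_release_task = my_release_task;  -- release task resources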
 + */ +struct NVMEScanState; +struct GpuTaskSharedState; + +struct GpuTaskState +{ + CustomScanState css; + GpuContext *gcontext; + GpuTaskKind task_kind; /* one of GpuTaskKind_* */ + ProgramId program_id; /* CUDA Program (to be acquired) */ + CUmodule cuda_module; /* CUDA binary module */ + CUdeviceptr kern_params; /* Const/Param buffer */ + List *used_params; /* Const/Param expressions */ + const Bitmapset *optimal_gpus; /* GPU preference at plan time */ + bool scan_done; /* True, if no more rows to read */ + + /* fields for outer scan */ + Cost outer_startup_cost; /* copy from the outer path node */ + Cost outer_total_cost; /* copy from the outer path node */ + double outer_plan_rows; /* copy from the outer path node */ + int outer_plan_width; /* copy from the outer path node */ + cl_uint outer_nrows_per_block; + Bitmapset *outer_refs; /* referenced outer attributes */ + Instrumentation outer_instrument; /* runtime statistics, if any */ + TupleTableSlot *scan_overflow; /* temporary buffer, if no space on PDS */ + /* BRIN index support on outer relation, if any */ + struct pgstromIndexState *outer_index_state; + Bitmapset *outer_index_map; + + IndexScanDesc outer_brin_index; /* brin index of outer scan, if any */ + long outer_brin_count; /* # of blocks skipped by index */ + + ArrowFdwState *af_state; /* for GpuTask on Arrow_Fdw */ + GpuCacheState *gc_state; /* for GpuTask on GpuCache */ + + /* + * A state object for NVMe-Strom. If not NULL, the GTS prefers the BLOCK + * format as its source data store, and SSD2GPU Direct SQL Execution will + * be kicked. + */ + struct NVMEScanState *nvme_sstate; + long nvme_count; /* # of blocks loaded by SSD2GPU */ + + /* + * fields to fetch rows from the current task + * + * NOTE: @curr_index is sufficient to point to a particular row of the KDS + * if its format is ROW, HASH or SLOT. However, the BLOCK format has no + * direct pointer to each row; it contains @nitems blocks, and each block + * contains an uncertain number of rows. So, for the BLOCK format, + * @curr_index is the index of the current block, and @curr_lp_index is + * the index of the current line pointer within it. + * For all formats, @curr_index == @nitems means there are no more rows.
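+ *
+ * A concrete example (illustrative numbers only): for a BLOCK chunk with
+ * @nitems = 2, the fetch loop walks (curr_index=0, curr_lp_index=0..N0-1),
+ * then (curr_index=1, curr_lp_index=0..N1-1), where Nk is the number of
+ * line pointers in the k-th block; once curr_index reaches 2, the chunk
+ * is exhausted.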
 + */ + cl_long curr_index; /* current position on the curr_task */ + cl_long curr_lp_index; /* index of LinePointer in a block */ + HeapTupleData curr_tuple; /* internal use of PDS_fetch() */ + struct GpuTask *curr_task; /* a GpuTask currently processed */ + + /* callbacks used by gputasks.c */ + GpuTask *(*cb_next_task)(GpuTaskState *gts); + GpuTask *(*cb_terminator_task)(GpuTaskState *gts, + cl_bool *task_is_ready); + void (*cb_switch_task)(GpuTaskState *gts, GpuTask *gtask); + TupleTableSlot *(*cb_next_tuple)(GpuTaskState *gts); + int (*cb_process_task)(GpuTask *gtask, + CUmodule cuda_module); + void (*cb_release_task)(GpuTask *gtask); + /* list of GpuTasks (protected with GpuContext->mutex) */ + dlist_head ready_tasks; /* list of tasks already processed */ + cl_uint num_running_tasks; /* # of running tasks */ + cl_uint num_ready_tasks; /* # of ready tasks */ + + /* misc fields */ + cl_long num_cpu_fallbacks; /* # of CPU fallback chunks */ + uint64 debug_counter0; + uint64 debug_counter1; + uint64 debug_counter2; + uint64 debug_counter3; + + /* co-operation with CPU parallel */ + GpuTaskSharedState *gtss; /* DSM segment of GTS if any */ + ParallelContext *pcxt; /* Parallel context of PostgreSQL */ +}; + +/* + * GpuTaskSharedState + */ +struct GpuTaskSharedState +{ + /* for arrow_fdw file scan */ + pg_atomic_uint32 af_rbatch_index; + pg_atomic_uint32 af_rbatch_nload; /* # of loaded record-batches */ + pg_atomic_uint32 af_rbatch_nskip; /* # of skipped record-batches */ + /* for gpu_cache file scan */ + pg_atomic_uint32 gc_fetch_count; + /* for block-based regular table scan */ + BlockNumber pbs_nblocks; /* # blocks in relation at start of scan */ + slock_t pbs_mutex; /* lock of the fields below */ + BlockNumber pbs_startblock; /* starting block number */ + BlockNumber pbs_nallocated; /* # of blocks allocated to workers */ + + /* common parallel table scan descriptor */ + ParallelTableScanDescData phscan; +}; + +/* + * GpuTaskRuntimeStat - common statistics + */ +typedef struct +{ + slock_t lock; + Instrumentation outer_instrument; + pg_atomic_uint64 source_nitems; + pg_atomic_uint64 nitems_filtered; + pg_atomic_uint64 nvme_count; + pg_atomic_uint64 brin_count; + pg_atomic_uint64 fallback_count; + /* debug counter */ + pg_atomic_uint64 debug_counter0; + pg_atomic_uint64 debug_counter1; + pg_atomic_uint64 debug_counter2; + pg_atomic_uint64 debug_counter3; +} GpuTaskRuntimeStat; + +static inline void +mergeGpuTaskRuntimeStatParallelWorker(GpuTaskState *gts, + GpuTaskRuntimeStat *gt_rtstat) +{ + Assert(IsParallelWorker()); + if (!gt_rtstat) + return; + SpinLockAcquire(&gt_rtstat->lock); + InstrAggNode(&gt_rtstat->outer_instrument, + &gts->outer_instrument); + SpinLockRelease(&gt_rtstat->lock); + pg_atomic_add_fetch_u64(&gt_rtstat->nvme_count, gts->nvme_count); + pg_atomic_add_fetch_u64(&gt_rtstat->brin_count, gts->outer_brin_count); + pg_atomic_add_fetch_u64(&gt_rtstat->fallback_count, + gts->num_cpu_fallbacks); + /* debug counter */ + if (gts->debug_counter0 != 0) + pg_atomic_add_fetch_u64(&gt_rtstat->debug_counter0, gts->debug_counter0); + if (gts->debug_counter1 != 0) + pg_atomic_add_fetch_u64(&gt_rtstat->debug_counter1, gts->debug_counter1); + if (gts->debug_counter2 != 0) + pg_atomic_add_fetch_u64(&gt_rtstat->debug_counter2, gts->debug_counter2); + if (gts->debug_counter3 != 0) + pg_atomic_add_fetch_u64(&gt_rtstat->debug_counter3, gts->debug_counter3); +} + +static inline void +mergeGpuTaskRuntimeStat(GpuTaskState *gts, + GpuTaskRuntimeStat *gt_rtstat) +{ + InstrAggNode(&gts->outer_instrument, + &gt_rtstat->outer_instrument); + gts->outer_instrument.tuplecount = (double) + pg_atomic_read_u64(&gt_rtstat->source_nitems); + gts->outer_instrument.nfiltered1 = (double) + pg_atomic_read_u64(&gt_rtstat->nitems_filtered); + gts->nvme_count += pg_atomic_read_u64(&gt_rtstat->nvme_count); + gts->outer_brin_count += pg_atomic_read_u64(&gt_rtstat->brin_count); + gts->num_cpu_fallbacks += pg_atomic_read_u64(&gt_rtstat->fallback_count); + + gts->debug_counter0 += pg_atomic_read_u64(&gt_rtstat->debug_counter0); + gts->debug_counter1 += pg_atomic_read_u64(&gt_rtstat->debug_counter1); + gts->debug_counter2 += pg_atomic_read_u64(&gt_rtstat->debug_counter2); + gts->debug_counter3 += pg_atomic_read_u64(&gt_rtstat->debug_counter3); + + if (gts->css.ss.ps.instrument) + memcpy(&gts->css.ss.ps.instrument->bufusage, + &gts->outer_instrument.bufusage, + sizeof(BufferUsage)); +} + +/* + * GpuTask + * + * It is a unit of task to be sent to the GPU server. Thus, this object must + * be allocated on the DMA buffer area. + */ +struct GpuTask +{ + kern_errorbuf kerror; /* error status of the task */ + dlist_node chain; /* link to the task state list */ + GpuTaskKind task_kind; /* same with GTS's one */ + ProgramId program_id; /* same with GTS's one */ + GpuTaskState *gts; /* GTS reference in the backend */ + bool cpu_fallback; /* true, if task needs CPU fallback */ +}; + +/* + * State structure of NVMe-Strom per GpuTaskState + */ +typedef struct NVMEScanState +{ + cl_uint nrows_per_block; + cl_uint nblocks_per_chunk; + BlockNumber curr_segno; + Buffer curr_vmbuffer; + BlockNumber nr_segs; + GPUDirectFileDesc files[FLEXIBLE_ARRAY_MEMBER]; +} NVMEScanState; + +/* + * pgstrom_data_store - a data structure with various formats to exchange + * data chunks between the host and the CUDA server. + */ +typedef struct pgstrom_data_store +{ + /* GpuContext which owns this data store */ + GpuContext *gcontext; + + /* reference counter */ + pg_atomic_uint32 refcnt; + + /* + * NOTE: Extra information for KDS_FORMAT_BLOCK. + * @nblocks_uncached is the number of PostgreSQL blocks to be processed + * by NVMe-Strom. If @nblocks_uncached > 0, the tail of PDS shall be + * filled up by an array of strom_dma_chunk. + * @filedesc is the file-descriptor of the underlying blocks. + * + * NOTE: Extra information for KDS_FORMAT_ARROW + * @iovec introduces triplets of destination offset, file offset and + * chunk length to be read (usually by SSD-to-GPU Direct SQL). + * If NULL, the KDS is loaded beforehand by the CPU via the filesystem, + * and the PDS is allocated on the managed memory area, so workers don't + * need to kick DMA operations explicitly. + * + * NOTE: Extra information for KDS_FORMAT_COLUMN + * @gc_sstate points to the GpuCacheShareState to reference the IPC handle + * of the main/extra buffer on the device. This IPC handle is only + * valid under the read lock.
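+ *
+ * For instance (illustrative values only): an @iovec with two triplets,
+ * (dst=0, off=8192, len=1MB) and (dst=1MB, off=4MB, len=512kB), lets the
+ * worker issue exactly two SSD-to-GPU read requests, one per triplet,
+ * instead of staging those pages through the host buffer cache.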
+ */ + cl_uint nblocks_uncached; /* for KDS_FORMAT_BLOCK */ + GPUDirectFileDesc filedesc; + strom_io_vector *iovec; /* for KDS_FORMAT_ARROW */ + /* for KDS_FORMAT_COLUMN */ + void *gc_sstate; + CUdeviceptr m_kds_main; + CUdeviceptr m_kds_extra; + /* data chunk in kernel portion */ + kern_data_store kds __attribute__ ((aligned (STROMALIGN_LEN))); +} pgstrom_data_store; + +/* -------------------------------------------------------------------- + * + * PG-Strom GUC variables + * + * -------------------------------------------------------------------- */ +extern bool pgstrom_enabled; +extern bool pgstrom_bulkexec_enabled; +extern bool pgstrom_cpu_fallback_enabled; +extern bool pgstrom_regression_test_mode; +extern int pgstrom_max_async_tasks; +extern double pgstrom_gpu_setup_cost; +extern double pgstrom_gpu_dma_cost; +extern double pgstrom_gpu_operator_cost; +extern Size pgstrom_chunk_size(void); +extern long PAGE_SIZE; +extern long PAGE_MASK; +extern int PAGE_SHIFT; +extern long PHYS_PAGES; +#define PAGE_ALIGN(sz) TYPEALIGN(PAGE_SIZE,(sz)) + +/* -------------------------------------------------------------------- + * + * Function Declarations + * + * -------------------------------------------------------------------- */ + +/* + * gpu_device.c + */ +typedef struct DevAttributes +{ + cl_int NUMA_NODE_ID; + cl_int DEV_ID; + char DEV_NAME[256]; + char DEV_BRAND[16]; + char DEV_UUID[48]; + size_t DEV_TOTAL_MEMSZ; + size_t DEV_BAR1_MEMSZ; + bool DEV_SUPPORT_GPUDIRECTSQL; +#define DEV_ATTR(LABEL,a,b,c) \ + cl_int LABEL; +#include "device_attrs.h" +#undef DEV_ATTR +} DevAttributes; + +extern DevAttributes *devAttrs; +extern cl_int numDevAttrs; +extern cl_uint devBaselineMaxThreadsPerBlock; +#define cpu_only_mode() (numDevAttrs == 0) +extern void pgstrom_init_gpu_device(void); + +#define GPUKERNEL_MAX_SM_MULTIPLICITY 4 + +extern CUresult gpuOccupancyMaxPotentialBlockSize(int *p_min_grid_sz, + int *p_max_block_sz, + CUfunction kern_function, + size_t dyn_shmem_per_block, + size_t dyn_shmem_per_thread); +extern CUresult gpuOptimalBlockSize(int *p_grid_sz, + int *p_block_sz, + CUfunction kern_function, + CUdevice cuda_device, + size_t dyn_shmem_per_block, + size_t dyn_shmem_per_thread); +extern CUresult __gpuOptimalBlockSize(int *p_grid_sz, + int *p_block_sz, + CUfunction kern_function, + int cuda_dindex, + size_t dyn_shmem_per_block, + size_t dyn_shmem_per_thread); +/* + * shmbuf.c + */ +extern void *shmbufAlloc(size_t sz); +extern void *shmbufAllocZero(size_t sz); +extern void shmbufFree(void *addr); +extern void pgstrom_init_shmbuf(void); +extern MemoryContext TopSharedMemoryContext; + +/* + * gpu_mmgr.c + */ +extern CUresult __gpuMemAllocRaw(GpuContext *gcontext, + CUdeviceptr *p_devptr, + size_t bytesize, + const char *filename, int lineno); +extern CUresult __gpuMemAllocManagedRaw(GpuContext *gcontext, + CUdeviceptr *p_devptr, + size_t bytesize, + int flags, + const char *filename, int lineno); +extern CUresult __gpuMemAllocHostRaw(GpuContext *gcontext, + void **p_hostptr, + size_t bytesize, + const char *filename, int lineno); +extern CUresult __gpuMemAllocDev(GpuContext *gcontext, + CUdeviceptr *p_deviceptr, + size_t bytesize, + CUipcMemHandle *p_mhandle, + const char *filename, int lineno); +extern CUresult __gpuMemAlloc(GpuContext *gcontext, + CUdeviceptr *p_devptr, + size_t bytesize, + const char *filename, int lineno); +extern CUresult __gpuMemAllocManaged(GpuContext *gcontext, + CUdeviceptr *p_devptr, + size_t bytesize, + int flags, + const char *filename, int lineno); +extern CUresult 
__gpuMemAllocIOMap(GpuContext *gcontext, + CUdeviceptr *p_devptr, + size_t bytesize, + const char *filename, int lineno); +extern size_t gpuMemAllocIOMapMaxLength(void); +extern CUresult __gpuMemAllocHost(GpuContext *gcontext, + void **p_hostptr, + size_t bytesize, + const char *filename, int lineno); +extern CUresult __gpuMemAllocPreserved(cl_int cuda_dindex, + CUipcMemHandle *ipc_mhandle, + ssize_t bytesize, + const char *filename, int lineno); +extern CUresult __gpuIpcOpenMemHandle(GpuContext *gcontext, + CUdeviceptr *p_deviceptr, + CUipcMemHandle m_handle, + unsigned int flags, + const char *filename, int lineno); +extern CUresult gpuMemFree(GpuContext *gcontext, + CUdeviceptr devptr); +extern CUresult gpuMemFreeHost(GpuContext *gcontext, + void *hostptr); +extern CUresult gpuMemFreePreserved(cl_int cuda_dindex, + CUipcMemHandle m_handle); +extern CUresult gpuIpcCloseMemHandle(GpuContext *gcontext, + CUdeviceptr m_deviceptr); + +#define gpuMemAllocRaw(a,b,c) \ + __gpuMemAllocRaw((a),(b),(c),__FILE__,__LINE__) +#define gpuMemAllocManagedRaw(a,b,c,d) \ + __gpuMemAllocManagedRaw((a),(b),(c),(d),__FILE__,__LINE__) +#define gpuMemAllocHostRaw(a,b,c) \ + __gpuMemAllocHostRaw((a),(b),(c),__FILE__,__LINE__) +#define gpuMemAllocDev(a,b,c,d) \ + __gpuMemAllocDev((a),(b),(c),(d),__FILE__,__LINE__) +#define gpuMemAlloc(a,b,c) \ + __gpuMemAlloc((a),(b),(c),__FILE__,__LINE__) +#define gpuMemAllocManaged(a,b,c,d) \ + __gpuMemAllocManaged((a),(b),(c),(d),__FILE__,__LINE__) +#define gpuMemAllocIOMap(a,b,c) \ + __gpuMemAllocIOMap((a),(b),(c),__FILE__,__LINE__) +#define gpuMemAllocHost(a,b,c) \ + __gpuMemAllocHost((a),(b),(c),__FILE__,__LINE__) +#define gpuMemAllocPreserved(a,b,c) \ + __gpuMemAllocPreserved((a),(b),(c),__FILE__,__LINE__) +#define gpuIpcOpenMemHandle(a,b,c,d) \ + __gpuIpcOpenMemHandle((a),(b),(c),(d),__FILE__,__LINE__) + +extern void gpuMemReclaimSegment(GpuContext *gcontext); + +extern void gpuMemCopyFromSSD(CUdeviceptr m_kds, pgstrom_data_store *pds); + +extern void pgstrom_gpu_mmgr_init_gpucontext(GpuContext *gcontext); +extern void pgstrom_gpu_mmgr_cleanup_gpucontext(GpuContext *gcontext); +extern void pgstrom_init_gpu_mmgr(void); + +/* + * gpu_context.c + */ +extern int pgstrom_max_async_tasks; /* GUC */ +extern __thread GpuContext *GpuWorkerCurrentContext; +extern __thread sigjmp_buf *GpuWorkerExceptionStack; +extern __thread int GpuWorkerIndex; +#define CU_CONTEXT_PER_THREAD \ + (GpuWorkerCurrentContext->cuda_context) +#define CU_DEVICE_PER_THREAD \ + (GpuWorkerCurrentContext->cuda_device) +#define CU_DINDEX_PER_THREAD \ + (GpuWorkerCurrentContext->cuda_dindex) + +extern __thread CUevent CU_EVENT_PER_THREAD; + +extern void GpuContextWorkerReportError(int elevel, + int errcode, + const char *__filename, int lineno, + const char *funcname, + const char *fmt, ...) + pg_attribute_printf(6,7); + +static inline void +CHECK_FOR_GPUCONTEXT(GpuContext *gcontext) +{ + uint32 error_level = pg_atomic_read_u32(&gcontext->error_level); + /* + * NOTE: The least bit of the error_level is a flag to indicate + * whether the error information is ready or not. + */ + if (error_level >= 2 * ERROR) + { + while ((error_level & 1) != 0) + { + pg_usleep(1000L); + error_level = pg_atomic_read_u32(&gcontext->error_level); + } + ereport(error_level / 2, + (errcode(gcontext->error_code), + errmsg("%s", gcontext->error_message), + (pgstrom_regression_test_mode ? 
0 : + errdetail("GPU kernel location: %s:%d [%s]", + gcontext->error_filename, + gcontext->error_lineno, + gcontext->error_funcname)))); + } + CHECK_FOR_INTERRUPTS(); +} +extern CUresult gpuInit(unsigned int flags); +extern GpuContext *AllocGpuContext(const Bitmapset *optimal_gpus, + bool activate_context, + bool activate_workers); +extern void ActivateGpuContext(GpuContext *gcontext); +extern void ActivateGpuContextNoWorkers(GpuContext *gcontext); +extern GpuContext *GetGpuContext(GpuContext *gcontext); +extern void PutGpuContext(GpuContext *gcontext); +extern void SynchronizeGpuContext(GpuContext *gcontext); +extern void SynchronizeGpuContextOnDSMDetach(dsm_segment *seg, Datum arg); + +#define GPUMEM_DEVICE_RAW_EXTRA ((void *)(~0L)) +#define GPUMEM_HOST_RAW_EXTRA ((void *)(~1L)) + +extern bool trackCudaProgram(GpuContext *gcontext, ProgramId program_id, + const char *filename, int lineno); +extern void untrackCudaProgram(GpuContext *gcontext, ProgramId program_id); +extern bool trackGpuMem(GpuContext *gcontext, CUdeviceptr devptr, void *extra, + const char *filename, int lineno); +extern void *lookupGpuMem(GpuContext *gcontext, CUdeviceptr devptr); +extern void *untrackGpuMem(GpuContext *gcontext, CUdeviceptr devptr); +extern bool trackGpuMemIPC(GpuContext *gcontext, + CUdeviceptr devptr, void *extra, + const char *filename, int lineno); +extern void *untrackGpuMemIPC(GpuContext *gcontext, CUdeviceptr devptr); +extern bool trackRawFileDesc(GpuContext *gcontext, GPUDirectFileDesc *fdesc, + const char *filename, int lineno); +extern void untrackRawFileDesc(GpuContext *gcontext, GPUDirectFileDesc *fdesc); +extern CUmodule __GpuContextLookupModule(GpuContext *gcontext, + ProgramId program_id, + const char *filename, int lineno); +#define GpuContextLookupModule(a,b) \ + __GpuContextLookupModule((a),(b),__FILE__,__LINE__) + +extern void pgstrom_init_gpu_context(void); + +/* + * Exception handling for work-queue of GpuContext + */ +#define STROM_TRY() \ + do { \ + sigjmp_buf *saved_exception_stack = GpuWorkerExceptionStack; \ + sigjmp_buf local_sigjmp_buf; \ + Assert(GpuWorkerCurrentContext != NULL); \ + if (sigsetjmp(local_sigjmp_buf, 0) == 0) \ + { \ + GpuWorkerExceptionStack = &local_sigjmp_buf; + +#define STROM_CATCH() \ + } \ + else \ + { \ + GpuWorkerExceptionStack = saved_exception_stack + +#define STROM_END_TRY() \ + } \ + GpuWorkerExceptionStack = saved_exception_stack; \ + } while(0) + +#define STROM_RE_THROW() \ + siglongjmp(*GpuWorkerExceptionStack, 1) + +#define STROM_REPORT_ERROR(elevel,elabel,fmt,...) \ + do { \ + if (!GpuWorkerCurrentContext) \ + elog((elevel), fmt, ##__VA_ARGS__); \ + else if ((elevel) < ERROR) \ + { \ + if ((elevel) >= log_min_messages) \ + fprintf(stderr, "%s: " fmt " (%s:%d)\n", \ + (elabel), ##__VA_ARGS__, \ + __FILE__, __LINE__); \ + } \ + else \ + { \ + GpuContextWorkerReportError((elevel), \ + ERRCODE_INTERNAL_ERROR, \ + __FILE__, __LINE__, \ + PG_FUNCNAME_MACRO, \ + fmt, ##__VA_ARGS__); \ + pg_unreachable(); \ + } \ + } while(0) + +#define wlog(fmt,...) \ + STROM_REPORT_ERROR(LOG,"Log",fmt,##__VA_ARGS__) +#define wnotice(fmt,...) \ + STROM_REPORT_ERROR(NOTICE,"Notice",fmt,##__VA_ARGS__) +#define werror(fmt,...) \ + STROM_REPORT_ERROR(ERROR,"Error",fmt,##__VA_ARGS__) +#define wfatal(fmt,...) \ + STROM_REPORT_ERROR(FATAL,"Fatal",fmt,##__VA_ARGS__) +#define wpanic(fmt,...) 
\ + STROM_REPORT_ERROR(PANIC,"Panic",fmt,##__VA_ARGS__) + +static inline void +CHECK_WORKER_TERMINATION(void) +{ + if (pg_atomic_read_u32(&GpuWorkerCurrentContext->terminate_workers)) + werror("GpuContext worker termination"); +} + +#define GPUCONTEXT_PUSH(gcontext) \ + do { \ + CUresult ____rc; \ + \ + ____rc = cuCtxPushCurrent((gcontext)->cuda_context); \ + if (____rc != CUDA_SUCCESS) \ + wfatal("failed on cuCtxPushCurrent: %s", errorText(____rc)) + +#define GPUCONTEXT_POP(gcontext) \ + ____rc = cuCtxPopCurrent(NULL); \ + if (____rc != CUDA_SUCCESS) \ + wfatal("failed on cuCtxPopCurrent: %s", errorText(____rc)); \ + } while(0) + +/* + * gpu_tasks.c + */ +extern CUdeviceptr pgstromSetupKernParambuf(GpuTaskState *gts); +extern void pgstromInitGpuTaskState(GpuTaskState *gts, + GpuContext *gcontext, + GpuTaskKind task_kind, + List *outer_quals, + List *outer_refs, + List *used_params, + const Bitmapset *optimal_gpus, + cl_uint outer_nrows_per_block, + cl_int eflags); +extern TupleTableSlot *pgstromExecGpuTaskState(GpuTaskState *gts); +extern void pgstromRescanGpuTaskState(GpuTaskState *gts); +extern void pgstromReleaseGpuTaskState(GpuTaskState *gts, + GpuTaskRuntimeStat *gt_rtstat); +extern void pgstromExplainGpuTaskState(GpuTaskState *gts, + ExplainState *es, + List *dcontext); +extern Size pgstromEstimateDSMGpuTaskState(GpuTaskState *gts, + ParallelContext *pcxt); +extern void pgstromInitDSMGpuTaskState(GpuTaskState *gts, + ParallelContext *pcxt, + void *coordinate); +extern void pgstromInitWorkerGpuTaskState(GpuTaskState *gts, + void *coordinate); +extern void pgstromReInitializeDSMGpuTaskState(GpuTaskState *gts); +extern void pgstromShutdownDSMGpuTaskState(GpuTaskState *gts); + +extern void pgstromInitGpuTask(GpuTaskState *gts, GpuTask *gtask); +extern void pgstrom_init_gputasks(void); + +/* + * cuda_program.c + */ +extern ProgramId __pgstrom_create_cuda_program(GpuContext *gcontext, + cl_uint extra_flags, + cl_uint varlena_bufsz, + const char *kern_source, + const char *kern_define, + bool wait_for_build, + bool explain_only, + const char *filename, + int lineno); +#define pgstrom_create_cuda_program(a,b,c,d,e,f,g) \ + __pgstrom_create_cuda_program((a),(b),(c),(d),(e),(f),(g), \ + __FILE__,__LINE__) +extern CUmodule pgstrom_load_cuda_program(ProgramId program_id); +extern void pgstrom_put_cuda_program(GpuContext *gcontext, + ProgramId program_id); +extern void pgstrom_build_session_info(StringInfo str, + GpuTaskState *gts, + cl_uint extra_flags); + +extern char *pgstrom_cuda_source_string(ProgramId program_id); +extern const char *pgstrom_cuda_source_file(ProgramId program_id); +extern const char *pgstrom_cuda_binary_file(ProgramId program_id); +extern void pgstrom_init_cuda_program(void); + +/* + * codegen.c + */ +#include "cuda_codegen.h" + +typedef struct codegen_context { + StringInfoData decl; /* declarations of functions for complex expressions */ + int decl_count; /* # of temporary variables in decl */ + PlannerInfo *root; //not necessary?
+ RelOptInfo *baserel; /* scope of Var-node, if any */ + List *used_params;/* list of Const/Param in use */ + List *used_vars; /* list of Var in use */ + List *pseudo_tlist; /* pseudo tlist expression, if any */ + uint32_t extra_flags; /* external libraries to be included */ + uint32_t extra_bufsz; /* required size of temporary varlena buffer */ + int devcost; /* relative device cost */ +} codegen_context; + +extern size_t pgstrom_codegen_extra_devtypes(char *buf, size_t bufsz, + uint32 extra_flags); +extern devtype_info *pgstrom_devtype_lookup(Oid type_oid); +extern devtype_info *pgstrom_devtype_lookup_and_track(Oid type_oid, + codegen_context *context); +extern devfunc_info *pgstrom_devfunc_lookup(Oid func_oid, + Oid func_rettype, + List *func_args, + Oid func_collid); +extern devfunc_info *pgstrom_devfunc_lookup_type_equal(devtype_info *dtype, + Oid type_collid); +extern devfunc_info *pgstrom_devfunc_lookup_type_compare(devtype_info *dtype, + Oid type_collid); +extern void pgstrom_devfunc_track(codegen_context *context, + devfunc_info *dfunc); +extern devcast_info *pgstrom_devcast_lookup(Oid src_type_oid, + Oid dst_type_oid); +extern bool pgstrom_devtype_can_relabel(Oid src_type_oid, + Oid dst_type_oid); +extern devindex_info *pgstrom_devindex_lookup(Oid opcode, + Oid opfamily); +extern char *pgstrom_codegen_expression(Node *expr, codegen_context *context); +extern void pgstrom_union_type_declarations(StringInfo buf, + const char *name, + List *type_oid_list); +extern bool __pgstrom_device_expression(PlannerInfo *root, + RelOptInfo *baserel, + Expr *expr, + int *p_devcost, + int *p_extra_sz, + const char *filename, int lineno); +#define pgstrom_device_expression(a,b,c) \ + __pgstrom_device_expression((a),(b),(c),NULL,NULL, \ + __FILE__,__LINE__) +#define pgstrom_device_expression_devcost(a,b,c,d) \ + __pgstrom_device_expression((a),(b),(c),(d),NULL, \ + __FILE__,__LINE__) +#define pgstrom_device_expression_extrasz(a,b,c,d) \ + __pgstrom_device_expression((a),(b),(c),NULL,(d), \ + __FILE__,__LINE__) + +extern void pgstrom_init_codegen_context(codegen_context *context, + PlannerInfo *root, + RelOptInfo *baserel); +extern void pgstrom_init_codegen(void); + +/* + * datastore.c + */ +#define pgstrom_chunk_size() ((Size)(65534UL << 10)) /* almost 64MB */ + +extern cl_uint estimate_num_chunks(Path *pathnode); +extern bool KDS_fetch_tuple_row(TupleTableSlot *slot, + kern_data_store *kds, + HeapTuple tuple_buf, + size_t row_index); +extern bool KDS_fetch_tuple_slot(TupleTableSlot *slot, + kern_data_store *kds, + size_t row_index); +extern bool PDS_fetch_tuple(TupleTableSlot *slot, + pgstrom_data_store *pds, + GpuTaskState *gts); +extern kern_data_store *__KDS_clone(GpuContext *gcontext, + kern_data_store *kds, + const char *filename, int lineno); +extern pgstrom_data_store *__PDS_clone(pgstrom_data_store *pds, + const char *filename, int lineno); +extern pgstrom_data_store *PDS_retain(pgstrom_data_store *pds); +extern void PDS_release(pgstrom_data_store *pds); + +extern size_t KDS_calculateHeadSize(TupleDesc tupdesc); +extern bool KDS_schemaIsCompatible(TupleDesc tupdesc, + kern_data_store *kds); +extern void init_kernel_data_store(kern_data_store *kds, + TupleDesc tupdesc, + Size length, + int format, + uint nrooms); + +extern pgstrom_data_store *__PDS_create_row(GpuContext *gcontext, + TupleDesc tupdesc, + Size length, + const char *fname, int lineno); +extern pgstrom_data_store *__PDS_create_hash(GpuContext *gcontext, + TupleDesc tupdesc, + Size length, + const char *fname, int lineno); 
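The __PDS_create_* entry points above take an explicit (filename, lineno) pair for the same reason as the __gpuMemAlloc* family earlier in this header: the public name is a macro that appends __FILE__ and __LINE__, so every tracked resource remembers its call site. A minimal sketch of the idiom, assuming hypothetical names my_alloc/__my_alloc that are not part of PG-Strom:

    #include <stdio.h>
    #include <stdlib.h>

    /* illustrative only: the double-underscore function does the work... */
    static void *
    __my_alloc(size_t sz, const char *filename, int lineno)
    {
        void   *ptr = malloc(sz);

        if (!ptr)
            fprintf(stderr, "out of memory at %s:%d\n", filename, lineno);
        return ptr;
    }
    /* ...and the public macro stamps in the caller's source location */
    #define my_alloc(sz)    __my_alloc((sz),__FILE__,__LINE__)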
+extern pgstrom_data_store *__PDS_create_slot(GpuContext *gcontext, + TupleDesc tupdesc, + size_t bytesize, + const char *filename, int lineno); +extern pgstrom_data_store *__PDS_create_block(GpuContext *gcontext, + TupleDesc tupdesc, + NVMEScanState *nvme_sstate, + const char *fname, int lineno); +#define PDS_create_row(a,b,c) \ + __PDS_create_row((a),(b),(c),__FILE__,__LINE__) +#define PDS_create_hash(a,b,c) \ + __PDS_create_hash((a),(b),(c),__FILE__,__LINE__) +#define PDS_create_slot(a,b,c) \ + __PDS_create_slot((a),(b),(c),__FILE__,__LINE__) +#define PDS_create_block(a,b,c) \ + __PDS_create_block((a),(b),(c),__FILE__,__LINE__) +#define KDS_clone(a,b) \ + __KDS_clone((a),(b),__FILE__,__LINE__) +#define PDS_clone(a) \ + __PDS_clone((a),__FILE__,__LINE__) + +extern void KDS_dump_schema(kern_data_store *kds); + +//XXX - to be gpu_task.c? +extern void PDS_init_heapscan_state(GpuTaskState *gts); +extern void PDS_end_heapscan_state(GpuTaskState *gts); +extern void PDS_fillup_blocks(pgstrom_data_store *pds); +extern void __PDS_fillup_arrow(pgstrom_data_store *pds_dst, + GpuContext *gcontext, + kern_data_store *kds_head, + int fdesc, strom_io_vector *iovec); +extern pgstrom_data_store *PDS_fillup_arrow(pgstrom_data_store *pds_src); +extern pgstrom_data_store *PDS_writeback_arrow(pgstrom_data_store *pds_src, + CUdeviceptr m_kds_src); +extern bool KDS_insert_tuple(kern_data_store *kds, + TupleTableSlot *slot); +#define PDS_insert_tuple(pds,slot) KDS_insert_tuple(&(pds)->kds,slot) + +extern bool KDS_insert_hashitem(kern_data_store *kds, + TupleTableSlot *slot, + cl_uint hash_value); +extern void pgstrom_init_datastore(void); + +/* + * relscan.c + */ +extern IndexOptInfo *pgstrom_tryfind_brinindex(PlannerInfo *root, + RelOptInfo *baserel, + List **p_indexConds, + List **p_indexQuals, + cl_long *p_indexNBlocks); +#define PGSTROM_RELSCAN_SSD2GPU 0x0001 +#define PGSTROM_RELSCAN_BRIN_INDEX 0x0002 +#define PGSTROM_RELSCAN_ARROW_FDW 0x0004 +#define PGSTROM_RELSCAN_GPU_CACHE 0x0008 +extern int pgstrom_common_relscan_cost(PlannerInfo *root, + RelOptInfo *scan_rel, + List *scan_quals, + int parallel_workers, + IndexOptInfo *indexOpt, + List *indexQuals, + cl_long indexNBlocks, + double *p_parallel_divisor, + double *p_scan_ntuples, + double *p_scan_nchunks, + cl_uint *p_nrows_per_block, + Cost *p_startup_cost, + Cost *p_run_cost); +extern Bitmapset *pgstrom_pullup_outer_refs(PlannerInfo *root, + RelOptInfo *base_rel, + Bitmapset *referenced); + +extern const Bitmapset *GetOptimalGpusForRelation(PlannerInfo *root, + RelOptInfo *rel); +extern bool ScanPathWillUseNvmeStrom(PlannerInfo *root, + RelOptInfo *baserel); +extern bool RelationCanUseNvmeStrom(Relation relation); + +extern void pgstromExecInitBrinIndexMap(GpuTaskState *gts, + Oid index_oid, + List *index_conds, + List *index_quals); +extern Size pgstromSizeOfBrinIndexMap(GpuTaskState *gts); +extern void pgstromExecGetBrinIndexMap(GpuTaskState *gts); +extern void pgstromExecEndBrinIndexMap(GpuTaskState *gts); +extern void pgstromExecRewindBrinIndexMap(GpuTaskState *gts); +extern void pgstromExplainBrinIndexMap(GpuTaskState *gts, + ExplainState *es, + List *dcontext); + +extern pgstrom_data_store *pgstromExecScanChunk(GpuTaskState *gts); +extern void pgstromRewindScanChunk(GpuTaskState *gts); + +extern void pgstromExplainOuterScan(GpuTaskState *gts, + List *deparse_context, + List *ancestors, + ExplainState *es, + List *outer_quals, + Cost outer_startup_cost, + Cost outer_total_cost, + double outer_plan_rows, + int outer_plan_width); + +extern void 
pgstrom_init_relscan(void); + +/* + * gpuscan.c + */ +extern bool enable_gpuscan; /* GUC */ +extern Cost cost_for_dma_receive(RelOptInfo *rel, double ntuples); +extern void codegen_gpuscan_quals(StringInfo kern, + codegen_context *context, + const char *component, + Index scanrelid, + List *dev_quals_list); +extern bool pgstrom_pullup_outer_scan(PlannerInfo *root, + const Path *outer_path, + Index *p_outer_relid, + List **p_outer_quals, + const Bitmapset **p_optimal_gpus, + IndexOptInfo **p_index_opt, + List **p_index_conds, + List **p_index_quals, + cl_long *p_index_nblocks); +extern bool pgstrom_path_is_gpuscan(const Path *path); +extern bool pgstrom_plan_is_gpuscan(const Plan *plan); +extern bool pgstrom_planstate_is_gpuscan(const PlanState *ps); +extern Path *pgstrom_copy_gpuscan_path(const Path *pathnode); +extern void assign_gpuscan_session_info(StringInfo buf, GpuTaskState *gts); +extern void pgstrom_init_gpuscan(void); + +/* + * gpujoin.c + */ +struct GpuJoinSharedState; +struct kern_gpujoin; + +extern bool pgstrom_path_is_gpujoin(const Path *pathnode); +extern bool pgstrom_plan_is_gpujoin(const Plan *plannode); +extern bool pgstrom_planstate_is_gpujoin(const PlanState *ps); +extern Path *pgstrom_copy_gpujoin_path(const Path *pathnode); +extern const Bitmapset *gpujoin_get_optimal_gpus(const Path *pathnode); + +#if PG_VERSION_NUM >= 110000 +extern List *extract_partitionwise_pathlist(PlannerInfo *root, + Path *outer_path, + bool try_outer_parallel, + bool try_inner_parallel, + AppendPath **p_append_path, + int *p_parallel_nworkers, + Cost *p_discount_cost); +#endif +extern int gpujoin_process_task(GpuTask *gtask, CUmodule cuda_module); +extern void gpujoin_release_task(GpuTask *gtask); +extern void assign_gpujoin_session_info(StringInfo buf, + GpuTaskState *gts); +extern void pgstrom_init_gpujoin(void); + +extern Size GpuJoinSetupTask(struct kern_gpujoin *kgjoin, + GpuTaskState *gts, + pgstrom_data_store *pds_src); +extern ProgramId GpuJoinCreateCombinedProgram(PlanState *node, + GpuTaskState *gpa_gts, + cl_uint gpa_extra_flags, + cl_uint gpa_varlena_bufsz, + const char *gpa_kern_source, + bool explain_only); +extern bool GpuJoinInnerPreload(GpuTaskState *gts, CUdeviceptr *p_m_kmrels); +extern void GpuJoinInnerUnload(GpuTaskState *gts, bool is_rescan); +extern pgstrom_data_store *GpuJoinExecOuterScanChunk(GpuTaskState *gts); +extern int gpujoinNextRightOuterJoinIfAny(GpuTaskState *gts); +extern TupleTableSlot *gpujoinNextTupleFallbackUpper(GpuTaskState *gts, + struct kern_gpujoin *kgjoin, + pgstrom_data_store *pds_src, + cl_int outer_depth); +extern void gpujoinUpdateRunTimeStat(GpuTaskState *gts, + struct kern_gpujoin *kgjoin); + +/* + * gpupreagg.c + */ +extern int pgstrom_hll_register_bits; +extern bool pgstrom_path_is_gpupreagg(const Path *pathnode); +extern bool pgstrom_plan_is_gpupreagg(const Plan *plan); +extern bool pgstrom_planstate_is_gpupreagg(const PlanState *ps); +extern Path *pgstrom_copy_gpupreagg_path(const Path *pathnode); +extern void gpupreagg_post_planner(PlannedStmt *pstmt, CustomScan *cscan); +extern void assign_gpupreagg_session_info(StringInfo buf, + GpuTaskState *gts); +extern void pgstrom_init_gpupreagg(void); + +/* + * arrow_fdw.c and arrow_read.c + */ +extern bool baseRelIsArrowFdw(RelOptInfo *baserel); +extern bool RelationIsArrowFdw(Relation frel); +extern Bitmapset *GetOptimalGpusForArrowFdw(PlannerInfo *root, + RelOptInfo *baserel); +extern bool KDS_fetch_tuple_arrow(TupleTableSlot *slot, + kern_data_store *kds, + size_t row_index); + +extern 
ArrowFdwState *ExecInitArrowFdw(ScanState *ss, + GpuContext *gcontext, + List *outer_quals, + Bitmapset *outer_refs); +extern pgstrom_data_store *ExecScanChunkArrowFdw(GpuTaskState *gts); +extern void ExecReScanArrowFdw(ArrowFdwState *af_state); +extern void ExecEndArrowFdw(ArrowFdwState *af_state); + +extern void ExecInitDSMArrowFdw(ArrowFdwState *af_state, + GpuTaskSharedState *gtss); +extern void ExecReInitDSMArrowFdw(ArrowFdwState *af_state); +extern void ExecInitWorkerArrowFdw(ArrowFdwState *af_state, + GpuTaskSharedState *gtss); +extern void ExecShutdownArrowFdw(ArrowFdwState *af_state); +extern void ExplainArrowFdw(ArrowFdwState *af_state, + Relation frel, + ExplainState *es, + List *dcontext); +extern void pgstrom_init_arrow_fdw(void); + +/* + * gpu_cache.c + */ +extern bool baseRelHasGpuCache(PlannerInfo *root, + RelOptInfo *baserel); +extern bool RelationHasGpuCache(Relation rel); +extern GpuCacheState *ExecInitGpuCache(ScanState *ss, int eflags, + Bitmapset *outer_refs); +extern pgstrom_data_store *ExecScanChunkGpuCache(GpuTaskState *gts); +extern void ExecReScanGpuCache(GpuCacheState *gcache_state); +extern void ExecEndGpuCache(GpuCacheState *gcache_state); + +extern void ExecInitDSMGpuCache(GpuCacheState *gcache_state, + GpuTaskSharedState *gtss); +extern void ExecReInitDSMGpuCache(GpuCacheState *gcache_state); +extern void ExecInitWorkerGpuCache(GpuCacheState *gcache_state, + GpuTaskSharedState *gtss); +extern void ExecShutdownGpuCache(GpuCacheState *gcache_state); +extern void ExplainGpuCache(GpuCacheState *gcache_state, + Relation frel, ExplainState *es); +extern CUresult gpuCacheMapDeviceMemory(GpuContext *gcontext, + pgstrom_data_store *pds); +extern void gpuCacheUnmapDeviceMemory(GpuContext *gcontext, + pgstrom_data_store *pds); +extern void gpuCacheBgWorkerBegin(int cuda_dindex); +extern bool gpuCacheBgWorkerDispatch(int cuda_dindex); +extern bool gpuCacheBgWorkerIdleTask(int cuda_dindex); +extern void gpuCacheBgWorkerEnd(int cuda_dindex); +extern void pgstrom_init_gpu_cache(void); + +/* + * misc.c + */ +extern Node *fixup_varnode_to_origin(Node *expr, List *cscan_tlist); +extern Expr *make_flat_ands_explicit(List *andclauses); +extern AppendRelInfo **find_appinfos_by_relids_nofail(PlannerInfo *root, + Relids relids, + int *nappinfos); +extern double get_parallel_divisor(Path *path); +#if PG_VERSION_NUM < 110000 +/* PG11 changed pg_proc definition */ +extern char get_func_prokind(Oid funcid); +#define PROKIND_FUNCTION 'f' +#define PROKIND_AGGREGATE 'a' +#define PROKIND_WINDOW 'w' +#define PROKIND_PROCEDURE 'p' +#endif +extern int get_relnatts(Oid relid); +extern Oid get_function_oid(const char *func_name, + oidvector *func_args, + Oid namespace_oid, + bool missing_ok); +extern Oid get_type_oid(const char *type_name, + Oid namespace_oid, + bool missing_ok); +extern char *get_type_name(Oid type_oid, bool missing_ok); +extern char *get_proc_library(HeapTuple protup); +extern Oid get_object_extension_oid(Oid class_id, + Oid object_id, + int32 objsub_id, + bool missing_ok); +extern char *bms_to_cstring(Bitmapset *x); +extern List *bms_to_pglist(const Bitmapset *bms); +extern Bitmapset *bms_from_pglist(List *pglist); +extern bool pathtree_has_gpupath(Path *node); +extern bool pathtree_has_parallel_aware(Path *node); +extern Path *pgstrom_copy_pathnode(const Path *pathnode); +extern const char *errorText(int errcode); + +extern ssize_t __readFile(int fdesc, void *buffer, size_t nbytes); +extern ssize_t __writeFile(int fdesc, const void *buffer, size_t nbytes); +extern ssize_t 
__preadFile(int fdesc, void *buffer, size_t nbytes, off_t f_pos); +extern ssize_t __pwriteFile(int fdesc, const void *buffer, size_t nbytes, off_t f_pos); +extern void *__mmapFile(void *addr, size_t length, + int prot, int flags, int fdesc, off_t offset); +extern int __munmapFile(void *mmap_addr); +extern void *__mremapFile(void *mmap_addr, size_t new_size); + +/* + * nvrtc.c + */ +extern int pgstrom_nvrtc_version(void); +extern void pgstrom_init_nvrtc(void); + +/* + * cufile.c + */ +extern bool cuFileDriverLoaded(void); +extern void pgstrom_init_cufile(void); + +/* + * extra.c + */ +extern bool pgstrom_gpudirect_enabled(void); +extern Size pgstrom_gpudirect_threshold(void); +extern void pgstrom_init_extra(void); +extern bool heterodbLicenseCheck(void); +extern int gpuDirectInitDriver(void); +extern void gpuDirectFileDescOpen(GPUDirectFileDesc *gds_fdesc, + File pg_fdesc); +extern void gpuDirectFileDescOpenByPath(GPUDirectFileDesc *gds_fdesc, + const char *pathname); +extern void gpuDirectFileDescClose(const GPUDirectFileDesc *gds_fdesc); +extern CUresult gpuDirectMapGpuMemory(CUdeviceptr m_segment, + size_t m_segment_sz, + unsigned long *p_iomap_handle); +extern CUresult gpuDirectUnmapGpuMemory(CUdeviceptr m_segment, + unsigned long iomap_handle); + +extern void gpuDirectFileReadIOV(const GPUDirectFileDesc *gds_fdesc, + CUdeviceptr m_segment, + unsigned long iomap_handle, + off_t m_offset, + strom_io_vector *iovec); +extern void extraSysfsSetupDistanceMap(const char *manual_config); +extern Bitmapset *extraSysfsLookupOptimalGpus(File filp); +extern ssize_t extraSysfsPrintNvmeInfo(int index, char *buffer, ssize_t buffer_sz); + +/* + * float2.c + */ +#ifndef FLOAT2OID +#define FLOAT2OID 421 +#endif + +/* + * tinyint.c + */ +#ifndef INT1OID +#define INT1OID 606 +#endif + +/* + * main.c + */ +extern int pgstrom_num_users_extra; +extern pgstromUsersExtraDescriptor pgstrom_users_extra_desc[]; +extern Path *pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath); +extern const Path *gpu_path_find_cheapest(PlannerInfo *root, + RelOptInfo *rel, + bool outer_parallel, + bool inner_parallel); +extern bool gpu_path_remember(PlannerInfo *root, + RelOptInfo *rel, + bool outer_parallel, + bool inner_parallel, + const Path *gpu_path); + +extern void _PG_init(void); +extern const char *pgstrom_strerror(cl_int errcode); + +extern void pgstrom_explain_expression(List *expr_list, const char *qlabel, + PlanState *planstate, + List *deparse_context, + List *ancestors, ExplainState *es, + bool force_prefix, + bool convert_to_and); +extern void show_scan_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es); +extern void show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es); + +/* ---------------------------------------------------------------- + * + * Miscellaneous static inline functions + * + * ---------------------------------------------------------------- */ + +/* looong label is not friendly for indent */ +#define NumOfSystemAttrs (-(1+FirstLowInvalidHeapAttributeNumber)) + +/* Max/Min macros that take 3 or more arguments */ +#define Max3(a,b,c) ((a) > (b) ? Max((a),(c)) : Max((b),(c))) +#define Max4(a,b,c,d) Max(Max((a),(b)), Max((c),(d))) + +#define Min3(a,b,c) ((a) > (b) ?
Min((a),(c)) : Min((b),(c))) +#define Min4(a,b,c,d) Min(Min((a),(b)), Min((c),(d))) + +#ifndef SAMESIGN +#define SAMESIGN(a,b) (((a) < 0) == ((b) < 0)) +#endif + +/* + * trim_cstring - remove spaces from head/tail + */ +static inline char * +trim_cstring(char *str) +{ + char *end; + + while (isspace(*str)) + str++; + end = str + strlen(str) - 1; + while (end >= str && isspace(*end)) + *end-- = '\0'; + + return str; +} + +/* + * pmakeFloat - for convenience; makeFloat + psprintf + */ +#define pmakeFloat(fval) \ + makeFloat(psprintf("%.*e", DBL_DIG+3, (double)(fval))) + +/* + * get_prev_log2 + * + * It returns N of the largest 2^N value that is smaller than or equal to + * the supplied value. + */ +static inline int +get_prev_log2(Size size) +{ + int shift = 0; + + if (size == 0 || size == 1) + return 0; + size >>= 1; +#if __GNUC__ + shift = sizeof(Size) * BITS_PER_BYTE - __builtin_clzl(size); +#else +#if SIZEOF_VOID_P == 8 + if ((size & 0xffffffff00000000UL) != 0) + { + size >>= 32; + shift += 32; + } +#endif + if ((size & 0xffff0000UL) != 0) + { + size >>= 16; + shift += 16; + } + if ((size & 0x0000ff00UL) != 0) + { + size >>= 8; + shift += 8; + } + if ((size & 0x000000f0UL) != 0) + { + size >>= 4; + shift += 4; + } + if ((size & 0x0000000cUL) != 0) + { + size >>= 2; + shift += 2; + } + if ((size & 0x00000002UL) != 0) + { + size >>= 1; + shift += 1; + } + if ((size & 0x00000001UL) != 0) + shift += 1; +#endif /* !__GNUC__ */ + return shift; +} + +/* + * get_next_log2 + * + * It returns N of the least 2^N value that is larger than or equal to + * the supplied value. + */ +static inline int +get_next_log2(Size size) +{ + int shift = 0; + + if (size == 0 || size == 1) + return 0; + size--; +#ifdef __GNUC__ + shift = sizeof(Size) * BITS_PER_BYTE - __builtin_clzl(size); +#else +#if SIZEOF_VOID_P == 8 + if ((size & 0xffffffff00000000UL) != 0) + { + size >>= 32; + shift += 32; + } +#endif + if ((size & 0xffff0000UL) != 0) + { + size >>= 16; + shift += 16; + } + if ((size & 0x0000ff00UL) != 0) + { + size >>= 8; + shift += 8; + } + if ((size & 0x000000f0UL) != 0) + { + size >>= 4; + shift += 4; + } + if ((size & 0x0000000cUL) != 0) + { + size >>= 2; + shift += 2; + } + if ((size & 0x00000002UL) != 0) + { + size >>= 1; + shift += 1; + } + if ((size & 0x00000001UL) != 0) + shift += 1; +#endif /* !__GNUC__ */ + return shift; +}
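A quick sanity check of the two helpers above (an illustrative sketch, not part of the patch): get_prev_log2() rounds a size down to the exponent of the nearest power of two, while get_next_log2() rounds it up, so the pair brackets an arbitrary size between neighboring powers of two.

    #include <assert.h>

    static void
    log2_helpers_demo(void)
    {
        assert(get_prev_log2(1000) == 9);    /* 2^9  = 512  <= 1000 */
        assert(get_next_log2(1000) == 10);   /* 2^10 = 1024 >= 1000 */
        assert(get_prev_log2(1024) == 10);   /* exact powers of two */
        assert(get_next_log2(1024) == 10);   /* map to themselves   */
    }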
+ +/* + * __trim - remove whitespace at the head/tail of cstring + */ +static inline char * +__trim(char *token) +{ + char *tail = token + strlen(token) - 1; + + while (*token == ' ' || *token == '\t') + token++; + while (tail >= token && (*tail == ' ' || *tail == '\t')) + *tail-- = '\0'; + return token; +} + +/* + * It translates an alignment character into width + */ +static inline int +typealign_get_width(char type_align) +{ + switch (type_align) + { + case 'c': + return 1; + case 's': + return ALIGNOF_SHORT; + case 'i': + return ALIGNOF_INT; + case 'd': + return ALIGNOF_DOUBLE; + default: + elog(ERROR, "unexpected type alignment: %c", type_align); + } + return -1; /* be compiler quiet */ +} + +#ifndef forfour +/* XXX - PG12 added forfour() macro */ +#define forfour(lc1, list1, lc2, list2, lc3, list3, lc4, list4) \ + for ((lc1) = list_head(list1), (lc2) = list_head(list2), \ + (lc3) = list_head(list3), (lc4) = list_head(list4); \ + (lc1) != NULL && (lc2) != NULL && (lc3) != NULL && \ + (lc4) != NULL; \ + (lc1) = lnext(lc1), (lc2) = lnext(lc2), (lc3) = lnext(lc3),\ + (lc4) = lnext(lc4)) +#endif + +/* XXX - PG10 added lfirst_node() and related */ +#ifndef lfirst_node +#define lfirst_node(T,x) ((T *)lfirst(x)) +#endif +#ifndef linitial_node +#define linitial_node(T,x) ((T *)linitial(x)) +#endif +#ifndef lsecond_node +#define lsecond_node(T,x) ((T *)lsecond(x)) +#endif +#ifndef lthird_node +#define lthird_node(T,x) ((T *)lthird(x)) +#endif + +/* lappend on the specified memory-context */ +static inline List * +lappend_cxt(MemoryContext memcxt, List *list, void *datum) +{ + MemoryContext oldcxt = MemoryContextSwitchTo(memcxt); + List *r; + + r = lappend(list, datum); + MemoryContextSwitchTo(oldcxt); + + return r; +} + +/* initStringInfo on a particular memory context */ +static inline void +initStringInfoContext(StringInfo str, MemoryContext memcxt) +{ + MemoryContext oldcxt = MemoryContextSwitchTo(memcxt); + initStringInfo(str); + MemoryContextSwitchTo(oldcxt); +} + +static inline char * +format_numeric(cl_long value) +{ + if (value > 8000000000000L || value < -8000000000000L) + return psprintf("%.2fT", (double)value / 1000000000000.0); + else if (value > 8000000000L || value < -8000000000L) + return psprintf("%.2fG", (double)value / 1000000000.0); + else if (value > 8000000L || value < -8000000L) + return psprintf("%.2fM", (double)value / 1000000.0); + else if (value > 8000L || value < -8000L) + return psprintf("%.2fK", (double)value / 1000.0); + else + return psprintf("%ld", value); +} + +static inline char * +format_bytesz(Size nbytes) +{ + if (nbytes > (Size)(1UL << 43)) + return psprintf("%.2fTB", (double)nbytes / (double)(1UL << 40)); + else if (nbytes > (double)(1UL << 33)) + return psprintf("%.2fGB", (double)nbytes / (double)(1UL << 30)); + else if (nbytes > (double)(1UL << 23)) + return psprintf("%.2fMB", (double)nbytes / (double)(1UL << 20)); + else if (nbytes > (double)(1UL << 13)) + return psprintf("%.2fKB", (double)nbytes / (double)(1UL << 10)); + return psprintf("%uB", (unsigned int)nbytes); +} + +static inline char * +format_millisec(double milliseconds) +{ + if (milliseconds > 300000.0) /* more than 5min */ + return psprintf("%.2fmin", milliseconds / 60000.0); + else if (milliseconds > 8000.0) /* more than 8sec */ + return psprintf("%.2fsec", milliseconds / 1000.0); + return psprintf("%.2fms", milliseconds); +}
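For reference, the outputs one should expect from the formatting helpers above (a hedged sketch derived by reading the code, not taken from the patch):

    static void
    format_helpers_demo(void)
    {
        format_numeric(12345678);        /* "12.35M"  - above the 8M cutoff  */
        format_bytesz((Size)1 << 34);    /* "16.00GB" - 2^34 bytes           */
        format_millisec(125.0);          /* "125.00ms"                        */
        format_millisec(9500.0);         /* "9.50sec" - above the 8sec cutoff */
    }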
+ +static inline const char * +__basename(const char *filename) +{ + const char *pos = strrchr(filename, '/'); + + return pos ? pos + 1 : filename; +} + +/* + * merge two dlist_head + */ +static inline void +dlist_append_tail(dlist_head *base, dlist_head *items) +{ + if (dlist_is_empty(items)) + return; + items->head.next->prev = base->head.prev; + items->head.prev->next = &base->head; + base->head.prev->next = items->head.next; + base->head.prev = items->head.prev; +} + +/* + * Some useful memory allocation wrappers + */ +#define palloc_huge(sz) MemoryContextAllocHuge(CurrentMemoryContext,(sz)) +static inline void * +pmemdup(const void *src, Size sz) +{ + void *dst = palloc(sz); + + memcpy(dst, src, sz); + + return dst; +} + +/* + * simple wrapper for pthread_mutex_lock + */ +static inline void +pthreadMutexInit(pthread_mutex_t *mutex, int pshared) +{ + pthread_mutexattr_t mattr; + + if ((errno = pthread_mutexattr_init(&mattr)) != 0) + wfatal("failed on pthread_mutexattr_init: %m"); + if ((errno = pthread_mutexattr_setpshared(&mattr, pshared)) != 0) + wfatal("failed on pthread_mutexattr_setpshared: %m"); + if ((errno = pthread_mutex_init(mutex, &mattr)) != 0) + wfatal("failed on pthread_mutex_init: %m"); + if ((errno = pthread_mutexattr_destroy(&mattr)) != 0) + wfatal("failed on pthread_mutexattr_destroy: %m"); +} + +static inline void +pthreadMutexLock(pthread_mutex_t *mutex) +{ + if ((errno = pthread_mutex_lock(mutex)) != 0) + wfatal("failed on pthread_mutex_lock: %m"); +} + +static inline bool +pthreadMutexLockTimeout(pthread_mutex_t *mutex, cl_ulong timeout_ms) +{ + struct timespec tm; + + if (clock_gettime(CLOCK_REALTIME, &tm) != 0) + wfatal("failed on clock_gettime: %m"); + tm.tv_sec += (timeout_ms / 1000); + tm.tv_nsec += (timeout_ms % 1000) * 1000000; + if (tm.tv_nsec >= 1000000000L) + { + tm.tv_sec += tm.tv_nsec / 1000000000L; + tm.tv_nsec = tm.tv_nsec % 1000000000L; + } + + errno = pthread_mutex_timedlock(mutex, &tm); + if (errno == ETIMEDOUT) + return false; + else if (errno != 0) + wfatal("failed on pthread_mutex_timedlock: %m"); + return true; +} + +static inline void +pthreadMutexUnlock(pthread_mutex_t *mutex) +{ + if ((errno = pthread_mutex_unlock(mutex)) != 0) + wfatal("failed on pthread_mutex_unlock: %m"); +} + +static inline void +pthreadRWLockInit(pthread_rwlock_t *rwlock) +{ + pthread_rwlockattr_t rwattr; + + if ((errno = pthread_rwlockattr_init(&rwattr)) != 0) + wfatal("failed on pthread_rwlockattr_init: %m"); + if ((errno = pthread_rwlockattr_setpshared(&rwattr, 1)) != 0) + wfatal("failed on pthread_rwlockattr_setpshared: %m"); + if ((errno = pthread_rwlock_init(rwlock, &rwattr)) != 0) + wfatal("failed on pthread_rwlock_init: %m"); +} + +static inline void +pthreadRWLockReadLock(pthread_rwlock_t *rwlock) +{ + if ((errno = pthread_rwlock_rdlock(rwlock)) != 0) + wfatal("failed on pthread_rwlock_rdlock: %m"); +} + +static inline void +pthreadRWLockWriteLock(pthread_rwlock_t *rwlock) +{ + if ((errno = pthread_rwlock_wrlock(rwlock)) != 0) + wfatal("failed on pthread_rwlock_wrlock: %m"); +} + +static inline bool +pthreadRWLockWriteTryLock(pthread_rwlock_t *rwlock) +{ + if ((errno = pthread_rwlock_trywrlock(rwlock)) == 0) + return true; + if (errno != EBUSY) + wfatal("failed on pthread_rwlock_trywrlock: %m"); + return false; +} + +static inline void +pthreadRWLockUnlock(pthread_rwlock_t *rwlock) +{ + if ((errno = pthread_rwlock_unlock(rwlock)) != 0) + wfatal("failed on pthread_rwlock_unlock: %m"); +}
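The wrappers above funnel pthread error codes into werror()/wfatal(), so callers need no explicit error handling. A hedged usage sketch for a process-shared mutex (shared_state is a hypothetical struct placed on a shared memory segment, not a PG-Strom type):

    #include <pthread.h>

    typedef struct
    {
        pthread_mutex_t lock;
    } shared_state;

    static void
    shared_state_demo(shared_state *st)
    {
        pthreadMutexInit(&st->lock, PTHREAD_PROCESS_SHARED);
        if (pthreadMutexLockTimeout(&st->lock, 1000))   /* wait up to 1sec */
        {
            /* ... update the shared state ... */
            pthreadMutexUnlock(&st->lock);
        }
    }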
pthread_condattr_init: %m"); + if ((errno = pthread_condattr_setpshared(&condattr, pshared)) != 0) + wfatal("failed on pthread_condattr_setpshared: %m"); + if ((errno = pthread_cond_init(cond, &condattr)) != 0) + wfatal("failed on pthread_cond_init: %m"); + if ((errno = pthread_condattr_destroy(&condattr)) != 0) + wfatal("failed on pthread_condattr_destroy: %m"); +} + +static inline void +pthreadCondWait(pthread_cond_t *cond, pthread_mutex_t *mutex) +{ + if ((errno = pthread_cond_wait(cond, mutex)) != 0) + wfatal("failed on pthread_cond_wait: %m"); +} + +static inline bool +pthreadCondWaitTimeout(pthread_cond_t *cond, pthread_mutex_t *mutex, + long timeout_ms) +{ + struct timespec tm; + + clock_gettime(CLOCK_REALTIME, &tm); + tm.tv_sec += timeout_ms / 1000; + tm.tv_nsec += (timeout_ms % 1000) * 1000000; + if (tm.tv_nsec > 1000000000) + { + tm.tv_sec += tm.tv_nsec / 1000000000; + tm.tv_nsec = tm.tv_nsec % 1000000000; + } + + errno = pthread_cond_timedwait(cond, mutex, &tm); + if (errno == 0) + return true; + else if (errno == ETIMEDOUT) + return false; + wfatal("failed on pthread_cond_timedwait: %m"); +} + +static inline void +pthreadCondBroadcast(pthread_cond_t *cond) +{ + if ((errno = pthread_cond_broadcast(cond)) != 0) + wfatal("failed on pthread_cond_broadcast: %m"); +} + +static inline void +pthreadCondSignal(pthread_cond_t *cond) +{ + if ((errno = pthread_cond_signal(cond)) != 0) + wfatal("failed on pthread_cond_signal: %m"); +} + +/* + * utility to calculate time diff + */ +#define TV_DIFF(tv2,tv1) \ + (((double)(tv2.tv_sec - tv1.tv_sec) * 1000000.0 + \ + (double)(tv2.tv_usec - tv1.tv_usec)) / 1000.0) +#define TP_DIFF(tp2,tp1) \ + ((tp2.tv_sec - tp1.tv_sec) * 1000000000UL + (tp2.tv_nsec - tp1.tv_nsec)) + +#endif /* PG_STROM_H */ diff --git a/old/relscan.c b/old/relscan.c new file mode 100644 index 000000000..70219efe6 --- /dev/null +++ b/old/relscan.c @@ -0,0 +1,2222 @@ +/* + * relscan.c + * + * Common routines related to relation scan + * ---- + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the PostgreSQL License. + */ +#include "pg_strom.h" + +/* Data structure for collecting qual clauses that match an index */ +typedef struct +{ + bool nonempty; /* True if lists are not all empty */ + /* Lists of RestrictInfos, one per index column */ + List *indexclauses[INDEX_MAX_KEYS]; +} IndexClauseSet; + +/*--- static variables ---*/ +static bool pgstrom_enable_brin; + +/* + * simple_match_clause_to_indexcol + * + * It is a simplified version of match_clause_to_indexcol. 
+ * Also see optimizer/path/indxpath.c + */ +static bool +simple_match_clause_to_indexcol(IndexOptInfo *index, + int indexcol, + RestrictInfo *rinfo) +{ + Expr *clause = rinfo->clause; + Index index_relid = index->rel->relid; + Oid opfamily = index->opfamily[indexcol]; + Oid idxcollation = index->indexcollations[indexcol]; + Node *leftop; + Node *rightop; + Relids left_relids; + Relids right_relids; + Oid expr_op; + Oid expr_coll; + + /* Clause must be a binary opclause */ + if (!is_opclause(clause)) + return false; + + leftop = get_leftop(clause); + rightop = get_rightop(clause); + if (!leftop || !rightop) + return false; + left_relids = rinfo->left_relids; + right_relids = rinfo->right_relids; + expr_op = ((OpExpr *) clause)->opno; + expr_coll = ((OpExpr *) clause)->inputcollid; + + if (OidIsValid(idxcollation) && idxcollation != expr_coll) + return false; + + /* + * Check for clauses of the form: + * (indexkey operator constant) OR + * (constant operator indexkey) + */ + if (match_index_to_operand(leftop, indexcol, index) && + !bms_is_member(index_relid, right_relids) && + !contain_volatile_functions(rightop) && + op_in_opfamily(expr_op, opfamily)) + return true; + + if (match_index_to_operand(rightop, indexcol, index) && + !bms_is_member(index_relid, left_relids) && + !contain_volatile_functions(leftop) && + op_in_opfamily(get_commutator(expr_op), opfamily)) + return true; + + return false; +} + +/* + * simple_match_clause_to_index + * + * It is a simplified version of match_clause_to_index. + * Also see optimizer/path/indxpath.c + */ +static void +simple_match_clause_to_index(IndexOptInfo *index, + RestrictInfo *rinfo, + IndexClauseSet *clauseset) +{ + int indexcol; + + /* + * Never match pseudoconstants to indexes. (Normally a match could not + * happen anyway, since a pseudoconstant clause couldn't contain a Var, + * but what if someone builds an expression index on a constant? It's not + * totally unreasonable to do so with a partial index, either.) + */ + if (rinfo->pseudoconstant) + return; + + /* + * If clause can't be used as an indexqual because it must wait till after + * some lower-security-level restriction clause, reject it. + */ + if (!restriction_is_securely_promotable(rinfo, index->rel)) + return; + + /* OK, check each index column for a match */ + for (indexcol = 0; indexcol < index->ncolumns; indexcol++) + { + if (simple_match_clause_to_indexcol(index, + indexcol, + rinfo)) + { + clauseset->indexclauses[indexcol] = + list_append_unique_ptr(clauseset->indexclauses[indexcol], + rinfo); + clauseset->nonempty = true; + break; + } + } +} + +/* + * estimate_brinindex_scan_nblocks + * + * Also see brincostestimate at utils/adt/selfuncs.c + */ +static cl_long +estimate_brinindex_scan_nblocks(PlannerInfo *root, + RelOptInfo *baserel, + IndexOptInfo *index, + IndexClauseSet *clauseset, + List **p_indexQuals) +{ + Relation indexRel; + BrinStatsData statsData; + List *indexQuals = NIL; + ListCell *lc __attribute__((unused)); + int icol __attribute__((unused)); + Selectivity qualSelectivity; + Selectivity indexSelectivity; + double indexCorrelation = 0.0; + double indexRanges; + double minimalRanges; + double estimatedRanges; + + /* Obtain some data from the index itself. 
*/ + indexRel = index_open(index->indexoid, AccessShareLock); + brinGetStats(indexRel, &statsData); + index_close(indexRel, AccessShareLock); + + /* Get selectivity of the index qualifiers */ + icol = 1; + foreach (lc, index->indextlist) + { + TargetEntry *tle = lfirst(lc); + ListCell *cell; + VariableStatData vardata; + + foreach (cell, clauseset->indexclauses[icol-1]) + { + RestrictInfo *rinfo = lfirst(cell); + + indexQuals = lappend(indexQuals, rinfo); + } + + if (IsA(tle->expr, Var)) + { + Var *var = (Var *) tle->expr; + RangeTblEntry *rte; + + /* in case of BRIN index on simple column */ + rte = root->simple_rte_array[var->varno]; + if (get_relation_stats_hook && + (*get_relation_stats_hook)(root, rte, var->varattno, + &vardata)) + { + if (HeapTupleIsValid(vardata.statsTuple) && !vardata.freefunc) + elog(ERROR, "no callback to release stats variable"); + } + else + { + vardata.statsTuple = + SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(rte->relid), + Int16GetDatum(var->varattno), + BoolGetDatum(false)); + vardata.freefunc = ReleaseSysCache; + } + } + else + { + if (get_index_stats_hook && + (*get_index_stats_hook)(root, index->indexoid, icol, + &vardata)) + { + if (HeapTupleIsValid(vardata.statsTuple) && !vardata.freefunc) + elog(ERROR, "no callback to release stats variable"); + } + else + { + vardata.statsTuple + = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(index->indexoid), + Int16GetDatum(icol), + BoolGetDatum(false)); + vardata.freefunc = ReleaseSysCache; + } + } + + if (HeapTupleIsValid(vardata.statsTuple)) + { + AttStatsSlot sslot; + + if (get_attstatsslot(&sslot, vardata.statsTuple, + STATISTIC_KIND_CORRELATION, + InvalidOid, + ATTSTATSSLOT_NUMBERS)) + { + double varCorrelation = 0.0; + + if (sslot.nnumbers > 0) + varCorrelation = Abs(sslot.numbers[0]); + + if (varCorrelation > indexCorrelation) + indexCorrelation = varCorrelation; + + free_attstatsslot(&sslot); + } + } + ReleaseVariableStats(vardata); + + icol++; + } + qualSelectivity = clauselist_selectivity(root, + indexQuals, + baserel->relid, + JOIN_INNER, + NULL); + + /* estimate number of blocks to read */ + indexRanges = ceil((double) baserel->pages / statsData.pagesPerRange); + if (indexRanges < 1.0) + indexRanges = 1.0; + minimalRanges = ceil(indexRanges * qualSelectivity); + + //elog(INFO, "strom: qualSelectivity=%.6f indexRanges=%.6f minimalRanges=%.6f indexCorrelation=%.6f", qualSelectivity, indexRanges, minimalRanges, indexCorrelation); + + if (indexCorrelation < 1.0e-10) + estimatedRanges = indexRanges; + else + estimatedRanges = Min(minimalRanges / indexCorrelation, indexRanges); + + indexSelectivity = estimatedRanges / indexRanges; + if (indexSelectivity < 0.0) + indexSelectivity = 0.0; + if (indexSelectivity > 1.0) + indexSelectivity = 1.0; + + /* index quals, if any */ + if (p_indexQuals) + *p_indexQuals = indexQuals; + /* estimated number of blocks to read */ + return (cl_long)(indexSelectivity * (double) baserel->pages); +} + +/* + * extract_index_conditions + */ +static Node * +__fixup_indexqual_operand(Node *node, IndexOptInfo *indexOpt) +{ + ListCell *lc; + + if (!node) + return NULL; + + if (IsA(node, RelabelType)) + { + RelabelType *relabel = (RelabelType *) node; + + return __fixup_indexqual_operand((Node *)relabel->arg, indexOpt); + } + + foreach (lc, indexOpt->indextlist) + { + TargetEntry *tle = lfirst(lc); + + if (equal(node, tle->expr)) + { + return (Node *)makeVar(INDEX_VAR, + tle->resno, + exprType((Node *)tle->expr), + exprTypmod((Node *) tle->expr), + exprCollation((Node *) 
tle->expr), + 0); + } + } + if (IsA(node, Var)) + elog(ERROR, "Bug? variable is not found at index tlist"); + return expression_tree_mutator(node, __fixup_indexqual_operand, indexOpt); +} + +static List * +extract_index_conditions(List *index_quals, IndexOptInfo *indexOpt) +{ + List *result = NIL; + ListCell *lc; + + foreach (lc, index_quals) + { + RestrictInfo *rinfo = lfirst(lc); + OpExpr *op = (OpExpr *) rinfo->clause; + + if (!IsA(rinfo->clause, OpExpr)) + elog(ERROR, "Bug? unexpected index clause: %s", + nodeToString(rinfo->clause)); + if (list_length(((OpExpr *)rinfo->clause)->args) != 2) + elog(ERROR, "indexqual clause must be binary opclause"); + op = (OpExpr *)copyObject(rinfo->clause); + if (!bms_equal(rinfo->left_relids, indexOpt->rel->relids)) + CommuteOpExpr(op); + /* replace the indexkey expression with an index Var */ + linitial(op->args) = __fixup_indexqual_operand(linitial(op->args), + indexOpt); + result = lappend(result, op); + } + return result; +}
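To make the arithmetic of estimate_brinindex_scan_nblocks() above concrete, here is a worked example with invented numbers (illustration only, not part of the patch):

    /*
     * baserel->pages = 100000, pagesPerRange = 128
     *   -> indexRanges      = ceil(100000 / 128)  = 782
     * qualSelectivity = 0.01
     *   -> minimalRanges    = ceil(782 * 0.01)    = 8
     * indexCorrelation = 0.5
     *   -> estimatedRanges  = Min(8 / 0.5, 782)   = 16
     *   -> indexSelectivity = 16 / 782           ~= 0.0205
     * result: ~0.0205 * 100000 pages ~= 2046 blocks to be read
     */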
+ +/* + * pgstrom_tryfind_brinindex + */ +IndexOptInfo * +pgstrom_tryfind_brinindex(PlannerInfo *root, + RelOptInfo *baserel, + List **p_indexConds, + List **p_indexQuals, + cl_long *p_indexNBlocks) +{ + cl_long indexNBlocks = LONG_MAX; + IndexOptInfo *indexOpt = NULL; + List *indexQuals = NIL; + ListCell *cell; + + /* skip if GUC disables BRIN-index */ + if (!pgstrom_enable_brin) + return NULL; + + /* skip if no indexes */ + if (baserel->indexlist == NIL) + return NULL; + + foreach (cell, baserel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(cell); + List *temp = NIL; + ListCell *lc; + cl_long nblocks; + IndexClauseSet clauseset; + + /* Protect limited-size array in IndexClauseSets */ + Assert(index->ncolumns <= INDEX_MAX_KEYS); + + /* Ignore partial indexes that do not match the query. */ + if (index->indpred != NIL && !index->predOK) + continue; + + /* Only BRIN-indexes are now supported */ + if (index->relam != BRIN_AM_OID) + continue; + + /* see match_clauses_to_index */ + memset(&clauseset, 0, sizeof(IndexClauseSet)); + foreach (lc, index->indrestrictinfo) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc); + + simple_match_clause_to_index(index, rinfo, &clauseset); + } + if (!clauseset.nonempty) + continue; + + /* + * When multiple BRIN indexes are configured, + * the one with minimal selectivity is the best choice. + */ + nblocks = estimate_brinindex_scan_nblocks(root, baserel, + index, + &clauseset, + &temp); + if (indexNBlocks > nblocks) + { + indexOpt = index; + indexQuals = temp; + indexNBlocks = nblocks; + } + } + + if (indexOpt) + { + if (p_indexConds) + *p_indexConds = extract_index_conditions(indexQuals, indexOpt); + if (p_indexQuals) + *p_indexQuals = indexQuals; + if (p_indexNBlocks) + *p_indexNBlocks = indexNBlocks; + } + return indexOpt; +} + +/* + * pgstrom_common_relscan_cost + */ +int +pgstrom_common_relscan_cost(PlannerInfo *root, + RelOptInfo *scan_rel, + List *scan_quals, + int parallel_workers, + IndexOptInfo *indexOpt, + List *indexQuals, + cl_long indexNBlocks, + double *p_parallel_divisor, + double *p_scan_ntuples, + double *p_scan_nchunks, + cl_uint *p_nrows_per_block, + Cost *p_startup_cost, + Cost *p_run_cost) +{ + int scan_mode = 0; + Cost startup_cost = 0.0; + Cost run_cost = 0.0; + Cost index_scan_cost = 0.0; + Cost disk_scan_cost = 0.0; + double gpu_ratio = pgstrom_gpu_operator_cost / cpu_operator_cost; + double parallel_divisor; + double ntuples = scan_rel->tuples; + double nblocks = scan_rel->pages; + double nchunks; + double selectivity; + double spc_seq_page_cost; + double spc_rand_page_cost; + cl_uint nrows_per_block = 0; + Size heap_size; + Size htup_size; + QualCost qcost; + ListCell *lc; + + Assert((scan_rel->reloptkind == RELOPT_BASEREL || + scan_rel->reloptkind == RELOPT_OTHER_MEMBER_REL) && + scan_rel->relid > 0 && + scan_rel->relid < root->simple_rel_array_size); + /* mark if special storage layer */ + if (baseRelIsArrowFdw(scan_rel)) + scan_mode |= PGSTROM_RELSCAN_ARROW_FDW; + if (baseRelHasGpuCache(root, scan_rel)) + scan_mode |= PGSTROM_RELSCAN_GPU_CACHE; + + /* selectivity of device executable qualifiers */ + selectivity = clauselist_selectivity(root, + scan_quals, + scan_rel->relid, + JOIN_INNER, + NULL); + /* cost of full-table scan, if not gpu memory store */ + if ((scan_mode & PGSTROM_RELSCAN_GPU_CACHE) == 0) + { + get_tablespace_page_costs(scan_rel->reltablespace, + &spc_rand_page_cost, + &spc_seq_page_cost); + disk_scan_cost = spc_seq_page_cost * nblocks; + } + + /* consideration for BRIN-index, if any */ + if (indexOpt) + { + BrinStatsData statsData; + Relation index_rel; + Cost x; + + index_rel = index_open(indexOpt->indexoid, AccessShareLock); + brinGetStats(index_rel, &statsData); + index_close(index_rel, AccessShareLock); + + get_tablespace_page_costs(indexOpt->reltablespace, + &spc_rand_page_cost, + &spc_seq_page_cost); + index_scan_cost = spc_seq_page_cost * statsData.revmapNumPages; + foreach (lc, indexQuals) + { + cost_qual_eval_node(&qcost, (Node *)lfirst(lc), root); + index_scan_cost += qcost.startup + qcost.per_tuple; + } + + x = index_scan_cost + spc_rand_page_cost * (double)indexNBlocks; + if (disk_scan_cost > x) + { + disk_scan_cost = x; + ntuples = scan_rel->tuples * ((double) indexNBlocks / nblocks); + nblocks = indexNBlocks; + scan_mode |= PGSTROM_RELSCAN_BRIN_INDEX; + } + } + + /* check whether NVMe-Strom is capable */ + if (ScanPathWillUseNvmeStrom(root, scan_rel)) + scan_mode |= PGSTROM_RELSCAN_SSD2GPU; + + /* + * Cost adjustment by CPU parallelism, if used. +
* (overall logic is equivalent to cost_seqscan()) + */ + if (parallel_workers > 0) + { + parallel_divisor = (double) parallel_workers; +#if PG_VERSION_NUM >= 110000 + if (parallel_leader_participation) +#endif + { + double leader_contribution; + + leader_contribution = 1.0 - (0.3 * (double) parallel_workers); + if (leader_contribution > 0) + parallel_divisor += leader_contribution; + } + /* number of tuples to be actually processed */ + ntuples = clamp_row_est(ntuples / parallel_divisor); + + /* + * After v2.0, pg_strom.gpu_setup_cost represents the cost of the + * run-time code build by NVRTC. Once the binary is constructed, it can + * be shared with all the worker processes, so we can discount the + * cost by parallel_divisor. + */ + startup_cost += pgstrom_gpu_setup_cost / 2 + + (pgstrom_gpu_setup_cost / (2 * parallel_divisor)); + } + else + { + parallel_divisor = 1.0; + startup_cost += pgstrom_gpu_setup_cost; + } + /* + * Cost discount for more efficient I/O with multiplexing. + * PG background workers can issue read requests to the filesystem + * concurrently. It keeps the I/O subsystem working while other + * workers are blocked, which pulls up the usage ratio of the + * storage system. + */ + disk_scan_cost /= Min(2.0, sqrt(parallel_divisor)); + + /* more disk i/o discount if NVMe-Strom is available */ + if ((scan_mode & PGSTROM_RELSCAN_SSD2GPU) != 0) + disk_scan_cost /= 1.5; + run_cost += disk_scan_cost; + + /* + * Rough estimation for the number of chunks if KDS_FORMAT_ROW. + * Also note that we roughly assume KDS_HeadSz is BLCKSZ to + * reduce estimation cycles. + */ + heap_size = (double)(BLCKSZ - SizeOfPageHeaderData) * nblocks; + htup_size = (MAXALIGN(offsetof(HeapTupleHeaderData, + t_bits[BITMAPLEN(scan_rel->max_attr)])) + + MAXALIGN(heap_size / Max(scan_rel->tuples, 1.0) - + sizeof(ItemIdData) - SizeofHeapTupleHeader)); + nchunks = (((double)(offsetof(kern_tupitem, htup) + htup_size + + sizeof(cl_uint)) * Max(ntuples, 1.0)) / + ((double)(pgstrom_chunk_size() - BLCKSZ))); + nchunks = Max(nchunks, 1); + + /* + * estimation of the tuple density per block - this logic follows + * the manner in estimate_rel_size() + */ + if (scan_rel->pages > 0) + nrows_per_block = ceil(scan_rel->tuples / (double)scan_rel->pages); + else + { + RangeTblEntry *rte = root->simple_rte_array[scan_rel->relid]; + size_t tuple_width = get_relation_data_width(rte->relid, NULL); + + tuple_width += MAXALIGN(SizeofHeapTupleHeader); + tuple_width += sizeof(ItemIdData); + /* note: integer division is intentional here */ + nrows_per_block = (BLCKSZ - SizeOfPageHeaderData) / tuple_width; + } + + /* Cost for GPU qualifiers */ + cost_qual_eval_node(&qcost, (Node *)scan_quals, root); + startup_cost += qcost.startup; + run_cost += qcost.per_tuple * gpu_ratio * ntuples; + ntuples *= selectivity; + + /* Cost for DMA transfer (host/storage --> GPU) */ + run_cost += pgstrom_gpu_dma_cost * nchunks; + + *p_parallel_divisor = parallel_divisor; + *p_scan_ntuples = ntuples / parallel_divisor; + *p_scan_nchunks = nchunks / parallel_divisor; + *p_nrows_per_block = + ((scan_mode & PGSTROM_RELSCAN_SSD2GPU) != 0 ? nrows_per_block : 0); + *p_startup_cost = startup_cost; + *p_run_cost = run_cost; + + return scan_mode; +}
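Worked numbers for the parallel-cost adjustment above (illustration only, assuming parallel_leader_participation is on):

    /*
     * parallel_workers = 2
     *   -> leader_contribution = 1.0 - 0.3 * 2  = 0.4
     *   -> parallel_divisor    = 2.0 + 0.4      = 2.4
     * gpu_setup_cost is charged half fixed, half shared:
     *   startup_cost += setup/2 + setup/(2 * 2.4)
     *                 ~= 0.708 * setup, instead of the full setup cost
     */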
+ +/* + * pgstrom_pullup_outer_refs + * + * Set up the outer_refs bitmap according to the attr_needed of RelOptInfo. + * If base_rel is a partition leaf, we have to look at the parent relation + * instead. + */ +Bitmapset * +pgstrom_pullup_outer_refs(PlannerInfo *root, + RelOptInfo *base_rel, + Bitmapset *referenced) +{ + ListCell *lc; + int i, j, k; + + if (base_rel->reloptkind == RELOPT_BASEREL) + { + for (i=base_rel->min_attr, j=0; i <= base_rel->max_attr; i++, j++) + { + if (i < 0 || base_rel->attr_needed[j] == NULL) + continue; + k = i - FirstLowInvalidHeapAttributeNumber; + referenced = bms_add_member(referenced, k); + } + } + else if (base_rel->reloptkind == RELOPT_OTHER_MEMBER_REL) + { + foreach (lc, root->append_rel_list) + { + AppendRelInfo *apinfo = lfirst(lc); + RelOptInfo *parent_rel; + Bitmapset *parent_refs; + Var *var; + + if (apinfo->child_relid != base_rel->relid) + continue; + Assert(apinfo->parent_relid < root->simple_rel_array_size); + parent_rel = root->simple_rel_array[apinfo->parent_relid]; + parent_refs = pgstrom_pullup_outer_refs(root, parent_rel, NULL); + + for (k = bms_next_member(parent_refs, -1); + k >= 0; + k = bms_next_member(parent_refs, k)) + { + i = k + FirstLowInvalidHeapAttributeNumber; + if (i <= 0) + { + /* system attributes are mapped as-is */ + referenced = bms_add_member(referenced, k); + continue; + } + if (i > list_length(apinfo->translated_vars)) + elog(ERROR, "Bug? column reference out of range"); + var = list_nth(apinfo->translated_vars, i-1); + Assert(IsA(var, Var)); + j = var->varattno - FirstLowInvalidHeapAttributeNumber; + referenced = bms_add_member(referenced, j); + } + break; + } + if (lc == NULL) + elog(ERROR, "Bug? AppendRelInfo not found (relid=%u)", + base_rel->relid); + } + else + { + elog(ERROR, "Bug? outer is not a simple relation"); + } + return referenced; +}
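The j/k arithmetic above uses the usual PostgreSQL convention for storing (possibly negative) attribute numbers in a zero-based Bitmapset; a hedged sketch of the round trip:

    static void
    attno_offset_demo(void)
    {
        /* system attributes have attno < 0, so shift by the lowest invalid one */
        int     attno = SelfItemPointerAttributeNumber;      /* ctid, i.e. -1 */
        int     k = attno - FirstLowInvalidHeapAttributeNumber;

        Assert(k >= 0);                                      /* fits a Bitmapset */
        Assert(k + FirstLowInvalidHeapAttributeNumber == attno); /* round trip */
    }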
+ Assert(optimal_gpus->nwords <= (numDevAttrs / BITS_PER_BITMAPWORD) + 1); + memcpy(&hentry->optimal_gpus, optimal_gpus, + offsetof(Bitmapset, words[optimal_gpus->nwords])); + bms_free(optimal_gpus); + } + FileClose(filp); + hentry->is_valid = true; + } + Assert(hentry->is_valid); + return (hentry->optimal_gpus.nwords > 0 ? &hentry->optimal_gpus : NULL); +} + +const Bitmapset * +GetOptimalGpusForRelation(PlannerInfo *root, RelOptInfo *rel) +{ + RangeTblEntry *rte; + HeapTuple tup; + char relpersistence; + const Bitmapset *optimal_gpus; + + if (baseRelIsArrowFdw(rel)) + { + if (pgstrom_gpudirect_enabled()) + return GetOptimalGpusForArrowFdw(root, rel); + return NULL; + } + + optimal_gpus = GetOptimalGpusForTablespace(rel->reltablespace); + if (!bms_is_empty(optimal_gpus)) + { + /* only permanent / unlogged tables can use NVMe-Strom */ + rte = root->simple_rte_array[rel->relid]; + tup = SearchSysCache1(RELOID, ObjectIdGetDatum(rte->relid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for relation %u", rte->relid); + relpersistence = ((Form_pg_class) GETSTRUCT(tup))->relpersistence; + ReleaseSysCache(tup); + + if (relpersistence == RELPERSISTENCE_PERMANENT || + relpersistence == RELPERSISTENCE_UNLOGGED) + return optimal_gpus; + } + return NULL; +} + +bool +RelationCanUseNvmeStrom(Relation relation) +{ + Oid tablespace_oid = RelationGetForm(relation)->reltablespace; + + /* SSD2GPU on temp relations is not supported */ + if (RelationUsesLocalBuffers(relation)) + return false; + return !bms_is_empty(GetOptimalGpusForTablespace(tablespace_oid)); +} + +/* + * ScanPathWillUseNvmeStrom - Optimizer Hint + */ +bool +ScanPathWillUseNvmeStrom(PlannerInfo *root, RelOptInfo *baserel) +{ + size_t num_scan_pages = 0; + + if (!pgstrom_gpudirect_enabled()) + return false; + + /* + * Check the expected amount of scan i/o. + * If 'baserel' is a child of a partitioned table, the threshold shall + * be checked against the entire partition size, because the set of + * child tables actually scanned depends on the scan qualifiers, and + * thus varies over time. A user may focus on a particular range now, + * but on another area next time; that would lead to potential i/o + * thrashing. + */ + if (baserel->reloptkind == RELOPT_BASEREL) + { + if (!bms_is_empty(GetOptimalGpusForRelation(root, baserel))) + num_scan_pages = baserel->pages; + } + else if (baserel->reloptkind == RELOPT_OTHER_MEMBER_REL) + { + ListCell *lc; + Index parent_relid = 0; + + foreach (lc, root->append_rel_list) + { + AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(lc); + + if (appinfo->child_relid == baserel->relid) + { + parent_relid = appinfo->parent_relid; + break; + } + } + if (!lc) + { + elog(NOTICE, "Bug? child table (%d) not found in append_rel_list", + baserel->relid); + return false; + } + + foreach (lc, root->append_rel_list) + { + AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(lc); + RelOptInfo *rel; + + if (appinfo->parent_relid != parent_relid) + continue; + rel = root->simple_rel_array[appinfo->child_relid]; + if (!bms_is_empty(GetOptimalGpusForRelation(root, rel))) + num_scan_pages += rel->pages; + } + } + else + elog(ERROR, "Bug? 
unexpected reloptkind of base relation: %d", + (int)baserel->reloptkind); + + if (num_scan_pages < pgstrom_gpudirect_threshold() / BLCKSZ) + return false; + /* ok, this table scan can use nvme-strom */ + return true; +} + +/* + * pgstromIndexState - runtime status of BRIN-index for relation scan + */ +typedef struct pgstromIndexState +{ + Oid index_oid; + Relation index_rel; + Node *index_quals; /* for EXPLAIN */ + BlockNumber nblocks; + BlockNumber range_sz; + BrinRevmap *brin_revmap; + BrinDesc *brin_desc; + ScanKey scan_keys; + int num_scan_keys; + IndexRuntimeKeyInfo *runtime_keys_info; + int num_runtime_keys; + bool runtime_key_ready; + ExprContext *runtime_econtext; +} pgstromIndexState; + +/* + * pgstromExecInitBrinIndexMap + */ +void +pgstromExecInitBrinIndexMap(GpuTaskState *gts, + Oid index_oid, + List *index_conds, + List *index_quals) +{ + pgstromIndexState *pi_state = NULL; + Relation relation = gts->css.ss.ss_currentRelation; + EState *estate = gts->css.ss.ps.state; + Index scanrelid; + LOCKMODE lockmode = NoLock; + + if (!OidIsValid(index_oid)) + { + Assert(index_conds == NIL); + gts->outer_index_state = NULL; + return; + } + Assert(relation != NULL); + scanrelid = ((Scan *) gts->css.ss.ps.plan)->scanrelid; + if (!ExecRelationIsTargetRelation(estate, scanrelid)) + lockmode = AccessShareLock; + + pi_state = palloc0(sizeof(pgstromIndexState)); + pi_state->index_oid = index_oid; + pi_state->index_rel = index_open(index_oid, lockmode); + pi_state->index_quals = (Node *)make_ands_explicit(index_quals); + ExecIndexBuildScanKeys(&gts->css.ss.ps, + pi_state->index_rel, + index_conds, + false, + &pi_state->scan_keys, + &pi_state->num_scan_keys, + &pi_state->runtime_keys_info, + &pi_state->num_runtime_keys, + NULL, + NULL); + + /* ExprContext to evaluate runtime keys, if any */ + if (pi_state->num_runtime_keys != 0) + pi_state->runtime_econtext = CreateExprContext(estate); + else + pi_state->runtime_econtext = NULL; + + /* BRIN index specific initialization */ + pi_state->nblocks = RelationGetNumberOfBlocks(relation); + pi_state->brin_revmap = brinRevmapInitialize(pi_state->index_rel, + &pi_state->range_sz, + estate->es_snapshot); + pi_state->brin_desc = brin_build_desc(pi_state->index_rel); + + /* save the state */ + gts->outer_index_state = pi_state; +} + +/* + * pgstromSizeOfBrinIndexMap + */ +Size +pgstromSizeOfBrinIndexMap(GpuTaskState *gts) +{ + pgstromIndexState *pi_state = gts->outer_index_state; + int nwords; + + if (!pi_state) + return 0; + + nwords = (pi_state->nblocks + + pi_state->range_sz - 1) / pi_state->range_sz; + return STROMALIGN(offsetof(Bitmapset, words) + + sizeof(bitmapword) * nwords); + +} + +/* + * pgstromExecGetBrinIndexMap + * + * Also see bringetbitmap + */ +static void +__pgstromExecGetBrinIndexMap(pgstromIndexState *pi_state, + Bitmapset *brin_map, + Snapshot snapshot) +{ + BrinDesc *bdesc = pi_state->brin_desc; + TupleDesc bd_tupdesc = bdesc->bd_tupdesc; + BlockNumber nblocks = pi_state->nblocks; + BlockNumber range_sz = pi_state->range_sz; + BlockNumber heapBlk; + BlockNumber index; + Buffer buf = InvalidBuffer; + FmgrInfo *consistentFn; + BrinMemTuple *dtup; + BrinTuple *btup __attribute__((unused)) = NULL; + Size btupsz __attribute__((unused)) = 0; + int nranges; + int nwords; + MemoryContext oldcxt; + MemoryContext perRangeCxt; + + /* rooms for the consistent support procedures of indexed columns */ + consistentFn = palloc0(sizeof(FmgrInfo) * bd_tupdesc->natts); + /* allocate an initial in-memory tuple */ + dtup = brin_new_memtuple(bdesc); + + /* switch 
to the per-range working memory context */ + perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, + "PG-Strom BRIN-index temporary", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(perRangeCxt); + + nranges = (pi_state->nblocks + + pi_state->range_sz - 1) / pi_state->range_sz; + nwords = (nranges + BITS_PER_BITMAPWORD - 1) / BITS_PER_BITMAPWORD; + Assert(brin_map->nwords < 0); + memset(brin_map->words, 0, sizeof(bitmapword) * nwords); + /* + * Now scan the revmap. We start by querying for heap page 0, + * incrementing by the number of pages per range; this gives us a full + * view of the table. + */ + for (heapBlk = 0, index = 0; + heapBlk < nblocks; + heapBlk += range_sz, index++) + { + BrinTuple *tup; + OffsetNumber off; + Size size; + int keyno; + + CHECK_FOR_INTERRUPTS(); + + MemoryContextResetAndDeleteChildren(perRangeCxt); + + tup = brinGetTupleForHeapBlock(pi_state->brin_revmap, heapBlk, + &buf, &off, &size, + BUFFER_LOCK_SHARE, + snapshot); + if (tup) + { + btup = brin_copy_tuple(tup, size, btup, &btupsz); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + dtup = brin_deform_tuple(bdesc, btup, dtup); + if (!dtup->bt_placeholder) + { + for (keyno = 0; keyno < pi_state->num_scan_keys; keyno++) + { + ScanKey key = &pi_state->scan_keys[keyno]; + AttrNumber keyattno = key->sk_attno; + BrinValues *bval = &dtup->bt_columns[keyattno - 1]; + Datum rv; + Form_pg_attribute keyattr __attribute__((unused)); + + keyattr = tupleDescAttr(bd_tupdesc, keyattno-1); + Assert((key->sk_flags & SK_ISNULL) || + (key->sk_collation == keyattr->attcollation)); + /* First time this column? look up consistent function */ + if (consistentFn[keyattno - 1].fn_oid == InvalidOid) + { + FmgrInfo *tmp; + + tmp = index_getprocinfo(pi_state->index_rel, keyattno, + BRIN_PROCNUM_CONSISTENT); + fmgr_info_copy(&consistentFn[keyattno - 1], tmp, + CurrentMemoryContext); + } + + /* + * Check whether the scan key is consistent with the page + * range values; if not, all pages in this range can be + * skipped during the scan, so its bit is set in the map. 
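+ * For example, with range_sz=128 and a qual like "x < 100", any + * 128-block range whose BRIN summary says min(x) >= 100 has its bit + * set here, and is never fetched from the storage (the figures are + * illustrative only).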
+ */ + rv = FunctionCall3Coll(&consistentFn[keyattno - 1], + key->sk_collation, + PointerGetDatum(bdesc), + PointerGetDatum(bval), + PointerGetDatum(key)); + if (!DatumGetBool(rv)) + { + if (index / BITS_PER_BITMAPWORD < nwords) + brin_map->words[index / BITS_PER_BITMAPWORD] + |= (1U << (index % BITS_PER_BITMAPWORD)); + break; + } + } + } + } + } + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(perRangeCxt); + + if (buf != InvalidBuffer) + ReleaseBuffer(buf); + /* mark this bitmapset is ready */ + pg_memory_barrier(); + brin_map->nwords = nwords; +} + +void +pgstromExecGetBrinIndexMap(GpuTaskState *gts) +{ + pgstromIndexState *pi_state = gts->outer_index_state; + + if (!gts->outer_index_map || gts->outer_index_map->nwords < 0) + { + EState *estate = gts->css.ss.ps.state; + + if (!gts->outer_index_map) + { + Assert(!IsParallelWorker()); + gts->outer_index_map + = MemoryContextAlloc(estate->es_query_cxt, + pgstromSizeOfBrinIndexMap(gts)); + gts->outer_index_map->nwords = -1; + } + + ResetLatch(MyLatch); + while (gts->outer_index_map->nwords < 0) + { + if (!IsParallelWorker()) + { + __pgstromExecGetBrinIndexMap(pi_state, + gts->outer_index_map, + estate->es_snapshot); + /* wake up parallel workers if any */ + if (gts->pcxt) + { + ParallelContext *pcxt = gts->pcxt; + pid_t pid; + int i; + + for (i=0; i < pcxt->nworkers_launched; i++) + { + if (GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, + &pid) == BGWH_STARTED) + ProcSendSignal(pid); + } + } +#if 0 + { + Bitmapset *map = gts->outer_index_map; + int i; + + elog(INFO, "BRIN-index (%s) range_sz = %d", + RelationGetRelationName(pi_state->index_rel), + pi_state->range_sz); + for (i=0; i < map->nwords; i += 4) + { + elog(INFO, "% 6d: %08x %08x %08x %08x", + i * BITS_PER_BITMAPWORD, + i+3 < map->nwords ? map->words[i+3] : 0, + i+2 < map->nwords ? map->words[i+2] : 0, + i+1 < map->nwords ? map->words[i+1] : 0, + i < map->nwords ? 
map->words[i] : 0); + } + } +#endif + } + else + { + int ev; + + /* wait for completion of BRIN-index preload */ + CHECK_FOR_INTERRUPTS(); + + ev = WaitLatch(MyLatch, + WL_LATCH_SET | + WL_POSTMASTER_DEATH, + -1, + PG_WAIT_EXTENSION); + if (ev & WL_POSTMASTER_DEATH) + elog(FATAL, "unexpected postmaster dead"); + ResetLatch(MyLatch); + } + } + } +} + +void +pgstromExecEndBrinIndexMap(GpuTaskState *gts) +{ + pgstromIndexState *pi_state = gts->outer_index_state; + + if (!pi_state) + return; + brinRevmapTerminate(pi_state->brin_revmap); + index_close(pi_state->index_rel, NoLock); +} + +void +pgstromExecRewindBrinIndexMap(GpuTaskState *gts) +{} + +/* + * pgstromExplainBrinIndexMap + */ +void +pgstromExplainBrinIndexMap(GpuTaskState *gts, + ExplainState *es, + List *dcontext) +{ + pgstromIndexState *pi_state = gts->outer_index_state; + char *conds_str; + char temp[128]; + + if (!pi_state) + return; + + conds_str = deparse_expression(pi_state->index_quals, + dcontext, es->verbose, false); + ExplainPropertyText("BRIN cond", conds_str, es); + if (es->analyze) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + snprintf(temp, sizeof(temp), "%ld of %ld (%.2f%%)", + gts->outer_brin_count, + (long)pi_state->nblocks, + 100.0 * ((double) gts->outer_brin_count / + (double) pi_state->nblocks)); + ExplainPropertyText("BRIN skipped", temp, es); + } + else + { + ExplainPropertyInteger("BRIN fetched", NULL, + pi_state->nblocks - + gts->outer_brin_count, es); + ExplainPropertyInteger("BRIN skipped", NULL, + gts->outer_brin_count, es); + } + } +} + +/* + * PDS_exec_heapscan_block - PDS scan for KDS_FORMAT_BLOCK format + */ +typedef struct { + strom_io_vector *iovec; + BlockNumber *blknum; +} PDSHeapScanBlockState; + +#define initPDSHeapScanBlockState(pds, bstate) \ + do{ \ + (bstate).iovec = alloca(offsetof(strom_io_vector, \ + ioc[(pds)->kds.nrooms])); \ + (bstate).iovec->nr_chunks = 0; \ + (bstate).blknum = alloca(sizeof(BlockNumber) * (pds)->kds.nrooms); \ + }while(0) + +static inline void +updatePDSHeapScanBlockState(pgstrom_data_store *pds, + PDSHeapScanBlockState *bstate, + BlockNumber blknum) +{ + strom_io_vector *iovec = bstate->iovec; + strom_io_chunk *iochunk; + cl_uint pages_per_block = (BLCKSZ / PAGE_SIZE); + cl_uint fchunk_id = (blknum % RELSEG_SIZE) * pages_per_block; + + if (iovec->nr_chunks > 0) + { + iochunk = &iovec->ioc[iovec->nr_chunks - 1]; + if (iochunk->fchunk_id + iochunk->nr_pages == fchunk_id) + { + /* continuous region - expand the last chunk */ + iochunk->nr_pages += pages_per_block; + goto out; + } + } + /* discontinuous region - add a new chunk */ + iochunk = &iovec->ioc[iovec->nr_chunks++]; + iochunk->m_offset = BLCKSZ * pds->nblocks_uncached; + iochunk->fchunk_id = fchunk_id; + iochunk->nr_pages = pages_per_block; +out: + bstate->blknum[pds->nblocks_uncached++] = blknum; +} + +static void +mergePDSHeapScanBlockState(pgstrom_data_store *pds, + PDSHeapScanBlockState *bstate) +{ + strom_io_vector *iovec = bstate->iovec; + cl_uint nr_uncached = pds->nblocks_uncached; + cl_uint nr_loaded = pds->kds.nitems - nr_uncached; + BlockNumber *block_nums = (BlockNumber *)KERN_DATA_STORE_BODY(&pds->kds); + + Assert(pds->nblocks_uncached > 0); + Assert(iovec != NULL); + + /* copy BlockNumber array */ + memcpy(block_nums + nr_loaded, bstate->blknum, + sizeof(BlockNumber) * nr_uncached); + /* copy iovec */ + memcpy(pds->iovec, iovec, offsetof(strom_io_vector, + ioc[iovec->nr_chunks])); +} + +static bool +PDS_exec_heapscan_block(GpuTaskState *gts, + pgstrom_data_store *pds, + PDSHeapScanBlockState 
*bstate) +{ + Relation relation = gts->css.ss.ss_currentRelation; + HeapScanDesc hscan = (HeapScanDesc)gts->css.ss.ss_currentScanDesc; + NVMEScanState *nvme_sstate = gts->nvme_sstate; + BlockNumber blknum = hscan->rs_cblock; + BlockNumber *block_nums; + Snapshot snapshot = ((TableScanDesc)hscan)->rs_snapshot; + BufferAccessStrategy strategy = hscan->rs_strategy; + SMgrRelation smgr = relation->rd_smgr; + Buffer buffer; + Page spage; + Page dpage; + cl_uint nr_loaded; + bool all_visible; + + /* obviously, the PDS cannot accept any more blocks */ + if (pds->kds.nitems >= pds->kds.nrooms) + return false; + + /* array of block numbers */ + block_nums = (BlockNumber *)KERN_DATA_STORE_BODY(&pds->kds); + + /* + * NVMe-Strom can be applied only when the filesystem supports the + * feature and the current source block is all-visible. + * Otherwise, we fall back to the synchronous buffer scan. + */ + if (RelationCanUseNvmeStrom(relation) && + VM_ALL_VISIBLE(relation, blknum, + &nvme_sstate->curr_vmbuffer)) + { + BufferTag newTag; + uint32 newHash; + LWLock *newPartitionLock = NULL; + bool retval; + int buf_id; + + /* create a tag so we can look up the buffer */ + INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, MAIN_FORKNUM, blknum); + /* determine its hash code and partition lock ID */ + newHash = BufTableHashCode(&newTag); + newPartitionLock = BufMappingPartitionLock(newHash); + + /* check whether the block already exists on the shared buffer */ + LWLockAcquire(newPartitionLock, LW_SHARED); + buf_id = BufTableLookup(&newTag, newHash); + if (buf_id < 0) + { + BlockNumber segno = blknum / RELSEG_SIZE; + GPUDirectFileDesc *dfile; + + Assert(segno < nvme_sstate->nr_segs); + /* + * We cannot mix up multiple source files in a single PDS chunk. + * If the heap scan crosses a segment boundary, the rest of the + * blocks must be read into the next PDS chunk. + */ + dfile = &nvme_sstate->files[segno]; + if (pds->filedesc.rawfd >= 0 && + pds->filedesc.rawfd != dfile->rawfd) + retval = false; + else + { + if (pds->filedesc.rawfd < 0) + memcpy(&pds->filedesc, dfile, sizeof(GPUDirectFileDesc)); + updatePDSHeapScanBlockState(pds, bstate, blknum); + pds->kds.nitems++; + retval = true; + } + LWLockRelease(newPartitionLock); + return retval; + } + LWLockRelease(newPartitionLock); + } + /* + * Load the source buffer with synchronous read + */ + buffer = ReadBufferExtended(relation, MAIN_FORKNUM, blknum, + RBM_NORMAL, strategy); +#if 1 + /* As in heapgetpage(); however, our typical workload is OLAP, so + * it is uncertain whether we should prune the page here. + */ + heap_page_prune_opt(relation, buffer); +#endif + /* we will check tuple's visibility under the shared lock */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + nr_loaded = pds->kds.nitems - pds->nblocks_uncached; + spage = (Page) BufferGetPage(buffer); + dpage = (Page) KERN_DATA_STORE_BLOCK_PGPAGE(&pds->kds, nr_loaded); + memcpy(dpage, spage, BLCKSZ); + block_nums[nr_loaded] = blknum; + + /* + * The logic is almost the same as heapgetpage(). We have to invalidate + * invisible tuples prior to GPU kernel execution, if the page is not + * all-visible. 
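+ * (The GPU kernel cannot call HeapTupleSatisfiesVisibility(), so dead + * line items are marked unused here on the host side, before the copied + * page is shipped to the device.)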
+ */ + all_visible = PageIsAllVisible(dpage) && !snapshot->takenDuringRecovery; + if (!all_visible) + { + int lines = PageGetMaxOffsetNumber(dpage); + OffsetNumber lineoff; + ItemId lpp; + + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dpage, lineoff); + lineoff <= lines; + lineoff++, lpp++) + { + HeapTupleData tup; + bool valid; + + if (!ItemIdIsNormal(lpp)) + continue; + + tup.t_tableOid = RelationGetRelid(relation); + tup.t_data = (HeapTupleHeader) PageGetItem((Page) dpage, lpp); + tup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&tup.t_self, blknum, lineoff); + + valid = HeapTupleSatisfiesVisibility(&tup, snapshot, buffer); + HeapCheckForSerializableConflictOut(valid, relation, &tup, + buffer, snapshot); + if (!valid) + ItemIdSetUnused(lpp); + } + } + UnlockReleaseBuffer(buffer); + /* now dpage is all-visible, too */ + PageSetAllVisible(dpage); + pds->kds.nitems++; + + return true; +} + +/* + * PDS_exec_heapscan_row - PDS scan for KDS_FORMAT_ROW format + */ +static bool +PDS_exec_heapscan_row(GpuTaskState *gts, pgstrom_data_store *pds) +{ + Relation relation = gts->css.ss.ss_currentRelation; + HeapScanDesc hscan = (HeapScanDesc)gts->css.ss.ss_currentScanDesc; + BlockNumber blknum = hscan->rs_cblock; + Snapshot snapshot = ((TableScanDesc)hscan)->rs_snapshot; + BufferAccessStrategy strategy = hscan->rs_strategy; + kern_data_store *kds = &pds->kds; + Buffer buffer; + Page page; + int lines; + int ntup; + OffsetNumber lineoff; + ItemId lpp; + uint *tup_index; + kern_tupitem *tup_item; + bool all_visible; + Size max_consume; + + /* Load the target buffer */ + buffer = ReadBufferExtended(relation, MAIN_FORKNUM, blknum, + RBM_NORMAL, strategy); +#if 1 + /* As in heapgetpage(); however, our typical workload is OLAP, so + * it is uncertain whether we should prune the page here. + */ + heap_page_prune_opt(relation, buffer); +#endif + /* we will check tuple's visibility under the shared lock */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = (Page) BufferGetPage(buffer); + lines = PageGetMaxOffsetNumber(page); + ntup = 0; + + /* + * Check whether we have enough room to store the expected number of + * tuples in the remaining space. If it is hopeless to load all + * the items of this block, we inform the caller that this block shall + * be loaded onto the next data store. + */ + max_consume = KERN_DATA_STORE_HEAD_LENGTH(kds) + + STROMALIGN(sizeof(cl_uint) * (kds->nitems + lines)) + + offsetof(kern_tupitem, htup) * lines + BLCKSZ + + __kds_unpack(kds->usage); + if (max_consume > kds->length) + { + UnlockReleaseBuffer(buffer); + return false; + } + + /* + * The logic is almost the same as heapgetpage(). + */ + all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; + + /* TODO: make SerializationNeededForRead() an external function + * on the core side. That would remove the need to set up HeapTupleData + * when the page is all-visible and the transaction is not serializable. 
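+ * (In the loop below, HeapTupleData is set up for every line item even + * when the page is all_visible, only because + * HeapCheckForSerializableConflictOut() may need it.)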
 */ + tup_index = KERN_DATA_STORE_ROWINDEX(kds) + kds->nitems; + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(page, lineoff); + lineoff <= lines; + lineoff++, lpp++) + { + HeapTupleData tup; + size_t curr_usage; + bool valid; + + if (!ItemIdIsNormal(lpp)) + continue; + + tup.t_tableOid = RelationGetRelid(relation); + tup.t_data = (HeapTupleHeader) PageGetItem((Page) page, lpp); + tup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&tup.t_self, blknum, lineoff); + + if (all_visible) + valid = true; + else + valid = HeapTupleSatisfiesVisibility(&tup, snapshot, buffer); + + HeapCheckForSerializableConflictOut(valid, relation, + &tup, buffer, snapshot); + if (!valid) + continue; + + /* put tuple */ + curr_usage = (__kds_unpack(kds->usage) + + MAXALIGN(offsetof(kern_tupitem, htup) + tup.t_len)); + tup_item = (kern_tupitem *)((char *)kds + kds->length - curr_usage); + tup_item->rowid = kds->nitems + ntup; + tup_item->t_len = tup.t_len; + memcpy(&tup_item->htup, tup.t_data, tup.t_len); + memcpy(&tup_item->htup.t_ctid, &tup.t_self, sizeof(ItemPointerData)); + + tup_index[ntup++] = __kds_packed((uintptr_t)tup_item - (uintptr_t)kds); + kds->usage = __kds_packed(curr_usage); + } + UnlockReleaseBuffer(buffer); + Assert(ntup <= MaxHeapTuplesPerPage); + Assert(kds->nitems + ntup <= kds->nrooms); + kds->nitems += ntup; + + return true; +} + +/* + * heapscan_report_location + */ +static inline void +heapscan_report_location(HeapScanDesc hscan) +{ +#if PG_VERSION_NUM < 120000 + if (hscan->rs_syncscan) + ss_report_location(hscan->rs_rd, hscan->rs_cblock); +#else + if (hscan->rs_base.rs_flags & SO_ALLOW_SYNC) + ss_report_location(hscan->rs_base.rs_rd, hscan->rs_cblock); +#endif +} + +/* + * pgstromExecHeapScanChunkParallel - read the heap relation by parallel scan + */ +static pgstrom_data_store * +pgstromExecHeapScanChunkParallel(GpuTaskState *gts, + Bitmapset *brin_map, + cl_long brin_range_sz) +{ + GpuTaskSharedState *gtss = gts->gtss; + Relation relation = gts->css.ss.ss_currentRelation; + HeapScanDesc hscan = (HeapScanDesc)gts->css.ss.ss_currentScanDesc; + pgstrom_data_store *pds = NULL; + PDSHeapScanBlockState bstate; + + Assert(gts->css.ss.ss_currentScanDesc->rs_parallel); + memset(&bstate, 0, sizeof(PDSHeapScanBlockState)); + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + if (!hscan->rs_inited) + { + if (hscan->rs_nblocks == 0) + { + /* no blocks to read */ + break; + } + hscan->rs_cblock = InvalidBlockNumber; + hscan->rs_numblocks = 0; /* force to get next blocks */ + hscan->rs_inited = true; + } + else if (hscan->rs_cblock == InvalidBlockNumber) + { + /* end of the scan */ + break; + } + + if (hscan->rs_numblocks == 0) + { + NVMEScanState *nvme_sstate = gts->nvme_sstate; + BlockNumber sync_startpage = InvalidBlockNumber; + cl_long nr_allocated; + cl_long startblock; + cl_long nr_blocks; + cl_long page; + + /* + * MEMO: A key to i/o performance is the consolidation of + * continuous block reads into a small number of system-call + * invocations. The default one-by-one block read logic tends to + * generate i/o request fragmentation under CPU parallel + * execution, which leads to a larger number of submitted read + * commands and a performance slow-down. + * So, in case of NVMe-Strom under CPU parallelism, we advance + * the @scan->rs_cblock pointer by multiple blocks at once. It + * ensures the block numbers to read are continuous, so the i/o + * stack will be able to load the storage blocks with a minimum + * number of DMA requests. 
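+ * (For example, if nblocks_per_chunk were 8192, a worker would claim + * up to 8192 consecutive blocks per pbs_mutex acquisition instead of + * one; the figure is illustrative only.)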
+ */ + if (!nvme_sstate) + nr_blocks = 8; + else if (pds) + { + if (pds->kds.nitems >= pds->kds.nrooms) + break; /* no more room in this PDS */ + nr_blocks = pds->kds.nrooms - pds->kds.nitems; + } + else + nr_blocks = nvme_sstate->nblocks_per_chunk; + + retry_lock: + SpinLockAcquire(&gtss->pbs_mutex); + /* + * If the scan's startblock has not yet been initialized, we must + * do it now. If this is not a synchronized scan, we just start + * at block 0, but if it is a synchronized scan, we must get + * the starting position from the synchronized scan facility. + * We can't hold the spinlock while doing that, though, so release + * the spinlock once, get the information we need, and retry. + * If nobody else has initialized the scan in the meantime, + * we'll fill in the value we fetched on the second time through. + */ + if (gtss->pbs_startblock == InvalidBlockNumber) + { + ParallelTableScanDesc ptscan + = gts->css.ss.ss_currentScanDesc->rs_parallel; + + if (!ptscan->phs_syncscan) + gtss->pbs_startblock = 0; + else if (sync_startpage != InvalidBlockNumber) + gtss->pbs_startblock = sync_startpage; + else + { + SpinLockRelease(&gtss->pbs_mutex); + sync_startpage = ss_get_location(relation, + hscan->rs_nblocks); + goto retry_lock; + } + } + hscan->rs_startblock = startblock = gtss->pbs_startblock; + nr_allocated = gtss->pbs_nallocated; + + if (nr_allocated >= (cl_long)hscan->rs_nblocks) + { + SpinLockRelease(&gtss->pbs_mutex); + hscan->rs_cblock = InvalidBlockNumber; /* end of the scan */ + break; + } + if (nr_allocated + nr_blocks >= (cl_long)hscan->rs_nblocks) + nr_blocks = (cl_long)hscan->rs_nblocks - nr_allocated; + page = (startblock + nr_allocated) % (cl_long)hscan->rs_nblocks; + if (page + nr_blocks >= (cl_long)hscan->rs_nblocks) + nr_blocks = (cl_long)hscan->rs_nblocks - page; + + /* should never read the blocks across segment boundary */ + Assert(nr_blocks > 0 && nr_blocks <= RELSEG_SIZE); + if ((page / RELSEG_SIZE) != (page + nr_blocks - 1) / RELSEG_SIZE) + nr_blocks = RELSEG_SIZE - (page % RELSEG_SIZE); + Assert(nr_blocks > 0); + + if (brin_map) + { + long pos = page / brin_range_sz; + long end = (page + nr_blocks - 1) / brin_range_sz; + long s_page = -1; + long e_page = page + nr_blocks; + + /* find the first valid range */ + while (pos <= end) + { + if (!bms_is_member(pos, brin_map)) + { + s_page = Max(page, pos * brin_range_sz); + break; + } + pos++; + } + + if (s_page < 0) + { + /* Oops, there is no valid range here, so just skip it */ + gts->outer_brin_count += nr_blocks; + nr_allocated += nr_blocks; + nr_blocks = 0; + } + else + { + long prev = page; + /* find the continuous valid ranges */ + Assert(pos <= end); + Assert(!bms_is_member(pos, brin_map)); + while (pos <= end) + { + if (bms_is_member(pos, brin_map)) + { + e_page = Min(e_page, pos * brin_range_sz); + break; + } + pos++; + } + nr_allocated += (e_page - page); + nr_blocks = e_page - s_page; + page = s_page; + gts->outer_brin_count += page - prev; + } + } + else + { + /* otherwise, just walk over the following blocks */ + nr_allocated += nr_blocks; + } + /* update # of blocks already allocated to workers */ + gtss->pbs_nallocated = nr_allocated; + SpinLockRelease(&gtss->pbs_mutex); + + hscan->rs_cblock = page; + hscan->rs_numblocks = nr_blocks; + continue; + } + /* scan next block */ + if (gts->nvme_sstate) + { + /* KDS_FORMAT_BLOCK */ + if (!pds) + { + pds = PDS_create_block(gts->gcontext, + RelationGetDescr(relation), + gts->nvme_sstate); + pds->kds.table_oid = RelationGetRelid(relation); + initPDSHeapScanBlockState(pds, bstate); + 
} + if (!PDS_exec_heapscan_block(gts, pds, &bstate)) + break; + } + else + { + /* KDS_FORMAT_ROW */ + if (!pds) + { + pds = PDS_create_row(gts->gcontext, + RelationGetDescr(relation), + pgstrom_chunk_size()); + pds->kds.table_oid = RelationGetRelid(relation); + } + if (!PDS_exec_heapscan_row(gts, pds)) + break; + } + /* move to the next block */ + hscan->rs_numblocks--; + hscan->rs_cblock++; + if (hscan->rs_cblock >= hscan->rs_nblocks) + hscan->rs_cblock = 0; + heapscan_report_location(hscan); + /* end of the scan? */ + if (hscan->rs_cblock == hscan->rs_startblock) + hscan->rs_cblock = InvalidBlockNumber; + } + /* merge strom_io_vector into the PDS, if KDS_FORMAT_BLOCK */ + if (pds && pds->nblocks_uncached > 0) + mergePDSHeapScanBlockState(pds, &bstate); + + return pds; +} + +/* + * pgstromExecHeapScanChunk + */ +static pgstrom_data_store * +pgstromExecHeapScanChunk(GpuTaskState *gts, + Bitmapset *brin_map, cl_long brin_range_sz) +{ + Relation rel = gts->css.ss.ss_currentRelation; + HeapScanDesc hscan = (HeapScanDesc)gts->css.ss.ss_currentScanDesc; + pgstrom_data_store *pds = NULL; + PDSHeapScanBlockState bstate; + + memset(&bstate, 0, sizeof(PDSHeapScanBlockState)); + for (;;) + { + cl_long page; + + CHECK_FOR_INTERRUPTS(); + + if (!hscan->rs_inited) + { + /* no blocks to read? */ + if (hscan->rs_nblocks == 0) + break; + hscan->rs_cblock = hscan->rs_startblock; + Assert(hscan->rs_numblocks == InvalidBlockNumber); + hscan->rs_inited = true; + } + else if (hscan->rs_cblock == InvalidBlockNumber) + { + /* no more blocks to read */ + break; + } + page = hscan->rs_cblock; + + /* + * If any, check the BRIN-index bitmap, then move to the next range + * boundary if no tuple can match in this range. + */ + if (brin_map) + { + long pos = page / brin_range_sz; + + if (bms_is_member(pos, brin_map)) + { + long prev = page; + + page = (pos + 1) * brin_range_sz; + if (page <= (cl_long)MaxBlockNumber) + hscan->rs_cblock = (BlockNumber)page; + else + hscan->rs_cblock = 0; + gts->outer_brin_count += (page - prev); + goto skip; + } + } + /* scan the next block */ + if (gts->nvme_sstate) + { + if (!pds) + { + pds = PDS_create_block(gts->gcontext, + RelationGetDescr(rel), + gts->nvme_sstate); + pds->kds.table_oid = RelationGetRelid(rel); + initPDSHeapScanBlockState(pds, bstate); + } + if (!PDS_exec_heapscan_block(gts, pds, &bstate)) + break; + } + else + { + if (!pds) + { + pds = PDS_create_row(gts->gcontext, + RelationGetDescr(rel), + pgstrom_chunk_size()); + pds->kds.table_oid = RelationGetRelid(rel); + } + if (!PDS_exec_heapscan_row(gts, pds)) + break; + } + /* move to the next block */ + hscan->rs_cblock++; + skip: + if (hscan->rs_cblock >= hscan->rs_nblocks) + hscan->rs_cblock = 0; + Assert(hscan->rs_numblocks == InvalidBlockNumber); + heapscan_report_location(hscan); + /* end of the scan? 
*/ + if (hscan->rs_cblock == hscan->rs_startblock) + hscan->rs_cblock = InvalidBlockNumber; + } + /* merge strom_io_vector into the PDS, if any */ + if (pds && pds->nblocks_uncached > 0) + mergePDSHeapScanBlockState(pds, &bstate); + + /* PDS is valid, or end of the relation */ + Assert(pds || !BlockNumberIsValid(hscan->rs_cblock)); + + return pds; +} + +/* + * pgstromExecScanChunk - read the relation by one chunk + */ +pgstrom_data_store * +pgstromExecScanChunk(GpuTaskState *gts) +{ + Relation rel = gts->css.ss.ss_currentRelation; + TableScanDesc tscan = gts->css.ss.ss_currentScanDesc; + Bitmapset *brin_map; + cl_long brin_range_sz = 0; + pgstrom_data_store *pds = NULL; + + /* + * Set up the scan-descriptor if the scan is not parallel, or if we're + * serially executing a scan that was intended to be parallel. + */ + if (!tscan) + { + EState *estate = gts->css.ss.ps.state; + + if (!gts->gtss) + tscan = table_beginscan(rel, estate->es_snapshot, 0, NULL); + else + tscan = table_beginscan_parallel(rel, &gts->gtss->phscan); + + gts->css.ss.ss_currentScanDesc = tscan; + /* + * Try to choose NVMe-Strom, if the relation is deployed on a + * supported tablespace and the expected total i/o size is + * sufficiently larger than a cache-only scan. + */ + PDS_init_heapscan_state(gts); + } + InstrStartNode(&gts->outer_instrument); + /* Load the BRIN-index bitmap, if any */ + if (gts->outer_index_state) + pgstromExecGetBrinIndexMap(gts); + brin_map = gts->outer_index_map; + if (brin_map) + brin_range_sz = gts->outer_index_state->range_sz; + + if (gts->gtss) + pds = pgstromExecHeapScanChunkParallel(gts, brin_map, brin_range_sz); + else + pds = pgstromExecHeapScanChunk(gts, brin_map, brin_range_sz); + + if (pds) + { + if (pds->kds.nitems == 0) + { + /* empty result */ + PDS_release(pds); + pds = NULL; + } + else if (pds->kds.format == KDS_FORMAT_BLOCK && + pds->kds.nitems < pds->kds.nrooms && + pds->nblocks_uncached > 0) + { + /* + * MEMO: Special case handling if KDS_FORMAT_BLOCK was not filled + * up entirely. KDS_FORMAT_BLOCK has an array of block-numbers to + * support the "ctid" system column, located next to the KDS-head. + * Block-numbers of pre-loaded blocks (hit on shared buffer) are + * used from the head, and the others (to be read from the file) + * are used from the tail. If nitems < nrooms, this array has a + * hole in the middle. + * So, we have to move the latter half of the array to close the + * hole and make the array flat. 
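+ * (For example, with nrooms=100, nitems=70 and nblocks_uncached=20, + * entries 0-49 hold the pre-loaded blocks and entries 80-99 the + * uncached ones; the memmove below shifts the tail 20 entries up to + * 50-69. The figures are illustrative only.)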
+ */ + BlockNumber *block_nums + = (BlockNumber *)KERN_DATA_STORE_BODY(&pds->kds); + + memmove(block_nums + (pds->kds.nitems - pds->nblocks_uncached), + block_nums + (pds->kds.nrooms - pds->nblocks_uncached), + sizeof(BlockNumber) * pds->nblocks_uncached); + } + } + /* update statistics */ + if (pds) + { + if (pds->kds.format == KDS_FORMAT_BLOCK) + gts->nvme_count += pds->nblocks_uncached; + InstrStopNode(&gts->outer_instrument, (double)pds->kds.nitems); + } + else + { + InstrStopNode(&gts->outer_instrument, 0.0); + } + return pds; +} + +/* + * pgstromRewindScanChunk + */ +void +pgstromRewindScanChunk(GpuTaskState *gts) +{ + TableScanDesc tscan = gts->css.ss.ss_currentScanDesc; + + InstrEndLoop(&gts->outer_instrument); + if (tscan) + { + table_rescan(tscan, NULL); + ExecScanReScan(&gts->css.ss); + } +} + +/* + * pgstromExplainOuterScan + */ +void +pgstromExplainOuterScan(GpuTaskState *gts, + List *deparse_context, + List *ancestors, + ExplainState *es, + List *outer_quals, + Cost outer_startup_cost, + Cost outer_total_cost, + double outer_plan_rows, + int outer_plan_width) +{ + Plan *plannode = gts->css.ss.ps.plan; + Index scanrelid = ((Scan *) plannode)->scanrelid; + Instrumentation *instrument = &gts->outer_instrument; + RangeTblEntry *rte; + const char *refname; + const char *relname; + const char *nspname = NULL; + StringInfoData str; + + /* Does this GpuTaskState have an outer simple scan? */ + if (scanrelid == 0) + return; + + /* + * See the logic in ExplainTargetRel() + */ + rte = rt_fetch(scanrelid, es->rtable); + Assert(rte->rtekind == RTE_RELATION); + refname = (char *) list_nth(es->rtable_names, scanrelid - 1); + if (!refname) + refname = rte->eref->aliasname; + relname = get_rel_name(rte->relid); + if (es->verbose) + nspname = get_namespace_name(get_rel_namespace(rte->relid)); + + initStringInfo(&str); + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (nspname != NULL) + appendStringInfo(&str, "%s.%s", + quote_identifier(nspname), + quote_identifier(relname)); + else if (relname) + appendStringInfo(&str, "%s", + quote_identifier(relname)); + if (!relname || strcmp(refname, relname) != 0) + { + if (str.len > 0) + appendStringInfoChar(&str, ' '); + appendStringInfo(&str, "%s", refname); + } + } + else + { + ExplainPropertyText("Outer Scan Relation", relname, es); + if (nspname) + ExplainPropertyText("Outer Scan Schema", nspname, es); + ExplainPropertyText("Outer Scan Alias", refname, es); + } + + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(&str, " (cost=%.2f..%.2f rows=%.0f width=%d)", + outer_startup_cost, + outer_total_cost, + outer_plan_rows, + outer_plan_width); + else + { + ExplainPropertyFloat("Outer Startup Cost", + NULL, outer_startup_cost, 2, es); + ExplainPropertyFloat("Outer Total Cost", + NULL, outer_total_cost, 2, es); + ExplainPropertyFloat("Outer Plan Rows", + NULL, outer_plan_rows, 0, es); + ExplainPropertyFloat("Outer Plan Width", + NULL, outer_plan_width, 0, es); + } + } + + /* + * We have to forcibly clean up the instrumentation state because we + * haven't done ExecutorEnd yet. This is pretty grotty ... 
+ * See the comment in ExplainNode() + */ + InstrEndLoop(instrument); + + if (es->analyze && instrument->nloops > 0) + { + double nloops = instrument->nloops; + double startup_sec = 1000.0 * instrument->startup / nloops; + double total_sec = 1000.0 * instrument->total / nloops; + double rows = instrument->ntuples / nloops; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (es->timing) + appendStringInfo( + &str, + " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", + startup_sec, total_sec, rows, nloops); + else + appendStringInfo( + &str, + " (actual rows=%.0f loops=%.0f)", + rows, nloops); + } + else + { + if (es->timing) + { + ExplainPropertyFloat("Outer Actual Startup Time", + NULL, startup_sec, 3, es); + ExplainPropertyFloat("Outer Actual Total Time", + NULL, total_sec, 3, es); + } + ExplainPropertyFloat("Outer Actual Rows", NULL, rows, 0, es); + ExplainPropertyFloat("Outer Actual Loops", NULL, nloops, 0, es); + } + } + else if (es->analyze) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoString(&str, " (never executed)"); + else + { + if (es->timing) + { + ExplainPropertyFloat("Outer Actual Startup Time", + NULL, 0.0, 3, es); + ExplainPropertyFloat("Outer Actual Total Time", + NULL, 0.0, 3, es); + } + ExplainPropertyFloat("Outer Actual Rows", + NULL, 0.0, 0, es); + ExplainPropertyFloat("Outer Actual Loops", + NULL, 0.0, 0, es); + } + } + if (es->format == EXPLAIN_FORMAT_TEXT) + ExplainPropertyText("Outer Scan", str.data, es); + + if (outer_quals) + { + Expr *quals_expr; + char *temp; + + quals_expr = make_ands_explicit(outer_quals); + temp = deparse_expression((Node *)quals_expr, + deparse_context, + es->verbose, false); + ExplainPropertyText("Outer Scan Filter", temp, es); + + if (gts->outer_instrument.nfiltered1 > 0.0) + ExplainPropertyFloat("Rows Removed by Outer Scan Filter", + NULL, + gts->outer_instrument.nfiltered1 / + gts->outer_instrument.nloops, + 0, es); + } + /* properties of BRIN-index */ + pgstromExplainBrinIndexMap(gts, es, deparse_context); +} + +/* + * pgstrom_init_relscan + */ +void +pgstrom_init_relscan(void) +{ + static char *nvme_manual_distance_map = NULL; + char buffer[1280]; + int index = 0; + + /* pg_strom.enable_brin */ + DefineCustomBoolVariable("pg_strom.enable_brin", + "Enables use of BRIN-index", + NULL, + &pgstrom_enable_brin, + true, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* + * pg_strom.nvme_distance_map + * + * config := <token>[,...] 
+ * token := nvmeXX:gpuXX + * + * eg) nvme0:gpu0,nvme1:gpu1 + */ + DefineCustomStringVariable("pg_strom.nvme_distance_map", + "Manual configuration of optimal GPU for each NVME", + NULL, + &nvme_manual_distance_map, + NULL, + PGC_POSTMASTER, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + extraSysfsSetupDistanceMap(nvme_manual_distance_map); + while (extraSysfsPrintNvmeInfo(index, buffer, sizeof(buffer)) >= 0) + { + elog(LOG, "- %s", buffer); + index++; + } + + /* hash table for tablespace <-> optimal GPU */ + tablespace_optimal_gpu_htable = NULL; + CacheRegisterSyscacheCallback(TABLESPACEOID, + tablespace_optimal_gpu_cache_callback, + (Datum) 0); +} diff --git a/src/shmbuf.c b/old/shmbuf.c similarity index 100% rename from src/shmbuf.c rename to old/shmbuf.c diff --git a/sql/pg_strom--2.2--2.3.sql b/old/sql/pg_strom--2.2--2.3.sql similarity index 100% rename from sql/pg_strom--2.2--2.3.sql rename to old/sql/pg_strom--2.2--2.3.sql diff --git a/sql/pg_strom--2.2.sql b/old/sql/pg_strom--2.2.sql similarity index 100% rename from sql/pg_strom--2.2.sql rename to old/sql/pg_strom--2.2.sql diff --git a/sql/pg_strom--2.3--3.0.sql b/old/sql/pg_strom--2.3--3.0.sql similarity index 100% rename from sql/pg_strom--2.3--3.0.sql rename to old/sql/pg_strom--2.3--3.0.sql diff --git a/sql/pg_strom--3.0--4.0.sql b/old/sql/pg_strom--3.0--4.0.sql similarity index 100% rename from sql/pg_strom--3.0--4.0.sql rename to old/sql/pg_strom--3.0--4.0.sql diff --git a/sql/pg_strom--3.0.sql b/old/sql/pg_strom--3.0.sql similarity index 100% rename from sql/pg_strom--3.0.sql rename to old/sql/pg_strom--3.0.sql diff --git a/next/tinyint.c b/old/tinyint.c similarity index 99% rename from next/tinyint.c rename to old/tinyint.c index 6fcc47211..3df806f51 100644 --- a/next/tinyint.c +++ b/old/tinyint.c @@ -3,8 +3,8 @@ * * 8bit-width integer data type support * ---- - * Copyright 2011-2023 (C) KaiGai Kohei - * Copyright 2014-2023 (C) PG-Strom Developers Team + * Copyright 2011-2021 (C) KaiGai Kohei + * Copyright 2014-2021 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. diff --git a/src/Makefile b/src/Makefile index e7b608198..5fadbf75a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,4 +1,132 @@ +# +# PG-Strom Makefile +# +PG_CONFIG ?= pg_config + ifndef STROM_BUILD_ROOT -STROM_BUILD_ROOT = .. +STROM_BUILD_ROOT=. 
endif -include $(STROM_BUILD_ROOT)/Makefile + +# +# PG-Strom version +# +PGSTROM_VERSION := 5.0 +PGSTROM_RELEASE := devel + +# +# Source of PG-Strom host code +# +__STROM_OBJS = main.o extra.o codegen.o misc.o executor.o \ + gpu_device.o gpu_scan.o gpu_join.o gpu_preagg.o \ + dpu_device.o dpu_scan.o dpu_join.o dpu_preagg.o \ + relscan.o brin.o gpu_service.o \ + arrow_fdw.o arrow_nodes.o \ + pcie.o float2.o tinyint.o aggfuncs.o +STROM_OBJS = $(addprefix $(STROM_BUILD_ROOT)/,$(__STROM_OBJS)) + +GPU_DEVATTRS_H = $(STROM_BUILD_ROOT)/gpu_devattrs.h +GENERATED-HEADERS = $(GPU_DEVATTRS_H) + +# +# Source of NVIDIA GPU device code +# +include $(STROM_BUILD_ROOT)/Makefile.cuda +__CUDA_OBJS = xpu_common cuda_gpuscan cuda_gpujoin cuda_gpupreagg \ + xpu_basetype xpu_numeric xpu_timelib xpu_textlib xpu_misclib +__CUDA_HEADERS = cuda_common.h xpu_common.h xpu_opcodes.h xpu_basetype.h \ + xpu_numeric.h xpu_textlib.h xpu_timelib.h xpu_misclib.h +__CUDA_OPT_OBJS = $(addsuffix .fatbin,$(__CUDA_OBJS)) +__CUDA_DBG_OBJS = $(addsuffix .debug.fatbin,$(__CUDA_OBJS)) +CUDA_HEADERS = $(addprefix $(STROM_BUILD_ROOT)/,$(__CUDA_HEADERS)) +CUDA_OPT_OBJS = $(addprefix $(STROM_BUILD_ROOT)/,$(__CUDA_OPT_OBJS)) +CUDA_DBG_OBJS = $(addprefix $(STROM_BUILD_ROOT)/,$(__CUDA_DBG_OBJS)) +CUDA_OPT_MODULE = $(STROM_BUILD_ROOT)/pgstrom-core.fatbin +CUDA_DBG_MODULE = $(STROM_BUILD_ROOT)/pgstrom-core.debug.fatbin + + +# +# Installation Scripts +# +__STROM_SQL = pg_strom--5.0.sql +STROM_SQL = $(addprefix $(STROM_BUILD_ROOT)/sql/,$(__STROM_SQL)) + +# +# GitHash to build +# +ifdef PGSTROM_GITHASH +ifeq ($(PGSTROM_GITHASH),HEAD) +PGSTROM_GITHASH = $(shell git rev-parse HEAD) +endif +else +ifeq ($(shell test -e $(STROM_BUILD_ROOT)/.git/config && echo -n 1),1) +PGSTROM_GITHASH = $(shell git rev-parse HEAD) +ifneq ($(shell git diff | wc -l),0) +PGSTROM_GITHASH_SUFFIX = ::local_changes +endif +else +ifeq ($(shell test -e $(STROM_BUILD_ROOT)/GITHASH && echo -n 1),1) +PGSTROM_GITHASH = $(shell cat $(STROM_BUILD_ROOT)/GITHASH) +else +PGSTROM_GITHASH = HEAD +endif +endif +endif + +# +# Flags to build +# +PGSTROM_FLAGS += $(PGSTROM_FLAGS_CUSTOM) +PGSTROM_FLAGS += -D__PGSTROM_MODULE__=1 +PGSTROM_FLAGS += "-DPGSTROM_VERSION=\"$(PGSTROM_VERSION)\"" + +PGSTROM_DEBUG = 1 +ifeq ($(PGSTROM_DEBUG),1) +PGSTROM_FLAGS += -g -O0 -DPGSTROM_DEBUG_BUILD=1 +endif +PGSTROM_FLAGS += -D__STROM_HOST__=1 +ifeq ($(shell uname -m),aarch64) +PGSTROM_FLAGS += -DHAVE_FLOAT2 -mfp16-format=ieee +endif +PGSTROM_FLAGS += -DPGSTROM_GITHASH=\"$(PGSTROM_GITHASH)$(PGSTROM_GITHASH_SUFFIX)\" +PGSTROM_FLAGS += -DPGSHAREDIR=\"$(shell $(PG_CONFIG) --sharedir)\" +PGSTROM_FLAGS += -DCUDA_MAXREGCOUNT=$(MAXREGCOUNT) +PGSTROM_FLAGS += -DCMD_GPUINFO_PATH=\"$(shell $(PG_CONFIG) --bindir)/gpuinfo\" +PGSTROM_FLAGS += -DCUDA_BUILTIN_OBJS="\"$(__CUDA_OBJS)\"" +PG_CPPFLAGS := $(PGSTROM_FLAGS) -I $(CUDA_IPATH) +SHLIB_LINK := -L $(CUDA_LPATH) -lcuda + +# +# Definition of PG-Strom Extension +# +MODULE_big = pg_strom +MODULEDIR = pg_strom +DATA = $(STROM_SQL) +OBJS = $(STROM_OBJS) +DATA_built = $(CUDA_OPT_OBJS) $(CUDA_DBG_OBJS) +EXTRA_CLEAN = $(DATA_built) $(GENERATED-HEADERS) +EXTENSION = pg_strom + +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) + +# +# Device Attributes +# +submake-generated-headers: $(GENERATED-HEADERS) + +$(GPU_DEVATTRS_H): $(CUDA_IPATH)/cuda.h + cat $(CUDA_IPATH)/cuda.h | \ + grep -E '^[ ]+CU_DEVICE_ATTRIBUTE_' | \ + grep -v -E 'CU_DEVICE_ATTRIBUTE_MAX$$' | \ + grep -v 'Deprecated[,\.]' | \ + sed -e 's|[ ]*CU_DEVICE_ATTRIBUTE_|DEV_ATTR(|g' \ + -e 's| =.*/\*\*<[ ]*|, "|g' \ 
+ -e 's|[ ]*\*/|")|g' > $@ + +# +# GPU Device Code +# +%.fatbin: %.cu $(CUDA_HEADERS) + $(NVCC) $(NVCC_FLAGS) -o $@ $< +%.debug.fatbin: %.cu $(CUDA_HEADERS) + $(NVCC) $(NVCC_DEBUG_FLAGS) -o $@ $< diff --git a/next/Makefile.cuda b/src/Makefile.cuda similarity index 100% rename from next/Makefile.cuda rename to src/Makefile.cuda diff --git a/src/aggfuncs.c b/src/aggfuncs.c index 7cc812141..a09def20f 100644 --- a/src/aggfuncs.c +++ b/src/aggfuncs.c @@ -3,72 +3,92 @@ * * Definition of self-defined aggregate functions, used by GpuPreAgg * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. */ #include "pg_strom.h" -#include "cuda_numeric.h" +#include "float2.h" /* - * declarations + * Functions Declaration */ PG_FUNCTION_INFO_V1(pgstrom_partial_nrows); -PG_FUNCTION_INFO_V1(pgstrom_partial_avg_int8); -PG_FUNCTION_INFO_V1(pgstrom_partial_avg_float8); -PG_FUNCTION_INFO_V1(pgstrom_final_avg_int8_accum); -PG_FUNCTION_INFO_V1(pgstrom_final_avg_int8_final); -PG_FUNCTION_INFO_V1(pgstrom_final_avg_float8_accum); -PG_FUNCTION_INFO_V1(pgstrom_final_avg_float8_final); -PG_FUNCTION_INFO_V1(pgstrom_final_avg_numeric_final); -PG_FUNCTION_INFO_V1(pgstrom_partial_min_any); -PG_FUNCTION_INFO_V1(pgstrom_partial_max_any); -PG_FUNCTION_INFO_V1(pgstrom_partial_sum_any); -PG_FUNCTION_INFO_V1(pgstrom_partial_sum_x2_float4); -PG_FUNCTION_INFO_V1(pgstrom_partial_sum_x2_float8); -PG_FUNCTION_INFO_V1(pgstrom_partial_sum_x2_numeric); -PG_FUNCTION_INFO_V1(pgstrom_partial_cov_x); -PG_FUNCTION_INFO_V1(pgstrom_partial_cov_y); -PG_FUNCTION_INFO_V1(pgstrom_partial_cov_x2); -PG_FUNCTION_INFO_V1(pgstrom_partial_cov_y2); -PG_FUNCTION_INFO_V1(pgstrom_partial_cov_xy); -PG_FUNCTION_INFO_V1(pgstrom_partial_variance_float8); -PG_FUNCTION_INFO_V1(pgstrom_partial_covariance_float8); -PG_FUNCTION_INFO_V1(pgstrom_float8_combine); -PG_FUNCTION_INFO_V1(pgstrom_float8_stddev_samp); -PG_FUNCTION_INFO_V1(pgstrom_float8_stddev_pop); -PG_FUNCTION_INFO_V1(pgstrom_float8_stddev_samp_numeric); -PG_FUNCTION_INFO_V1(pgstrom_float8_stddev_pop_numeric); -PG_FUNCTION_INFO_V1(pgstrom_float8_var_samp); -PG_FUNCTION_INFO_V1(pgstrom_float8_var_pop); -PG_FUNCTION_INFO_V1(pgstrom_float8_var_samp_numeric); -PG_FUNCTION_INFO_V1(pgstrom_float8_var_pop_numeric); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_combine); -PG_FUNCTION_INFO_V1(pgstrom_float8_corr); -PG_FUNCTION_INFO_V1(pgstrom_float8_covar_pop); -PG_FUNCTION_INFO_V1(pgstrom_float8_covar_samp); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_avgx); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_avgy); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_intercept); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_r2); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_slope); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_sxx); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_sxy); -PG_FUNCTION_INFO_V1(pgstrom_float8_regr_syy); -PG_FUNCTION_INFO_V1(pgstrom_hll_sketch_new); -PG_FUNCTION_INFO_V1(pgstrom_hll_sketch_merge); -PG_FUNCTION_INFO_V1(pgstrom_hll_count_final); -PG_FUNCTION_INFO_V1(pgstrom_hll_sketch_histogram); - -/* utility to reference numeric[] */ -static inline Datum -numeric_array_ref(ArrayType *array, int index, bool *p_isnull) -{ - return array_ref(array, 1, &index, -1, -1, false, 'i', p_isnull); + +PG_FUNCTION_INFO_V1(pgstrom_partial_minmax_int32); 
+PG_FUNCTION_INFO_V1(pgstrom_partial_minmax_int64); +PG_FUNCTION_INFO_V1(pgstrom_partial_minmax_fp32); +PG_FUNCTION_INFO_V1(pgstrom_partial_minmax_fp64); +PG_FUNCTION_INFO_V1(pgstrom_fmin_trans_int64); +PG_FUNCTION_INFO_V1(pgstrom_fmin_trans_fp64); +PG_FUNCTION_INFO_V1(pgstrom_fmax_trans_int64); +PG_FUNCTION_INFO_V1(pgstrom_fmax_trans_fp64); +PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_int8); +PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_int16); +PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_int32); +PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_int64); +PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_fp16); +PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_fp32); +PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_fp64); +PG_FUNCTION_INFO_V1(pgstrom_fminmax_final_numeric); + +PG_FUNCTION_INFO_V1(pgstrom_partial_sum_asis); + +PG_FUNCTION_INFO_V1(pgstrom_partial_avg_int); +PG_FUNCTION_INFO_V1(pgstrom_partial_avg_fp); +PG_FUNCTION_INFO_V1(pgstrom_favg_trans_int); +PG_FUNCTION_INFO_V1(pgstrom_favg_trans_fp); +PG_FUNCTION_INFO_V1(pgstrom_favg_final_int); +PG_FUNCTION_INFO_V1(pgstrom_favg_final_fp); +PG_FUNCTION_INFO_V1(pgstrom_favg_final_num); + +PG_FUNCTION_INFO_V1(pgstrom_partial_variance); +PG_FUNCTION_INFO_V1(pgstrom_stddev_trans); +PG_FUNCTION_INFO_V1(pgstrom_stddev_samp_final); +PG_FUNCTION_INFO_V1(pgstrom_stddev_sampf_final); +PG_FUNCTION_INFO_V1(pgstrom_stddev_pop_final); +PG_FUNCTION_INFO_V1(pgstrom_stddev_popf_final); +PG_FUNCTION_INFO_V1(pgstrom_var_samp_final); +PG_FUNCTION_INFO_V1(pgstrom_var_sampf_final); +PG_FUNCTION_INFO_V1(pgstrom_var_pop_final); +PG_FUNCTION_INFO_V1(pgstrom_var_popf_final); + +PG_FUNCTION_INFO_V1(pgstrom_partial_covar); +PG_FUNCTION_INFO_V1(pgstrom_covar_accum); +PG_FUNCTION_INFO_V1(pgstrom_covar_samp_final); +PG_FUNCTION_INFO_V1(pgstrom_covar_pop_final); + +PG_FUNCTION_INFO_V1(pgstrom_regr_avgx_final); +PG_FUNCTION_INFO_V1(pgstrom_regr_avgy_final); +PG_FUNCTION_INFO_V1(pgstrom_regr_count_final); +PG_FUNCTION_INFO_V1(pgstrom_regr_intercept_final); +PG_FUNCTION_INFO_V1(pgstrom_regr_r2_final); +PG_FUNCTION_INFO_V1(pgstrom_regr_slope_final); +PG_FUNCTION_INFO_V1(pgstrom_regr_sxx_final); +PG_FUNCTION_INFO_V1(pgstrom_regr_sxy_final); +PG_FUNCTION_INFO_V1(pgstrom_regr_syy_final); + +/* + * float8 validator + */ +static inline void +check_float8_value(float8 value, bool inf_is_valid, bool zero_is_valid) +{ + if (isinf(value) && !inf_is_valid) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: overflow"))); + if (value == 0.0 && !zero_is_valid) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: underflow"))); } +/* + * NROWS + */ Datum pgstrom_partial_nrows(PG_FUNCTION_ARGS) { @@ -82,864 +102,661 @@ pgstrom_partial_nrows(PG_FUNCTION_ARGS) PG_RETURN_INT64(1); } +/* + * MIN(X) and MAX(X) functions + */ Datum -pgstrom_partial_avg_int8(PG_FUNCTION_ARGS) +pgstrom_partial_minmax_int64(PG_FUNCTION_ARGS) { - ArrayType *result; - Datum items[2]; - - items[0] = PG_GETARG_DATUM(0); /* nrows(int8) */ - items[1] = PG_GETARG_DATUM(1); /* p_sum(int8) */ - result = construct_array(items, 2, INT8OID, - sizeof(int64), FLOAT8PASSBYVAL, 'd'); - PG_RETURN_ARRAYTYPE_P(result); -} + kagg_state__pminmax_int64_packed *r; -Datum -pgstrom_partial_avg_float8(PG_FUNCTION_ARGS) -{ - int64 nrows = PG_GETARG_INT64(0); - ArrayType *result; - Datum items[2]; + r = palloc(sizeof(kagg_state__pminmax_int64_packed)); + r->nitems = 1; + r->value = PG_GETARG_INT64(0); + SET_VARSIZE(r, sizeof(kagg_state__pminmax_int64_packed)); - items[0] = 
Float8GetDatum((float8)nrows); - items[1] = PG_GETARG_DATUM(1); /* p_sum(float8) */ - result = construct_array(items, 2, FLOAT8OID, - sizeof(float8), FLOAT8PASSBYVAL, 'd'); - PG_RETURN_ARRAYTYPE_P(result); + PG_RETURN_POINTER(r); } Datum -pgstrom_final_avg_int8_accum(PG_FUNCTION_ARGS) +pgstrom_partial_minmax_fp64(PG_FUNCTION_ARGS) { - MemoryContext aggcxt; - MemoryContext oldcxt; - ArrayType *xarray; - ArrayType *yarray; - int64 *x, *y; - - if (!AggCheckCallContext(fcinfo, &aggcxt)) - elog(ERROR, "aggregate function called in non-aggregate context"); - if (PG_ARGISNULL(1)) - elog(ERROR, "Null state was supplied"); + kagg_state__pminmax_fp64_packed *r; - if (PG_ARGISNULL(0)) - { - oldcxt = MemoryContextSwitchTo(aggcxt); - xarray = PG_GETARG_ARRAYTYPE_P_COPY(1); - MemoryContextSwitchTo(oldcxt); - } - else - { - xarray = PG_GETARG_ARRAYTYPE_P(0); - yarray = PG_GETARG_ARRAYTYPE_P(1); - x = (int64 *)ARR_DATA_PTR(xarray); - y = (int64 *)ARR_DATA_PTR(yarray); + r = palloc(sizeof(kagg_state__pminmax_fp64_packed)); + r->nitems = 1; + r->value = PG_GETARG_FLOAT8(0); + SET_VARSIZE(r, sizeof(kagg_state__pminmax_fp64_packed)); - x[0] += y[0]; - x[1] += y[1]; - } - PG_RETURN_POINTER(xarray); + PG_RETURN_POINTER(r); } -Datum -pgstrom_final_avg_int8_final(PG_FUNCTION_ARGS) -{ - ArrayType *xarray = PG_GETARG_ARRAYTYPE_P(0); - int64 *x = (int64 *)ARR_DATA_PTR(xarray); - - return DirectFunctionCall2(numeric_div, - DirectFunctionCall1(int8_numeric, - Int64GetDatum(x[1])), - DirectFunctionCall1(int8_numeric, - Int64GetDatum(x[0]))); -} +#define __MINMAX_TRANS_TEMPLATE(TYPE,OPER) \ + kagg_state__pminmax_##TYPE##_packed *state; \ + kagg_state__pminmax_##TYPE##_packed *arg; \ + MemoryContext aggcxt; \ + \ + if (!AggCheckCallContext(fcinfo, &aggcxt)) \ + elog(ERROR, "aggregate function called in non-aggregate context"); \ + if (PG_ARGISNULL(0)) \ + { \ + if (PG_ARGISNULL(1)) \ + PG_RETURN_NULL(); \ + arg = (kagg_state__pminmax_##TYPE##_packed *) \ + PG_GETARG_BYTEA_P(1); \ + state = MemoryContextAlloc(aggcxt, sizeof(*state)); \ + memcpy(state, arg, sizeof(*state)); \ + } \ + else \ + { \ + state = (kagg_state__pminmax_##TYPE##_packed *) \ + PG_GETARG_BYTEA_P(0); \ + if (!PG_ARGISNULL(1)) \ + { \ + arg = (kagg_state__pminmax_##TYPE##_packed *) \ + PG_GETARG_BYTEA_P(1); \ + if (arg->nitems > 0) \ + { \ + if (state->nitems == 0) \ + memcpy(state, arg, sizeof(*state)); \ + else \ + state->value = OPER(state->value, arg->value); \ + } \ + } \ + } \ + PG_RETURN_POINTER(state); Datum -pgstrom_final_avg_float8_accum(PG_FUNCTION_ARGS) +pgstrom_fmin_trans_int64(PG_FUNCTION_ARGS) { - MemoryContext aggcxt; - MemoryContext oldcxt; - ArrayType *xarray; - ArrayType *yarray; - float8 *x, *y; - - if (!AggCheckCallContext(fcinfo, &aggcxt)) - elog(ERROR, "aggregate function called in non-aggregate context"); - if (PG_ARGISNULL(1)) - elog(ERROR, "Null state was supplied"); - - if (PG_ARGISNULL(0)) - { - oldcxt = MemoryContextSwitchTo(aggcxt); - xarray = PG_GETARG_ARRAYTYPE_P_COPY(1); - MemoryContextSwitchTo(oldcxt); - } - else - { - xarray = PG_GETARG_ARRAYTYPE_P(0); - yarray = PG_GETARG_ARRAYTYPE_P(1); - x = (float8 *)ARR_DATA_PTR(xarray); - y = (float8 *)ARR_DATA_PTR(yarray); - - x[0] += y[0]; - x[1] += y[1]; - } - PG_RETURN_POINTER(xarray); + __MINMAX_TRANS_TEMPLATE(int64,Min); } Datum -pgstrom_final_avg_float8_final(PG_FUNCTION_ARGS) +pgstrom_fmin_trans_fp64(PG_FUNCTION_ARGS) { - ArrayType *xarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *x = (float8 *)ARR_DATA_PTR(xarray); - - PG_RETURN_FLOAT8(x[1] / x[0]); + 
__MINMAX_TRANS_TEMPLATE(fp64,Min); } Datum -pgstrom_final_avg_numeric_final(PG_FUNCTION_ARGS) +pgstrom_fmax_trans_int64(PG_FUNCTION_ARGS) { - ArrayType *xarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *x = (float8 *)ARR_DATA_PTR(xarray); - Datum nrows, sum; - - nrows = DirectFunctionCall1(float8_numeric, Float8GetDatum(x[0])); - sum = DirectFunctionCall1(float8_numeric, Float8GetDatum(x[1])); - - return DirectFunctionCall2(numeric_div, sum, nrows); + __MINMAX_TRANS_TEMPLATE(int64,Max); } -/* - * pgstrom.pmin(anyelement) - */ Datum -pgstrom_partial_min_any(PG_FUNCTION_ARGS) +pgstrom_fmax_trans_fp64(PG_FUNCTION_ARGS) { - PG_RETURN_DATUM(PG_GETARG_DATUM(0)); + __MINMAX_TRANS_TEMPLATE(fp64,Max); } -/* - * pgstrom.pmax(anyelement) - */ Datum -pgstrom_partial_max_any(PG_FUNCTION_ARGS) +pgstrom_fminmax_final_int8(PG_FUNCTION_ARGS) { - PG_RETURN_DATUM(PG_GETARG_DATUM(0)); + kagg_state__pminmax_int64_packed *state + = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) + PG_RETURN_NULL(); + if (state->value < SCHAR_MIN || state->value > SCHAR_MAX) + elog(ERROR, "min(int8) out of range"); + PG_RETURN_INT32(state->value); } -/* - * pgstrom.psum(anyelement) - */ Datum -pgstrom_partial_sum_any(PG_FUNCTION_ARGS) +pgstrom_fminmax_final_int16(PG_FUNCTION_ARGS) { - PG_RETURN_DATUM(PG_GETARG_DATUM(0)); + kagg_state__pminmax_int64_packed *state + = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) + PG_RETURN_NULL(); + if (state->value < SHRT_MIN || state->value > SHRT_MAX) + elog(ERROR, "min(int16) out of range"); + PG_RETURN_INT32(state->value); } -/* - * pgstrom.psum_x2(float4) - */ Datum -pgstrom_partial_sum_x2_float4(PG_FUNCTION_ARGS) +pgstrom_fminmax_final_int32(PG_FUNCTION_ARGS) { - float4 value = (PG_ARGISNULL(0) ? 0.0 : PG_GETARG_FLOAT4(0)); - - PG_RETURN_FLOAT4(value * value); + kagg_state__pminmax_int64_packed *state + = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) + PG_RETURN_NULL(); + if (state->value < INT_MIN || state->value > INT_MAX) + elog(ERROR, "min(int32) out of range"); + PG_RETURN_INT32(state->value); } -/* - * pgstrom.psum_x2(float8) - */ Datum -pgstrom_partial_sum_x2_float8(PG_FUNCTION_ARGS) +pgstrom_fminmax_final_int64(PG_FUNCTION_ARGS) { - float8 value = (PG_ARGISNULL(0) ? 
0.0 : PG_GETARG_FLOAT8(0)); - - PG_RETURN_FLOAT8(value * value); + kagg_state__pminmax_int64_packed *state + = (kagg_state__pminmax_int64_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) + PG_RETURN_NULL(); + PG_RETURN_INT64(state->value); } -/* - * pgstrom.psum_x2(numeric) - */ Datum -pgstrom_partial_sum_x2_numeric(PG_FUNCTION_ARGS) +pgstrom_fminmax_final_fp16(PG_FUNCTION_ARGS) { - Datum value; - - if (!PG_ARGISNULL(0)) - value = PG_GETARG_DATUM(0); /* a valid numeric value */ - else - value = DirectFunctionCall3(numeric_in, - CStringGetDatum("0"), - ObjectIdGetDatum(InvalidOid), - Int32GetDatum(-1)); - return DirectFunctionCall2(numeric_mul, value, value); + kagg_state__pminmax_fp64_packed *state + = (kagg_state__pminmax_fp64_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) + PG_RETURN_NULL(); + PG_RETURN_UINT16(__half_as_short__(fp64_to_fp16(state->value))); } -/* - * pgstrom.pcov_x(float8) - */ Datum -pgstrom_partial_cov_x(PG_FUNCTION_ARGS) +pgstrom_fminmax_final_fp32(PG_FUNCTION_ARGS) { - if (!PG_GETARG_BOOL(0)) + kagg_state__pminmax_fp64_packed *state + = (kagg_state__pminmax_fp64_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) PG_RETURN_NULL(); - PG_RETURN_DATUM(PG_GETARG_DATUM(1)); + PG_RETURN_FLOAT4(state->value); } -/* - * pgstrom.pcov_y(float8) - */ Datum -pgstrom_partial_cov_y(PG_FUNCTION_ARGS) +pgstrom_fminmax_final_fp64(PG_FUNCTION_ARGS) { - if (!PG_GETARG_BOOL(0)) + kagg_state__pminmax_fp64_packed *state + = (kagg_state__pminmax_fp64_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) PG_RETURN_NULL(); - PG_RETURN_DATUM(PG_GETARG_DATUM(2)); + PG_RETURN_FLOAT8(state->value); } -/* - * pgstrom.pcov_x2(float8) - */ Datum -pgstrom_partial_cov_x2(PG_FUNCTION_ARGS) +pgstrom_fminmax_final_numeric(PG_FUNCTION_ARGS) { - float8 value = PG_GETARG_FLOAT8(1); - - if (!PG_GETARG_BOOL(0)) + kagg_state__pminmax_fp64_packed *state + = (kagg_state__pminmax_fp64_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) PG_RETURN_NULL(); - PG_RETURN_FLOAT8(value * value); + return DirectFunctionCall1(float8_numeric, + Float8GetDatum(state->value)); } /* - * pgstrom.pcov_y2(float8) + * SUM(X) functions */ Datum -pgstrom_partial_cov_y2(PG_FUNCTION_ARGS) +pgstrom_partial_sum_asis(PG_FUNCTION_ARGS) { - float8 value = PG_GETARG_FLOAT8(2); - - if (!PG_GETARG_BOOL(0)) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(value * value); + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); } /* - * pgstrom.pcov_xy(float8) + * AVG(X) functions */ Datum -pgstrom_partial_cov_xy(PG_FUNCTION_ARGS) +pgstrom_partial_avg_int(PG_FUNCTION_ARGS) { - float8 x_value = PG_GETARG_FLOAT8(1); - float8 y_value = PG_GETARG_FLOAT8(2); + kagg_state__pavg_int_packed *r = palloc(sizeof(kagg_state__pavg_int_packed)); - if (!PG_GETARG_BOOL(0)) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(x_value * y_value); + r->nitems = 1; + r->sum = PG_GETARG_INT64(0); + SET_VARSIZE(r, sizeof(kagg_state__pavg_int_packed)); + + PG_RETURN_POINTER(r); } -/* - * pgstrom_partial_variance_float8 - */ Datum -pgstrom_partial_variance_float8(PG_FUNCTION_ARGS) +pgstrom_partial_avg_fp(PG_FUNCTION_ARGS) { - ArrayType *state; - Datum items[3]; + kagg_state__pavg_fp_packed *r = palloc(sizeof(kagg_state__pavg_fp_packed)); + + r->nitems = 1; + r->sum = PG_GETARG_FLOAT8(0); + SET_VARSIZE(r, sizeof(kagg_state__pavg_fp_packed)); - items[0] = Float8GetDatum((double)PG_GETARG_INT64(0)); /* nrows(int8) */ - items[1] = PG_GETARG_DATUM(1); /* sum of X */ - items[2] = PG_GETARG_DATUM(2); /* sum of X^2 */ - state = construct_array(items, 3, FLOAT8OID, - sizeof(float8), 
FLOAT8PASSBYVAL, 'd'); - PG_RETURN_ARRAYTYPE_P(state); + PG_RETURN_POINTER(r); } -/* - * pgstrom_partial_covariance_float8 - */ Datum -pgstrom_partial_covariance_float8(PG_FUNCTION_ARGS) +pgstrom_favg_trans_int(PG_FUNCTION_ARGS) { - ArrayType *state; - Datum items[6]; - - items[0] = Float8GetDatum((double)PG_GETARG_INT64(0)); /* nrows(int8) */ - items[1] = PG_GETARG_DATUM(1); /* sum of X */ - items[2] = PG_GETARG_DATUM(2); /* sum of X^2 */ - items[3] = PG_GETARG_DATUM(3); /* sum of Y */ - items[4] = PG_GETARG_DATUM(4); /* sum of Y^2 */ - items[5] = PG_GETARG_DATUM(5); /* sum of X*Y */ - state = construct_array(items, 6, FLOAT8OID, - sizeof(float8), FLOAT8PASSBYVAL, 'd'); - PG_RETURN_ARRAYTYPE_P(state); -} + kagg_state__pavg_int_packed *state; + kagg_state__pavg_int_packed *arg; + MemoryContext aggcxt; -/* - * float8 validator - */ -static inline float8 * -check_float8_array(ArrayType *transarray, const char *caller, int n) -{ - if (ARR_NDIM(transarray) != 1 || - ARR_DIMS(transarray)[0] != n || - ARR_HASNULL(transarray) || - ARR_ELEMTYPE(transarray) != FLOAT8OID) - elog(ERROR, "%s: expected %d-element float8 array", caller, n); - return (float8 *) ARR_DATA_PTR(transarray); -} + if (!AggCheckCallContext(fcinfo, &aggcxt)) + elog(ERROR, "aggregate function called in non-aggregate context"); + if (PG_ARGISNULL(0)) + { + if (PG_ARGISNULL(1)) + PG_RETURN_NULL(); + arg = (kagg_state__pavg_int_packed *)PG_GETARG_BYTEA_P(1); + state = MemoryContextAlloc(aggcxt, sizeof(*state)); + memcpy(state, arg, sizeof(*state)); + } + else + { + state = (kagg_state__pavg_int_packed *)PG_GETARG_BYTEA_P(0); + if (!PG_ARGISNULL(1)) + { + arg = (kagg_state__pavg_int_packed *)PG_GETARG_BYTEA_P(1); -static inline void -check_float8_value(float8 value, bool inf_is_valid, bool zero_is_valid) -{ - if (isinf(value) && !inf_is_valid) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value out of range: overflow"))); - if (value == 0.0 && !zero_is_valid) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("value out of range: underflow"))); + state->nitems += arg->nitems; + state->sum += arg->sum; + } + } + PG_RETURN_POINTER(state); } -/* - * pgstrom_float8_combine - */ Datum -pgstrom_float8_combine(PG_FUNCTION_ARGS) +pgstrom_favg_trans_fp(PG_FUNCTION_ARGS) { - ArrayType *transarray1 = PG_GETARG_ARRAYTYPE_P(0); - ArrayType *transarray2 = PG_GETARG_ARRAYTYPE_P(1); - float8 *transvalues1; - float8 *transvalues2; - float8 N, sumX, sumX2; + kagg_state__pavg_fp_packed *state; + kagg_state__pavg_fp_packed *arg; + MemoryContext aggcxt; - if (!AggCheckCallContext(fcinfo, NULL)) + if (!AggCheckCallContext(fcinfo, &aggcxt)) elog(ERROR, "aggregate function called in non-aggregate context"); - transvalues1 = check_float8_array(transarray1, __FUNCTION__, 3); - N = transvalues1[0]; - sumX = transvalues1[1]; - sumX2 = transvalues1[2]; - - transvalues2 = check_float8_array(transarray2, __FUNCTION__, 3); - N += transvalues2[0]; - sumX += transvalues2[1]; - sumX2 += transvalues2[2]; - check_float8_value(sumX, isinf(transvalues1[1]) || isinf(transvalues2[1]), true); - check_float8_value(sumX2, isinf(transvalues1[2]) || isinf(transvalues2[2]), true); - - transvalues1[0] = N; - transvalues1[1] = sumX; - transvalues1[2] = sumX2; + if (PG_ARGISNULL(0)) + { + if (PG_ARGISNULL(1)) + PG_RETURN_NULL(); + arg = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(1); + state = MemoryContextAlloc(aggcxt, sizeof(*state)); + memcpy(state, arg, sizeof(*state)); + } + else + { + state = (kagg_state__pavg_fp_packed 
*)PG_GETARG_BYTEA_P(0); + if (!PG_ARGISNULL(1)) + { + arg = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(1); - PG_RETURN_ARRAYTYPE_P(transarray1); + state->nitems += arg->nitems; + state->sum += arg->sum; + } + } + PG_RETURN_POINTER(state); } -/* - * pgstrom_float8_var_samp - */ Datum -pgstrom_float8_var_samp(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2; - float8 numerator; - - transvalues = check_float8_array(transarray, "float8_stddev_pop", 3); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - /* Population stddev is undefined when N is 0, so return NULL */ - if (N == 0.0) - PG_RETURN_NULL(); - - numerator = N * sumX2 - sumX * sumX; - check_float8_value(numerator, isinf(sumX2) || isinf(sumX), true); +pgstrom_favg_final_int(PG_FUNCTION_ARGS) +{ + kagg_state__pavg_int_packed *state; + Datum n, sum; - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); + state = (kagg_state__pavg_int_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) + PG_RETURN_NULL(); + n = DirectFunctionCall1(int4_numeric, Int32GetDatum(state->nitems)); + sum = DirectFunctionCall1(int8_numeric, Int64GetDatum(state->sum)); - PG_RETURN_FLOAT8(numerator / (N * (N - 1.0))); + PG_RETURN_DATUM(DirectFunctionCall2(numeric_div, sum, n)); } -/* - * pgstrom_float8_var_pop - */ Datum -pgstrom_float8_var_pop(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2; - float8 numerator; - - transvalues = check_float8_array(transarray, "float8_stddev_pop", 3); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - /* Population stddev is undefined when N is 0, so return NULL */ - if (N == 0.0) +pgstrom_favg_final_fp(PG_FUNCTION_ARGS) +{ + kagg_state__pavg_fp_packed *state + = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) PG_RETURN_NULL(); + PG_RETURN_FLOAT8((double)state->sum / (double)state->nitems); +} - numerator = N * sumX2 - sumX * sumX; - check_float8_value(numerator, isinf(sumX2) || isinf(sumX), true); +Datum +pgstrom_favg_final_num(PG_FUNCTION_ARGS) +{ + kagg_state__pavg_fp_packed *state; + Datum n, sum; - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); + state = (kagg_state__pavg_fp_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems == 0) + PG_RETURN_NULL(); + n = DirectFunctionCall1(int4_numeric, Int32GetDatum(state->nitems)); + sum = DirectFunctionCall1(float8_numeric, Float8GetDatum(state->sum)); - PG_RETURN_FLOAT8(numerator / (N * N)); + PG_RETURN_DATUM(DirectFunctionCall2(numeric_div, sum, n)); } /* - * pgstrom_float8_stddev_samp + * STDDEV/VARIANCE */ Datum -pgstrom_float8_stddev_samp(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2; - float8 numerator; - - transvalues = check_float8_array(transarray, "float8_stddev_pop", 3); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - /* Population stddev is undefined when N is 0, so return NULL */ - if (N == 0.0) - PG_RETURN_NULL(); - - numerator = N * sumX2 - sumX * sumX; - check_float8_value(numerator, isinf(sumX2) || isinf(sumX), true); +pgstrom_partial_variance(PG_FUNCTION_ARGS) +{ + kagg_state__stddev_packed *r = palloc(sizeof(kagg_state__stddev_packed)); + float8_t fval = PG_GETARG_FLOAT8(0); - /* Watch out for roundoff error 
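/*
 * A worked example of the two-phase AVG protocol implemented above, using
 * plain C stand-ins for the packed states.  Two partial states
 * {nitems=3, sum=30} and {nitems=2, sum=20} merge to {5, 50}, and the
 * final function returns 50/5 = 10 -- the same result as a single-pass
 * AVG over all five rows.  Struct and function names here are
 * illustrative only.
 */
#include <assert.h>
#include <stdint.h>

typedef struct { uint32_t nitems; int64_t sum; } sketch_pavg_int;

static sketch_pavg_int
sketch_favg_trans(sketch_pavg_int a, sketch_pavg_int b)
{
	a.nitems += b.nitems;	/* mirrors pgstrom_favg_trans_int */
	a.sum    += b.sum;
	return a;
}

static double
sketch_favg_final(sketch_pavg_int s)
{
	assert(s.nitems > 0);	/* the real final returns NULL for 0 rows */
	return (double)s.sum / (double)s.nitems;
}

int main(void)
{
	sketch_pavg_int a = {3, 30}, b = {2, 20};
	return sketch_favg_final(sketch_favg_trans(a, b)) == 10.0 ? 0 : 1;
}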
producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); + r->nitems = 1; + r->sum_x = fval; + r->sum_x2 = fval * fval; + SET_VARSIZE(r, sizeof(kagg_state__stddev_packed)); - PG_RETURN_FLOAT8(sqrt(numerator / (N * (N - 1.0)))); + PG_RETURN_POINTER(r); } -/* - * pgstrom_float8_stddev_pop - */ Datum -pgstrom_float8_stddev_pop(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2; - float8 numerator; - - transvalues = check_float8_array(transarray, "float8_stddev_pop", 3); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - /* Population stddev is undefined when N is 0, so return NULL */ - if (N == 0.0) - PG_RETURN_NULL(); - - numerator = N * sumX2 - sumX * sumX; - check_float8_value(numerator, isinf(sumX2) || isinf(sumX), true); +pgstrom_stddev_trans(PG_FUNCTION_ARGS) +{ + kagg_state__stddev_packed *state; + kagg_state__stddev_packed *arg; + MemoryContext aggcxt; - /* Watch out for roundoff error producing a negative numerator */ - if (numerator <= 0.0) - PG_RETURN_FLOAT8(0.0); + if (!AggCheckCallContext(fcinfo, &aggcxt)) + elog(ERROR, "aggregate function called in non-aggregate context"); + if (PG_ARGISNULL(0)) + { + if (PG_ARGISNULL(1)) + PG_RETURN_NULL(); + arg = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(1); + state = MemoryContextAlloc(aggcxt, sizeof(*state)); + memcpy(state, arg, sizeof(*state)); + } + else + { + state = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(0); + if (!PG_ARGISNULL(1)) + { + arg = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(1); - PG_RETURN_FLOAT8(sqrt(numerator / (N * N))); + state->nitems += arg->nitems; + state->sum_x += arg->sum_x; + state->sum_x2 += arg->sum_x2; + } + } + PG_RETURN_POINTER(state); } -/* - * pgstrom_float8_stddev_samp_numeric - */ Datum -pgstrom_float8_stddev_samp_numeric(PG_FUNCTION_ARGS) +pgstrom_var_sampf_final(PG_FUNCTION_ARGS) { - Datum datum = pgstrom_float8_stddev_samp(fcinfo); + kagg_state__stddev_packed *state + = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 1) + { + float8_t N = (double)state->nitems; + float8_t fval = N * state->sum_x2 - state->sum_x * state->sum_x; - PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum)); + PG_RETURN_FLOAT8(fval / (N * (N - 1.0))); + } + PG_RETURN_NULL(); } -/* - * pgstrom_float8_stddev_pop_numeric - */ Datum -pgstrom_float8_stddev_pop_numeric(PG_FUNCTION_ARGS) +pgstrom_var_samp_final(PG_FUNCTION_ARGS) { - Datum datum = pgstrom_float8_stddev_pop(fcinfo); + Datum datum = pgstrom_var_sampf_final(fcinfo); + if (fcinfo->isnull) + PG_RETURN_NULL(); PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum)); } -/* - * pgstrom_float8_var_samp_numeric - */ Datum -pgstrom_float8_var_samp_numeric(PG_FUNCTION_ARGS) +pgstrom_var_popf_final(PG_FUNCTION_ARGS) { - Datum datum = pgstrom_float8_var_samp(fcinfo); + kagg_state__stddev_packed *state + = (kagg_state__stddev_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0) + { + float8_t N = (double)state->nitems; + float8_t fval = N * state->sum_x2 - state->sum_x * state->sum_x; - PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum)); + PG_RETURN_FLOAT8(fval / (N * N)); + } + PG_RETURN_NULL(); } -/* - * pgstrom_float8_var_pop_numeric - */ Datum -pgstrom_float8_var_pop_numeric(PG_FUNCTION_ARGS) +pgstrom_var_pop_final(PG_FUNCTION_ARGS) { - Datum datum = pgstrom_float8_var_pop(fcinfo); + Datum datum = pgstrom_var_popf_final(fcinfo); + if (fcinfo->isnull) + PG_RETURN_NULL(); 
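/*
 * The finals above use the textbook shortcut identity: carrying only the
 * row count N, the sum of X, and the sum of X^2 in the packed state is
 * enough, because
 *     var_samp = (N*sum_x2 - sum_x^2) / (N*(N-1))
 *     var_pop  = (N*sum_x2 - sum_x^2) / (N*N)
 * A self-contained check against the definition for X = {1,2,3,4}:
 * mean = 2.5, squared deviations sum to 5.0, so var_samp = 5/3.
 */
#include <assert.h>

static double
sketch_var_samp(double N, double sum_x, double sum_x2)
{
	return (N * sum_x2 - sum_x * sum_x) / (N * (N - 1.0));
}

int main(void)
{
	/* X = {1,2,3,4}: N=4, sum_x=10, sum_x2=30 -> (120-100)/12 = 5/3 */
	assert(sketch_var_samp(4.0, 10.0, 30.0) == 5.0 / 3.0);
	return 0;	/* equality is exact: both sides round the same rational */
}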
PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum)); } -/* - * pgstrom_float8_regr_combine - */ Datum -pgstrom_float8_regr_combine(PG_FUNCTION_ARGS) +pgstrom_stddev_sampf_final(PG_FUNCTION_ARGS) { - ArrayType *transarray1 = PG_GETARG_ARRAYTYPE_P(0); - ArrayType *transarray2 = PG_GETARG_ARRAYTYPE_P(1); - float8 *transvalues1; - float8 *transvalues2; - float8 N, sumX, sumX2, sumY, sumY2, sumXY; + Datum datum = pgstrom_var_sampf_final(fcinfo); - if (!AggCheckCallContext(fcinfo, NULL)) - elog(ERROR, "aggregate function called in non-aggregate context"); + if (fcinfo->isnull) + PG_RETURN_NULL(); + PG_RETURN_FLOAT8(sqrt(DatumGetFloat8(datum))); +} - transvalues1 = check_float8_array(transarray1, __FUNCTION__, 6); - transvalues2 = check_float8_array(transarray2, __FUNCTION__, 6); - N = transvalues1[0] + transvalues2[0]; - sumX = transvalues1[1] + transvalues2[1]; - sumX2 = transvalues1[2] + transvalues2[2]; - sumY = transvalues1[3] + transvalues2[3]; - sumY2 = transvalues1[4] + transvalues2[4]; - sumXY = transvalues1[5] + transvalues2[5]; +Datum +pgstrom_stddev_samp_final(PG_FUNCTION_ARGS) +{ + Datum datum = pgstrom_stddev_sampf_final(fcinfo); - check_float8_value(sumX, isinf(transvalues1[1]) || isinf(transvalues2[1]), true); - check_float8_value(sumX2, isinf(transvalues1[2]) || isinf(transvalues2[2]), true); - check_float8_value(sumY, isinf(transvalues1[3]) || isinf(transvalues2[3]), true); - check_float8_value(sumY2, isinf(transvalues1[4]) || isinf(transvalues2[4]), true); - check_float8_value(sumXY, isinf(transvalues1[5]) || isinf(transvalues2[5]), true); + if (fcinfo->isnull) + PG_RETURN_NULL(); + PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum)); +} - transvalues1[0] = N; - transvalues1[1] = sumX; - transvalues1[2] = sumX2; - transvalues1[3] = sumY; - transvalues1[4] = sumY2; - transvalues1[5] = sumXY; +Datum +pgstrom_stddev_popf_final(PG_FUNCTION_ARGS) +{ + Datum datum = pgstrom_var_popf_final(fcinfo); - PG_RETURN_ARRAYTYPE_P(transarray1); + if (fcinfo->isnull) + PG_RETURN_NULL(); + PG_RETURN_FLOAT8(sqrt(DatumGetFloat8(datum))); } -/* - * pgstrom_float8_covar_pop - */ Datum -pgstrom_float8_covar_pop(PG_FUNCTION_ARGS) +pgstrom_stddev_pop_final(PG_FUNCTION_ARGS) { - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumY, sumXY; - float8 numerator; - - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; - sumY = transvalues[3]; - sumXY = transvalues[5]; + Datum datum = pgstrom_stddev_popf_final(fcinfo); - /* if N is 0 we should return NULL */ - if (N < 1.0) + if (fcinfo->isnull) PG_RETURN_NULL(); - numerator = N * sumXY - sumX * sumX; - check_float8_value(numerator, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); - - PG_RETURN_FLOAT8(numerator / (N * N)); + PG_RETURN_NUMERIC(DirectFunctionCall1(float8_numeric, datum)); } /* - * pgstrom_float8_covar_samp + * COVAR/REGR_* */ Datum -pgstrom_float8_covar_samp(PG_FUNCTION_ARGS) +pgstrom_partial_covar(PG_FUNCTION_ARGS) { - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumY, sumXY; - float8 numerator; + kagg_state__covar_packed *r = palloc(sizeof(kagg_state__covar_packed)); + float8_t x = PG_GETARG_FLOAT8(0); + float8_t y = PG_GETARG_FLOAT8(1); - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; - sumY = transvalues[3]; - sumXY = transvalues[5]; + r->nitems = 1; + r->sum_x = x; + r->sum_xx = x * x; + r->sum_y = y; + r->sum_yy 
= y * y; + r->sum_xy = x * y; + SET_VARSIZE(r, sizeof(kagg_state__covar_packed)); - /* if N is <= 1 we should return NULL */ - if (N < 2.0) - PG_RETURN_NULL(); - numerator = N * sumXY - sumX * sumX; - check_float8_value(numerator, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); - - PG_RETURN_FLOAT8(numerator / (N * (N - 1.0))); + PG_RETURN_POINTER(r); } -/* - * pgstrom_float8_corr - */ Datum -pgstrom_float8_corr(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2, sumY, sumY2, sumXY; - float8 numeratorX, numeratorY, numeratorXY; - - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumY2 = transvalues[4]; - sumXY = transvalues[5]; - - /* if N is 0 we should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - numeratorY = N * sumY2 - sumY * sumY; - numeratorXY = N * sumXY - sumX * sumY; - check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true); - check_float8_value(numeratorY, isinf(sumY) || isinf(sumY2), true); - check_float8_value(numeratorXY, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); - - if (numeratorX <= 0 || numeratorY <= 0) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(numeratorXY / sqrt(numeratorX * numeratorY)); +pgstrom_covar_accum(PG_FUNCTION_ARGS) +{ + kagg_state__covar_packed *state; + kagg_state__covar_packed *arg; + MemoryContext aggcxt; + + if (!AggCheckCallContext(fcinfo, &aggcxt)) + elog(ERROR, "aggregate function called in non-aggregate context"); + if (PG_ARGISNULL(0)) + { + if (PG_ARGISNULL(1)) + PG_RETURN_NULL(); + arg = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(1); + state = MemoryContextAlloc(aggcxt, sizeof(*state)); + memcpy(state, arg, sizeof(*state)); + } + else + { + state = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (!PG_ARGISNULL(1)) + { + arg = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(1); + + state->nitems += arg->nitems; + state->sum_x += arg->sum_x; + state->sum_xx += arg->sum_xx; + state->sum_y += arg->sum_y; + state->sum_yy += arg->sum_yy; + state->sum_xy += arg->sum_xy; + } + } + PG_RETURN_POINTER(state); } -/* - * pgstrom_float8_regr_avgx - */ Datum -pgstrom_float8_regr_avgx(PG_FUNCTION_ARGS) +pgstrom_covar_samp_final(PG_FUNCTION_ARGS) { - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX; + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; + if (state->nitems > 1) + { + float8_t N = (float8_t)state->nitems; + float8_t fval = N * state->sum_xy - state->sum_x * state->sum_y; - /* if N is 0 we should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(sumX / N); + PG_RETURN_FLOAT8(fval / (N * (N - 1.0))); + } + PG_RETURN_NULL(); } -/* - * pgstrom_float8_regr_avgy - */ Datum -pgstrom_float8_regr_avgy(PG_FUNCTION_ARGS) +pgstrom_covar_pop_final(PG_FUNCTION_ARGS) { - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumY; + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumY = transvalues[3]; + if (state->nitems > 0) + { + float8_t N = (float8_t)state->nitems; + float8_t fval = N * state->sum_xy - state->sum_x * state->sum_y; - /* if N is 0 we 
should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(sumY / N); + PG_RETURN_FLOAT8(fval / (N * N)); + } + PG_RETURN_NULL(); } -/* - * pgstrom_float8_regr_intercept - */ Datum -pgstrom_float8_regr_intercept(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2, sumY, sumXY; - float8 numeratorX, numeratorXXY; - - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumXY = transvalues[5]; - - /* if N is 0 we should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - numeratorXXY = sumY * sumX2 - sumX * sumXY; - check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true); - check_float8_value(numeratorXXY, (isinf(sumY) || isinf(sumX2) || - isinf(sumX) || isinf(sumXY)), true); - if (numeratorX <= 0) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(numeratorXXY / numeratorX); -} +pgstrom_regr_avgx_final(PG_FUNCTION_ARGS) +{ + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0) + { + float8_t N = (float8_t)state->nitems; -/* - * pgstrom_float8_regr_r2 - */ -Datum -pgstrom_float8_regr_r2(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2, sumY, sumY2, sumXY; - float8 numeratorX, numeratorY, numeratorXY; - - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumY2 = transvalues[4]; - sumXY = transvalues[5]; - - /* if N is 0 we should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - numeratorY = N * sumY2 - sumY * sumY; - numeratorXY = N * sumXY - sumX * sumY; - check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true); - check_float8_value(numeratorY, isinf(sumY) || isinf(sumY2), true); - check_float8_value(numeratorXY, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); - - if (numeratorX <= 0.0) - PG_RETURN_NULL(); - if (numeratorY <= 0.0) - PG_RETURN_FLOAT8(1.0); - PG_RETURN_FLOAT8((numeratorXY * numeratorXY) / (numeratorX * numeratorY)); + PG_RETURN_FLOAT8(state->sum_x / N); + } + PG_RETURN_NULL(); } -/* - * pgstrom_float8_regr_slope - */ Datum -pgstrom_float8_regr_slope(PG_FUNCTION_ARGS) -{ - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2, sumY, sumXY; - float8 numeratorX, numeratorXY; - - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - sumY = transvalues[3]; - sumXY = transvalues[5]; - - /* if N is 0 we should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - numeratorXY = N * sumXY - sumX * sumY; - check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true); - check_float8_value(numeratorXY, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); +pgstrom_regr_avgy_final(PG_FUNCTION_ARGS) +{ + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0) + { + float8_t N = (float8_t)state->nitems; - if (numeratorX <= 0) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(numeratorXY / numeratorX); + PG_RETURN_FLOAT8(state->sum_y / N); + } + PG_RETURN_NULL(); } -/* - * pgstrom_float8_regr_sxx - */ Datum -pgstrom_float8_regr_sxx(PG_FUNCTION_ARGS) 
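/*
 * The same shortcut applies to the covariance finals: the packed state
 * carries six running sums (nitems, sum_x, sum_xx, sum_y, sum_yy, sum_xy),
 * which is enough for COVAR_* and the REGR_* finals above.  A minimal
 * sketch of the merge step (the non-null branch of pgstrom_covar_accum)
 * on plain structs; the field names mirror kagg_state__covar_packed but
 * the struct itself is illustrative.
 */
typedef struct
{
	unsigned int nitems;
	double sum_x, sum_xx;
	double sum_y, sum_yy;
	double sum_xy;
} sketch_covar;

static void
sketch_covar_merge(sketch_covar *state, const sketch_covar *arg)
{
	state->nitems += arg->nitems;
	state->sum_x  += arg->sum_x;
	state->sum_xx += arg->sum_xx;
	state->sum_y  += arg->sum_y;
	state->sum_yy += arg->sum_yy;
	state->sum_xy += arg->sum_xy;
}

/* covar_samp over the merged state, as in pgstrom_covar_samp_final */
static double
sketch_covar_samp(const sketch_covar *s)
{
	double N = (double)s->nitems;	/* caller checks nitems > 1 first */

	return (N * s->sum_xy - s->sum_x * s->sum_y) / (N * (N - 1.0));
}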
+pgstrom_regr_count_final(PG_FUNCTION_ARGS) { - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumX2; - float8 numeratorX; + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; - sumX2 = transvalues[2]; - - /* if N is 0 we should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - numeratorX = N * sumX2 - sumX * sumX; - check_float8_value(numeratorX, isinf(sumX) || isinf(sumX2), true); - - if (numeratorX <= 0) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(numeratorX / N); + PG_RETURN_FLOAT8((float8_t)state->nitems); } -/* - * pgstrom_float8_regr_syy - */ Datum -pgstrom_float8_regr_syy(PG_FUNCTION_ARGS) +pgstrom_regr_intercept_final(PG_FUNCTION_ARGS) { - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumY, sumY2; - float8 numeratorY; - - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumY = transvalues[3]; - sumY2 = transvalues[4]; - - /* if N is 0 we should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - numeratorY = N * sumY2 - sumY * sumY; - check_float8_value(numeratorY, isinf(sumY) || isinf(sumY2), true); + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0 && state->sum_xx != 0.0) + { + float8_t N = (float8_t)state->nitems; + + PG_RETURN_FLOAT8((state->sum_y - + state->sum_x * state->sum_xy / state->sum_xx) / N); + } + PG_RETURN_NULL(); +} - if (numeratorY <= 0) - PG_RETURN_NULL(); - PG_RETURN_FLOAT8(numeratorY / N); +Datum +pgstrom_regr_r2_final(PG_FUNCTION_ARGS) +{ + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0 && + state->sum_xx != 0.0 && + state->sum_yy != 0.0) + { + PG_RETURN_FLOAT8((state->sum_xy * state->sum_xy) / + (state->sum_xx * state->sum_yy)); + } + PG_RETURN_NULL(); } -/* - * pgstrom_float8_regr_sxy - */ Datum -pgstrom_float8_regr_sxy(PG_FUNCTION_ARGS) +pgstrom_regr_slope_final(PG_FUNCTION_ARGS) { - ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); - float8 *transvalues; - float8 N, sumX, sumY, sumXY; - float8 numeratorXY; + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0 && state->sum_xx != 0.0) + { + PG_RETURN_FLOAT8(state->sum_xy / state->sum_xx); + } + PG_RETURN_NULL(); +} - transvalues = check_float8_array(transarray, __FUNCTION__, 6); - N = transvalues[0]; - sumX = transvalues[1]; - sumY = transvalues[3]; - sumXY = transvalues[5]; +Datum +pgstrom_regr_sxx_final(PG_FUNCTION_ARGS) +{ + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0) + { + PG_RETURN_FLOAT8(state->sum_xx); + } + PG_RETURN_NULL(); +} - /* if N is 0 we should return NULL */ - if (N < 1.0) - PG_RETURN_NULL(); - numeratorXY = N * sumXY - sumX * sumY; - check_float8_value(numeratorXY, isinf(sumXY) || isinf(sumX) || isinf(sumY), true); +Datum +pgstrom_regr_sxy_final(PG_FUNCTION_ARGS) +{ + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0) + { + PG_RETURN_FLOAT8(state->sum_xy); + } + PG_RETURN_NULL(); +} - PG_RETURN_FLOAT8(numeratorXY / N); +Datum +pgstrom_regr_syy_final(PG_FUNCTION_ARGS) +{ + kagg_state__covar_packed *state + = (kagg_state__covar_packed *)PG_GETARG_BYTEA_P(0); + if (state->nitems > 0) + { + PG_RETURN_FLOAT8(state->sum_yy); + } + 
PG_RETURN_NULL(); } +#if 0 /* * ---------------------------------------------------------------- * @@ -1093,17 +910,17 @@ __pgstrom_hll_hash_int8(Datum datum) static uint64 __pgstrom_hll_hash_numeric(Datum datum) { - kern_context kcxt; - pg_numeric_t num; - size_t sz; + xpu_numeric_t num; + const char *emsg; - memset(&kcxt, 0, sizeof(kcxt)); - num = pg_numeric_from_varlena(&kcxt, (struct varlena *)datum); - if (kcxt.errcode != ERRCODE_STROM_SUCCESS) - elog(ERROR, "failed on hash calculation of device numeric: %s", - DatumGetCString(DirectFunctionCall1(numeric_out, datum))); - sz = offsetof(pg_numeric_t, weight) + sizeof(cl_short); - return __pgstrom_hll_siphash_value(&num, sz); + memset(&num, 0, sizeof(num)); + emsg = __xpu_numeric_from_varlena(&num, (struct varlena *)datum); + if (emsg) + elog(ERROR, "failed on hash calculation of device numeric: %s", emsg); + return __pgstrom_hll_siphash_value(&num.weight, + offsetof(xpu_numeric_t, value) + + sizeof(int128_t) + - offsetof(xpu_numeric_t, weight)); } static uint64 @@ -1405,3 +1222,4 @@ pgstrom_hll_sketch_histogram(PG_FUNCTION_ARGS) 'i'); PG_RETURN_POINTER(result); } +#endif diff --git a/src/arrow_defs.h b/src/arrow_defs.h index 3da60b165..b78ca1ae4 100644 --- a/src/arrow_defs.h +++ b/src/arrow_defs.h @@ -126,6 +126,7 @@ typedef enum { ArrowIntervalUnit__Year_Month = 0, ArrowIntervalUnit__Day_Time = 1, + ArrowIntervalUnit__Month_Day_Nano = 2, } ArrowIntervalUnit; /* @@ -192,50 +193,39 @@ typedef enum /* * ArrowTypeOptions - our own definition */ -#define ARROW_TYPE_OPTIONS_COMMON_FIELDS \ - ArrowTypeTag tag; \ - unsigned short unitsz - -typedef union ArrowTypeOptions -{ - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - } common; - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - unsigned short bitWidth; - __boolean is_signed; - } integer; - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - ArrowPrecision precision; - } floating_point; - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - unsigned short precision; - unsigned short scale; - unsigned short bitWidth; - } decimal; - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - ArrowDateUnit unit; - } date; - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - ArrowTimeUnit unit; - } time; - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - ArrowTimeUnit unit; - } timestamp; - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - ArrowIntervalUnit unit; - } interval; - struct { - ARROW_TYPE_OPTIONS_COMMON_FIELDS; - int byteWidth; - } fixed_size_binary; +typedef struct ArrowTypeOptions +{ + ArrowTypeTag tag; + short unitsz; + union { + struct { + unsigned short bitWidth; + __boolean is_signed; + } integer; + struct { + ArrowPrecision precision; + } floating_point; + struct { + unsigned short precision; + unsigned short scale; + unsigned short bitWidth; + } decimal; + struct { + ArrowDateUnit unit; + } date; + struct { + ArrowTimeUnit unit; + } time; + struct { + ArrowTimeUnit unit; + } timestamp; + struct { + ArrowIntervalUnit unit; + } interval; + struct { + unsigned int byteWidth; + } fixed_size_binary; + }; } ArrowTypeOptions; #undef ARROW_TYPE_OPTIONS_COMMON_FIELDS diff --git a/src/arrow_fdw.c b/src/arrow_fdw.c index f7b6f2a32..89a787ffb 100644 --- a/src/arrow_fdw.c +++ b/src/arrow_fdw.c @@ -3,8 +3,8 @@ * * Routines to map Apache Arrow files as PG's Foreign-Table. 
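/*
 * With the arrow_defs.h hunk above, ArrowTypeOptions keeps 'tag' and
 * 'unitsz' as shared leading members and moves only the per-type extras
 * into an anonymous union (C11), instead of repeating the common fields
 * in every union arm.  A sketch of the resulting access pattern; the
 * ArrowType__Int spelling of the tag value is assumed from the
 * surrounding naming convention:
 */
#include "arrow_defs.h"

static unsigned short
sketch_int_bitwidth(const ArrowTypeOptions *attopts)
{
	if (attopts->tag == ArrowType__Int)
		return attopts->integer.bitWidth;	/* reached via the anon union */
	return 0;	/* attopts->unitsz stays valid for every tag */
}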
* ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. @@ -12,13 +12,30 @@ #include "pg_strom.h" #include "arrow_defs.h" #include "arrow_ipc.h" -#include "cuda_numeric.cu" +#include "xpu_numeric.h" + +/* + * min/max statistics datum + */ +typedef struct +{ + bool isnull; + union { + Datum datum; + NumericData numeric; /* if NUMERICOID */ + } min; + union { + Datum datum; + NumericData numeric; /* if NUMERICOID */ + } max; +} MinMaxStatDatum; /* * RecordBatchState */ typedef struct RecordBatchFieldState { + /* common fields with cache */ Oid atttypid; int atttypmod; ArrowTypeOptions attopts; @@ -30,10 +47,7 @@ typedef struct RecordBatchFieldState size_t values_length; off_t extra_offset; size_t extra_length; - /* min/max statistics */ - SQLstat__datum stat_min; - SQLstat__datum stat_max; - bool stat_isnull; + MinMaxStatDatum stat_datum; /* sub-fields if any */ int num_children; struct RecordBatchFieldState *children; @@ -41,606 +55,493 @@ typedef struct RecordBatchFieldState typedef struct RecordBatchState { - File fdesc; - GPUDirectFileDesc *dfile; - struct stat stat_buf; + struct ArrowFileState *af_state; /* reference to ArrowFileState */ int rb_index; /* index number in a file */ off_t rb_offset; /* offset from the head */ size_t rb_length; /* length of the entire RecordBatch */ int64 rb_nitems; /* number of items */ /* per column information */ - int ncols; - RecordBatchFieldState columns[FLEXIBLE_ARRAY_MEMBER]; + int nfields; + RecordBatchFieldState fields[FLEXIBLE_ARRAY_MEMBER]; } RecordBatchState; -/* - * metadata cache (on shared memory) - */ -typedef struct -{ - dev_t st_dev; - ino_t st_ino; - uint32 hash; -} MetadataCacheKey; - -typedef struct +typedef struct ArrowFileState { - dlist_node chain; - dlist_node lru_chain; - dlist_head siblings; /* if two or more record batches per file */ - /* key of RecordBatch metadata cache */ + const char *filename; + const char *dpu_path; /* relative pathname, if DPU */ struct stat stat_buf; - uint32 hash; - /* fields from RecordBatchState */ - int rb_index; /* index of the RecordBatch */ - off_t rb_offset; /* offset from the head */ - size_t rb_length; /* length of the entire RecordBatch */ - int64 rb_nitems; /* number of items */ - int ncols; - int nfields; /* length of fstate[] array */ - RecordBatchFieldState fstate[FLEXIBLE_ARRAY_MEMBER]; -} arrowMetadataCache; - -#define ARROW_METADATA_HASH_NSLOTS 2048 -typedef struct -{ - slock_t lru_lock; - dlist_head lru_list; - pg_atomic_uint64 consumed; - - LWLock lock_slots[ARROW_METADATA_HASH_NSLOTS]; - dlist_head hash_slots[ARROW_METADATA_HASH_NSLOTS]; - dlist_head mvcc_slots[ARROW_METADATA_HASH_NSLOTS]; -} arrowMetadataState; - -/* setup of MetadataCacheKey */ -static inline int -initMetadataCacheKey(MetadataCacheKey *mkey, struct stat *stat_buf) -{ - memset(mkey, 0, sizeof(MetadataCacheKey)); - mkey->st_dev = stat_buf->st_dev; - mkey->st_ino = stat_buf->st_ino; - mkey->hash = hash_any((unsigned char *)mkey, - offsetof(MetadataCacheKey, hash)); - return mkey->hash % ARROW_METADATA_HASH_NSLOTS; -} + List *rb_list; /* list of RecordBatchState */ +} ArrowFileState; /* - * executor hint by min/max statistics per record batch + * ArrowFdwState - executor state to run apache arrow */ typedef struct { - List *orig_quals; - List *eval_quals; - 
ExprState *eval_state; Bitmapset *stat_attrs; Bitmapset *load_attrs; + List *orig_quals; /* for EXPLAIN */ + List *eval_quals; + ExprState *eval_state; ExprContext *econtext; } arrowStatsHint; -/* - * MVCC state for the pending writes - */ -typedef struct +struct ArrowFdwState { - dlist_node chain; - MetadataCacheKey key; - TransactionId xid; - CommandId cid; - uint32 record_batch; -} arrowWriteMVCCLog; + Bitmapset *referenced; /* referenced columns */ + arrowStatsHint *stats_hint; /* min/max statistics, if any */ + pg_atomic_uint32 *rbatch_index; + pg_atomic_uint32 __rbatch_index_local; /* if single process */ + pg_atomic_uint32 *rbatch_nload; + pg_atomic_uint32 __rbatch_nload_local; /* if single process */ + pg_atomic_uint32 *rbatch_nskip; + pg_atomic_uint32 __rbatch_nskip_local; /* if single process */ + StringInfoData chunk_buffer; /* buffer to load record-batch */ + File curr_filp; /* current arrow file to read */ + kern_data_store *curr_kds; /* current chunk to read */ + uint32_t curr_index; /* current index on the chunk */ + List *af_states_list; /* list of ArrowFileState */ + uint32_t rb_nitems; /* number of record-batches */ + RecordBatchState *rb_states[FLEXIBLE_ARRAY_MEMBER]; /* flatten RecordBatchState */ +}; /* - * REDO Log for INSERT/TRUNCATE + * Metadata Cache (on shared memory) */ +#define ARROW_METADATA_BLOCKSZ (128 * 1024) /* 128kB */ typedef struct { - dlist_node chain; - MetadataCacheKey key; - TransactionId xid; - CommandId cid; - char *pathname; - bool is_truncate; - /* for TRUNCATE */ - uint32 suffix; - /* for INSERT */ - loff_t footer_offset; - size_t footer_length; - char footer_backup[FLEXIBLE_ARRAY_MEMBER]; -} arrowWriteRedoLog; + dlist_node chain; /* link to free_blocks; NULL if active */ + int32_t unitsz; /* unit size of slab items */ + int32_t n_actives; /* number of active items */ + char data[FLEXIBLE_ARRAY_MEMBER]; +} arrowMetadataCacheBlock; +#define ARROW_METADATA_CACHE_FREE_MAGIC (0xdeadbeafU) +#define ARROW_METADATA_CACHE_ACTIVE_MAGIC (0xcafebabeU) -/* - * arrowWriteState - */ -typedef struct -{ - MemoryContext memcxt; - File file; - MetadataCacheKey key; - uint32 hash; - SQLtable sql_table; -} arrowWriteState; +typedef struct arrowMetadataFieldCache arrowMetadataFieldCache; +typedef struct arrowMetadataCache arrowMetadataCache; -/* - * ArrowFdwState - */ -struct ArrowFdwState +struct arrowMetadataFieldCache { - GpuContext *gcontext; /* valid if owned by GpuXXX plan */ - List *gpuDirectFileDescList; /* list of GPUDirectFileDesc */ - List *fdescList; /* list of File (buffered i/o) */ - Bitmapset *referenced; - arrowStatsHint *stats_hint; - pg_atomic_uint32 *rbatch_index; - pg_atomic_uint32 __rbatch_index_local; /* if single process */ - pg_atomic_uint32 *rbatch_nload; - pg_atomic_uint32 __rbatch_nload_local; /* if single process */ - pg_atomic_uint32 *rbatch_nskip; - pg_atomic_uint32 __rbatch_nskip_local; /* if single process */ - pgstrom_data_store *curr_pds; /* current focused buffer */ - cl_ulong curr_index; /* current index to row on KDS */ - /* state of RecordBatches */ - uint32 num_rbatches; - RecordBatchState *rbatches[FLEXIBLE_ARRAY_MEMBER]; + arrowMetadataCacheBlock *owner; + dlist_node chain; /* link to free/fields[children] list */ + /* common fields with cache */ + Oid atttypid; + int atttypmod; + ArrowTypeOptions attopts; + int64 nitems; /* usually, same with rb_nitems */ + int64 null_count; + off_t nullmap_offset; + size_t nullmap_length; + off_t values_offset; + size_t values_length; + off_t extra_offset; + size_t extra_length; + 
MinMaxStatDatum stat_datum; + /* sub-fields if any */ + int num_children; + dlist_head children; + uint32_t magic; }; -/* ---------- static variables ---------- */ -static FdwRoutine pgstrom_arrow_fdw_routine; -static shmem_request_hook_type shmem_request_next = NULL; -static shmem_startup_hook_type shmem_startup_next = NULL; -static arrowMetadataState *arrow_metadata_state = NULL; -static dlist_head arrow_write_redo_list; -static bool arrow_fdw_enabled; /* GUC */ -static bool arrow_fdw_stats_hint_enabled; /* GUC */ -static int arrow_metadata_cache_size_kb; /* GUC */ -static size_t arrow_metadata_cache_size; -static int arrow_record_batch_size_kb; /* GUC */ - -/* ---------- static functions ---------- */ -static Oid arrowTypeToPGTypeOid(ArrowField *field, int *typmod); -static const char *arrowTypeToPGTypeName(ArrowField *field); -static size_t arrowFieldLength(ArrowField *field, int64 nitems); -static bool arrowSchemaCompatibilityCheck(TupleDesc tupdesc, - RecordBatchState *rb_state); -static List *__arrowFdwExtractFilesList(List *options_list, - int *p_parallel_nworkers, - bool *p_writable); -static List *arrowFdwExtractFilesList(List *options_list); -static List *arrowLookupOrBuildMetadataCache(File fdesc, Bitmapset **p_stat_attrs); -static void pg_datum_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, - size_t index, - Datum *p_datum, - bool *p_isnull); -/* routines for writable arrow_fdw foreign tables */ -static void setupArrowSQLbufferSchema(SQLtable *table, TupleDesc tupdesc, - ArrowFileInfo *af_info); -static void setupArrowSQLbufferBatches(SQLtable *table, - ArrowFileInfo *af_info); -static loff_t createArrowWriteRedoLog(File filp, bool is_newfile); -static void writeOutArrowRecordBatch(arrowWriteState *aw_state, - bool with_footer); - -Datum pgstrom_arrow_fdw_handler(PG_FUNCTION_ARGS); -Datum pgstrom_arrow_fdw_validator(PG_FUNCTION_ARGS); -Datum pgstrom_arrow_fdw_precheck_schema(PG_FUNCTION_ARGS); -Datum pgstrom_arrow_fdw_truncate(PG_FUNCTION_ARGS); -Datum pgstrom_arrow_fdw_import_file(PG_FUNCTION_ARGS); +struct arrowMetadataCache +{ + arrowMetadataCacheBlock *owner; + dlist_node chain; /* link to free/hash list */ + dlist_node lru_chain; /* link to lru_list */ + struct timeval lru_tv; /* last access time */ + arrowMetadataCache *next; /* next record-batch if any */ + struct stat stat_buf; /* result of stat(2) */ + int rb_index; /* index number in a file */ + off_t rb_offset; /* offset from the head */ + size_t rb_length; /* length of the entire RecordBatch */ + int64 rb_nitems; /* number of items */ + /* per column information */ + int nfields; + dlist_head fields; /* list of arrowMetadataFieldCache */ + uint32_t magic; +}; /* - * timespec_comp - compare timespec values + * Metadata cache management */ -static inline int -timespec_comp(struct timespec *tv1, struct timespec *tv2) +#define ARROW_METADATA_HASH_NSLOTS 2000 +typedef struct { - if (tv1->tv_sec < tv2->tv_sec) - return -1; - if (tv1->tv_sec > tv2->tv_sec) - return 1; - if (tv1->tv_nsec < tv2->tv_nsec) - return -1; - if (tv1->tv_nsec > tv2->tv_nsec) - return 1; - return 0; -} + LWLock mutex; + slock_t lru_lock; /* protect lru related stuff */ + dlist_head lru_list; + dlist_head free_blocks; /* list of arrowMetadataCacheBlock */ + dlist_head free_mcaches; /* list of arrowMetadataCache */ + dlist_head free_fcaches; /* list of arrowMetadataFieldCache */ + dlist_head hash_slots[ARROW_METADATA_HASH_NSLOTS]; +} arrowMetadataCacheHead; /* - * baseRelIsArrowFdw + * Static variables */ -bool -baseRelIsArrowFdw(RelOptInfo 
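/*
 * How the shared-memory structures above fit together: each 128kB
 * arrowMetadataCacheBlock is a slab serving either arrowMetadataCache
 * entries (one per record-batch, chained through 'next') or
 * arrowMetadataFieldCache entries (one per column, nested through
 * 'children'); only the leader mcache of a file sits in hash_slots[],
 * keyed by st_dev/st_ino.  A sketch that walks one cached file, reading
 * only the declarations above:
 */
#include "postgres.h"

static int
sketch_count_cached_batches(arrowMetadataCache *mcache)
{
	int		count = 0;

	/* leader entry comes from hash_slots[]; followers hang off 'next' */
	for (; mcache != NULL; mcache = mcache->next)
		count++;
	return count;
}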
*baserel) -{ - if ((baserel->reloptkind == RELOPT_BASEREL || - baserel->reloptkind == RELOPT_OTHER_MEMBER_REL) && - baserel->rtekind == RTE_RELATION && - OidIsValid(baserel->serverid) && - baserel->fdwroutine && - memcmp(baserel->fdwroutine, - &pgstrom_arrow_fdw_routine, - sizeof(FdwRoutine)) == 0) - return true; +static FdwRoutine pgstrom_arrow_fdw_routine; +static shmem_request_hook_type shmem_request_next = NULL; +static shmem_startup_hook_type shmem_startup_next = NULL; +static arrowMetadataCacheHead *arrow_metadata_cache = NULL; +static bool arrow_fdw_enabled; /* GUC */ +static bool arrow_fdw_stats_hint_enabled; /* GUC */ +static int arrow_metadata_cache_size_kb; /* GUC */ - return false; -} +PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_handler); +PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_validator); +PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_import_file); +PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_precheck_schema); + +/* ---------------------------------------------------------------- + * + * Apache Arrow <--> PG Types Mapping Routines + * + * ---------------------------------------------------------------- + */ /* - * RelationIsArrowFdw + * arrowFieldGetPGTypeHint */ -bool -RelationIsArrowFdw(Relation frel) +static Oid +arrowFieldGetPGTypeHint(const ArrowField *field) { - if (RelationGetForm(frel)->relkind == RELKIND_FOREIGN_TABLE) + for (int i=0; i < field->_num_custom_metadata; i++) { - FdwRoutine *routine = GetFdwRoutineForRelation(frel, false); + ArrowKeyValue *kv = &field->custom_metadata[i]; + char *namebuf, *pos; + Oid namespace_oid = PG_CATALOG_NAMESPACE; + HeapTuple tup; - if (memcmp(routine, &pgstrom_arrow_fdw_routine, - sizeof(FdwRoutine)) == 0) - return true; + if (strcmp(kv->key, "pg_type") != 0) + continue; + namebuf = alloca(kv->_value_len + 10); + strcpy(namebuf, kv->value); + pos = strchr(namebuf, '.'); + if (pos) + { + *pos++ = '\0'; + namespace_oid = get_namespace_oid(namebuf, true); + if (!OidIsValid(namespace_oid)) + continue; + namebuf = pos; + } + tup = SearchSysCache2(TYPENAMENSP, + PointerGetDatum(namebuf), + ObjectIdGetDatum(namespace_oid)); + if (HeapTupleIsValid(tup)) + { + Oid hint = ((Form_pg_type) GETSTRUCT(tup))->oid; + + ReleaseSysCache(tup); + + return hint; + } } - return false; + return InvalidOid; } -/* - * RecordBatchFieldCount +/* ------------------------------------------------ + * Metadata Cache Management Routines + * + * MEMO: all of these routines require the caller to hold exclusive lock + * on the arrow_metadata_cache->mutex + * ------------------------------------------------ */ -static int -__RecordBatchFieldCount(RecordBatchFieldState *fstate) +static void +__releaseMetadataFieldCache(arrowMetadataFieldCache *fcache) { - int j, count = 1; + arrowMetadataCacheBlock *mc_block = fcache->owner; + + Assert(fcache->magic == ARROW_METADATA_CACHE_ACTIVE_MAGIC); + /* also release sub-fields if any */ + while (!dlist_is_empty(&fcache->children)) + { + arrowMetadataFieldCache *__fcache + = dlist_container(arrowMetadataFieldCache, chain, + dlist_pop_head_node(&fcache->children)); + __releaseMetadataFieldCache(__fcache); + } + fcache->magic = ARROW_METADATA_CACHE_FREE_MAGIC; + dlist_push_tail(&arrow_metadata_cache->free_fcaches, + &fcache->chain); - for (j=0; j < fstate->num_children; j++) - count += __RecordBatchFieldCount(&fstate->children[j]); + /* also give the owner block back if all slab items become free */ + Assert(mc_block->n_actives > 0); + if (--mc_block->n_actives == 0) + { + char *pos = mc_block->data; + char *end = (char *)mc_block + ARROW_METADATA_BLOCKSZ; - return 
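/*
 * arrowFieldGetPGTypeHint() above accepts the field-level custom-metadata
 * pair ("pg_type", "<schema>.<typename>"), where the schema part is
 * optional and defaults to pg_catalog.  A sketch of just the string
 * split, outside of the syscache machinery; the sample value is
 * hypothetical:
 */
#include <string.h>

static void
sketch_split_pg_type_hint(char *namebuf,		/* writable copy of value */
						  const char **schema,	/* out: NULL if omitted */
						  const char **typname)	/* out */
{
	char   *dot = strchr(namebuf, '.');

	if (dot)
	{
		*dot = '\0';
		*schema = namebuf;		/* e.g. "pg_catalog" */
		*typname = dot + 1;		/* e.g. "float2" (hypothetical) */
	}
	else
	{
		*schema = NULL;			/* caller falls back to pg_catalog */
		*typname = namebuf;
	}
}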
count; + Assert(mc_block->unitsz == MAXALIGN(sizeof(arrowMetadataFieldCache))); + while (pos + mc_block->unitsz <= end) + { + arrowMetadataFieldCache *__fcache = (arrowMetadataFieldCache *)pos; + Assert(__fcache->owner == mc_block && + __fcache->magic == ARROW_METADATA_CACHE_FREE_MAGIC); + dlist_delete(&__fcache->chain); + pos += mc_block->unitsz; + } + Assert(!mc_block->chain.prev && + !mc_block->chain.next); /* must be active block */ + dlist_push_tail(&arrow_metadata_cache->free_blocks, + &mc_block->chain); + } } -static int -RecordBatchFieldCount(RecordBatchState *rbstate) +static void +__releaseMetadataCache(arrowMetadataCache *mcache) { - int j, count = 0; + while (mcache) + { + arrowMetadataCacheBlock *mc_block = mcache->owner; + arrowMetadataCache *__mcache_next = mcache->next; - for (j=0; j < rbstate->ncols; j++) - count += __RecordBatchFieldCount(&rbstate->columns[j]); + Assert(mcache->magic == ARROW_METADATA_CACHE_ACTIVE_MAGIC); + /* + * MEMO: The caller has already detached the leader mcache from the + * hash-slot and the LRU-list. The follower mcaches are never + * linked to the hash-slot or the LRU-list. + * So, we just put Assert() here. + */ + Assert(!mcache->chain.prev && !mcache->chain.next && + !mcache->lru_chain.prev && !mcache->lru_chain.next); - return count; + /* also release arrowMetadataFieldCache */ + while (!dlist_is_empty(&mcache->fields)) + { + arrowMetadataFieldCache *fcache + = dlist_container(arrowMetadataFieldCache, chain, + dlist_pop_head_node(&mcache->fields)); + __releaseMetadataFieldCache(fcache); + } + mcache->magic = ARROW_METADATA_CACHE_FREE_MAGIC; + dlist_push_tail(&arrow_metadata_cache->free_mcaches, + &mcache->chain); + /* also give the owner block back if all slab items become free */ + Assert(mc_block->n_actives > 0); + if (--mc_block->n_actives == 0) + { + char *pos = mc_block->data; + char *end = (char *)mc_block + ARROW_METADATA_BLOCKSZ; + + Assert(mc_block->unitsz == MAXALIGN(sizeof(arrowMetadataCache))); + while (pos + mc_block->unitsz <= end) + { + arrowMetadataCache *__mcache = (arrowMetadataCache *)pos; + + Assert(__mcache->owner == mc_block && + __mcache->magic == ARROW_METADATA_CACHE_FREE_MAGIC); + dlist_delete(&__mcache->chain); + pos += mc_block->unitsz; + } + Assert(!mc_block->chain.prev && + !mc_block->chain.next); /* must be active block */ + dlist_push_tail(&arrow_metadata_cache->free_blocks, + &mc_block->chain); + } + mcache = __mcache_next; + } } -/* - * RecordBatchFieldLength - */ -static size_t -RecordBatchFieldLength(RecordBatchFieldState *fstate) +static bool +__reclaimMetadataCache(void) { - size_t len; - int j; + SpinLockAcquire(&arrow_metadata_cache->lru_lock); + if (!dlist_is_empty(&arrow_metadata_cache->lru_list)) + { + arrowMetadataCache *mcache; + dlist_node *dnode; + struct timeval curr_tv; + int64_t elapsed; - len = BLCKALIGN(fstate->nullmap_length + - fstate->values_length + - fstate->extra_length); - for (j=0; j < fstate->num_children; j++) - len += RecordBatchFieldLength(&fstate->children[j]); - return len; + gettimeofday(&curr_tv, NULL); + dnode = dlist_tail_node(&arrow_metadata_cache->lru_list); + mcache = dlist_container(arrowMetadataCache, lru_chain, dnode); + elapsed = ((curr_tv.tv_sec - mcache->lru_tv.tv_sec) * 1000000 + + (curr_tv.tv_usec - mcache->lru_tv.tv_usec)); + if (elapsed > 30000000UL) /* > 30s */ + { + dlist_delete(&mcache->lru_chain); + memset(&mcache->lru_chain, 0, sizeof(dlist_node)); + SpinLockRelease(&arrow_metadata_cache->lru_lock); + dlist_delete(&mcache->chain); + memset(&mcache->chain, 0, 
sizeof(dlist_node)); + + __releaseMetadataCache(mcache); + return true; + } + } + SpinLockRelease(&arrow_metadata_cache->lru_lock); + return false; } -/* - * ArrowGetForeignRelSize - */ -static void -ArrowGetForeignRelSize(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid) +static arrowMetadataFieldCache * +__allocMetadataFieldCache(void) { - ForeignTable *ft = GetForeignTable(foreigntableid); - List *filesList; - Size filesSizeTotal = 0; - Bitmapset *referenced = NULL; - BlockNumber npages = 0; - double ntuples = 0.0; - ListCell *lc; - int parallel_nworkers; - bool writable; - Bitmapset *optimal_gpus = (void *)(~0UL); - int j, k; + arrowMetadataFieldCache *fcache; + dlist_node *dnode; - /* columns to be fetched */ - foreach (lc, baserel->baserestrictinfo) + while (dlist_is_empty(&arrow_metadata_cache->free_fcaches)) { - RestrictInfo *rinfo = lfirst(lc); + arrowMetadataCacheBlock *mc_block; + char *pos, *end; - pull_varattnos((Node *)rinfo->clause, baserel->relid, &referenced); + while (dlist_is_empty(&arrow_metadata_cache->free_blocks)) + { + if (!__reclaimMetadataCache()) + return NULL; + } + dnode = dlist_pop_head_node(&arrow_metadata_cache->free_blocks); + mc_block = dlist_container(arrowMetadataCacheBlock, chain, dnode); + memset(mc_block, 0, offsetof(arrowMetadataCacheBlock, data)); + mc_block->unitsz = MAXALIGN(sizeof(arrowMetadataFieldCache)); + for (pos = mc_block->data, end = (char *)mc_block + ARROW_METADATA_BLOCKSZ; + pos + mc_block->unitsz <= end; + pos += mc_block->unitsz) + { + fcache = (arrowMetadataFieldCache *)pos; + fcache->owner = mc_block; + fcache->magic = ARROW_METADATA_CACHE_FREE_MAGIC; + dlist_push_tail(&arrow_metadata_cache->free_fcaches, + &fcache->chain); + } } - referenced = pgstrom_pullup_outer_refs(root, baserel, referenced); + dnode = dlist_pop_head_node(&arrow_metadata_cache->free_fcaches); + fcache = dlist_container(arrowMetadataFieldCache, chain, dnode); + fcache->owner->n_actives++; + Assert(fcache->magic == ARROW_METADATA_CACHE_FREE_MAGIC); + memset(&fcache->chain, 0, (offsetof(arrowMetadataFieldCache, magic) - + offsetof(arrowMetadataFieldCache, chain))); + fcache->magic = ARROW_METADATA_CACHE_ACTIVE_MAGIC; + return fcache; +} - filesList = __arrowFdwExtractFilesList(ft->options, - ¶llel_nworkers, - &writable); - foreach (lc, filesList) +static arrowMetadataCache * +__allocMetadataCache(void) +{ + arrowMetadataCache *mcache; + dlist_node *dnode; + + if (dlist_is_empty(&arrow_metadata_cache->free_mcaches)) { - char *fname = strVal(lfirst(lc)); - File fdesc; - List *rb_cached; - ListCell *cell; - Bitmapset *__gpus; - size_t len = 0; - - fdesc = PathNameOpenFile(fname, O_RDONLY | PG_BINARY); - if (fdesc < 0) + arrowMetadataCacheBlock *mc_block; + char *pos, *end; + + while (dlist_is_empty(&arrow_metadata_cache->free_blocks)) { - if (writable && errno == ENOENT) - continue; - elog(ERROR, "failed to open file '%s' on behalf of '%s'", - fname, get_rel_name(foreigntableid)); + if (!__reclaimMetadataCache()) + return NULL; } - /* lookup optimal GPUs */ - __gpus = extraSysfsLookupOptimalGpus(fdesc); - if (optimal_gpus == (void *)(~0UL)) - optimal_gpus = __gpus; - else - optimal_gpus = bms_intersect(optimal_gpus, __gpus); - /* lookup or build metadata cache */ - rb_cached = arrowLookupOrBuildMetadataCache(fdesc, NULL); - foreach (cell, rb_cached) + dnode = dlist_pop_head_node(&arrow_metadata_cache->free_blocks); + mc_block = dlist_container(arrowMetadataCacheBlock, chain, dnode); + memset(mc_block, 0, offsetof(arrowMetadataCacheBlock, data)); + 
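/*
 * Both allocators here carve a 128kB arrowMetadataCacheBlock into
 * fixed-size slab items.  The capacity arithmetic, written out as a
 * small self-contained helper; the header size is illustrative, since
 * the real value is offsetof(arrowMetadataCacheBlock, data) and the
 * unit size depends on MAXALIGN and the struct layouts:
 */
#include <stddef.h>

#define SKETCH_BLOCKSZ		(128 * 1024)	/* ARROW_METADATA_BLOCKSZ */
#define SKETCH_HEADERSZ		64				/* ~offsetof(..., data) */

static size_t
sketch_items_per_block(size_t unitsz)
{
	/* the init loops stop once pos + unitsz would pass the block end */
	return (SKETCH_BLOCKSZ - SKETCH_HEADERSZ) / unitsz;
}
/* e.g. a 512-byte cache entry yields (131072 - 64) / 512 = 255 items */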
mc_block->unitsz = MAXALIGN(sizeof(arrowMetadataCache)); + for (pos = mc_block->data, end = (char *)mc_block + ARROW_METADATA_BLOCKSZ; + pos + mc_block->unitsz <= end; + pos += mc_block->unitsz) { - RecordBatchState *rb_state = lfirst(cell); - - if (cell == list_head(rb_cached)) - filesSizeTotal += BLCKALIGN(rb_state->stat_buf.st_size); - - if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, referenced)) - { - for (j=0; j < rb_state->ncols; j++) - len += RecordBatchFieldLength(&rb_state->columns[j]); - } - else - { - for (k = bms_next_member(referenced, -1); - k >= 0; - k = bms_next_member(referenced, k)) - { - j = k + FirstLowInvalidHeapAttributeNumber; - if (j < 0 || j >= rb_state->ncols) - continue; - len += RecordBatchFieldLength(&rb_state->columns[j]); - } - } - ntuples += rb_state->rb_nitems; + mcache = (arrowMetadataCache *)pos; + mcache->owner = mc_block; + mcache->magic = ARROW_METADATA_CACHE_FREE_MAGIC; + dlist_push_tail(&arrow_metadata_cache->free_mcaches, + &mcache->chain); } - npages = len / BLCKSZ; - FileClose(fdesc); } - bms_free(referenced); - - if (optimal_gpus == (void *)(~0UL) || - filesSizeTotal < pgstrom_gpudirect_threshold()) - optimal_gpus = NULL; - - baserel->rel_parallel_workers = parallel_nworkers; - baserel->fdw_private = list_make1(optimal_gpus); - baserel->pages = npages; - baserel->tuples = ntuples; - baserel->rows = ntuples * - clauselist_selectivity(root, - baserel->baserestrictinfo, - 0, - JOIN_INNER, - NULL); + dnode = dlist_pop_head_node(&arrow_metadata_cache->free_mcaches); + mcache = dlist_container(arrowMetadataCache, chain, dnode); + mcache->owner->n_actives++; + Assert(mcache->magic == ARROW_METADATA_CACHE_FREE_MAGIC); + memset(&mcache->chain, 0, (offsetof(arrowMetadataCache, magic) - + offsetof(arrowMetadataCache, chain))); + mcache->magic = ARROW_METADATA_CACHE_ACTIVE_MAGIC; + return mcache; } /* - * GetOptimalGpusForArrowFdw + * lookupArrowMetadataCache * - * optimal GPUs bitmap is saved at baserel->fdw_private + * The caller must hold at least shared lock on the arrow_metadata_cache->mutex. + * If the exclusive lock is held, it may also invalidate a stale cache, if any. */ -Bitmapset * -GetOptimalGpusForArrowFdw(PlannerInfo *root, RelOptInfo *baserel) +static inline uint32_t +arrowMetadataHashIndex(struct stat *stat_buf) { - if (baserel->fdw_private == NIL) - { - RangeTblEntry *rte = root->simple_rte_array[baserel->relid]; + struct { + dev_t st_dev; + ino_t st_ino; + } hkey; + uint32_t hash; - ArrowGetForeignRelSize(root, baserel, rte->relid); - } - return linitial(baserel->fdw_private); + hkey.st_dev = stat_buf->st_dev; + hkey.st_ino = stat_buf->st_ino; + hash = hash_bytes((unsigned char *)&hkey, sizeof(hkey)); + return hash % ARROW_METADATA_HASH_NSLOTS; } -static void -cost_arrow_fdw_seqscan(Path *path, - PlannerInfo *root, - RelOptInfo *baserel, - ParamPathInfo *param_info, - int num_workers) +static arrowMetadataCache * +lookupArrowMetadataCache(struct stat *stat_buf, bool has_exclusive) { - Cost startup_cost = 0.0; - Cost disk_run_cost = 0.0; - Cost cpu_run_cost = 0.0; - QualCost qcost; - double nrows; - double spc_seq_page_cost; + arrowMetadataCache *mcache; + uint32_t hindex; + dlist_iter iter; - if (param_info) - nrows = param_info->ppi_rows; - else - nrows = baserel->rows; - - /* arrow_fdw.enabled */ - if (!arrow_fdw_enabled) - startup_cost += disable_cost; - - /* - * Storage costs - * - * XXX - smaller number of columns to read shall have less disk cost - * because of columnar format. 
Right now, we don't discount cost for - * the pages not to be read. - */ - get_tablespace_page_costs(baserel->reltablespace, - NULL, - &spc_seq_page_cost); - disk_run_cost = spc_seq_page_cost * baserel->pages; - - /* CPU costs */ - if (param_info) - { - cost_qual_eval(&qcost, param_info->ppi_clauses, root); - qcost.startup += baserel->baserestrictcost.startup; - qcost.per_tuple += baserel->baserestrictcost.per_tuple; - } - else - qcost = baserel->baserestrictcost; - startup_cost += qcost.startup; - cpu_run_cost = (cpu_tuple_cost + qcost.per_tuple) * baserel->tuples; - - /* tlist evaluation costs */ - startup_cost += path->pathtarget->cost.startup; - cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows; - - /* adjust cost for CPU parallelism */ - if (num_workers > 0) - { - double leader_contribution; - double parallel_divisor = (double) num_workers; - - /* see get_parallel_divisor() */ - leader_contribution = 1.0 - (0.3 * (double)num_workers); - parallel_divisor += Max(leader_contribution, 0.0); - - /* The CPU cost is divided among all the workers. */ - cpu_run_cost /= parallel_divisor; - - /* Estimated row count per background worker process */ - nrows = clamp_row_est(nrows / parallel_divisor); - } - path->rows = nrows; - path->startup_cost = startup_cost; - path->total_cost = startup_cost + cpu_run_cost + disk_run_cost; - path->parallel_workers = num_workers; -} - -/* - * ArrowGetForeignPaths - */ -static void -ArrowGetForeignPaths(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid) -{ - ForeignPath *fpath; - ParamPathInfo *param_info; - Relids required_outer = baserel->lateral_relids; - - param_info = get_baserel_parampathinfo(root, baserel, required_outer); - - fpath = create_foreignscan_path(root, baserel, - NULL, /* default pathtarget */ - -1, /* dummy */ - -1.0, /* dummy */ - -1.0, /* dummy */ - NIL, /* no pathkeys */ - required_outer, - NULL, /* no extra plan */ - NIL); /* no particular private */ - cost_arrow_fdw_seqscan(&fpath->path, root, baserel, param_info, 0); - add_path(baserel, (Path *)fpath); - - if (baserel->consider_parallel) - { - int num_workers = - compute_parallel_worker(baserel, - baserel->pages, -1.0, - max_parallel_workers_per_gather); - if (num_workers == 0) - return; - - fpath = create_foreignscan_path(root, - baserel, - NULL, /* default pathtarget */ - -1, /* dummy */ - -1.0, /* dummy */ - -1.0, /* dummy */ - NIL, /* no pathkeys */ - required_outer, - NULL, /* no extra plan */ - NIL); /* no particular private */ - fpath->path.parallel_aware = true; - - cost_arrow_fdw_seqscan(&fpath->path, root, baserel, param_info, - num_workers); - add_partial_path(baserel, (Path *)fpath); - } -} - -/* - * ArrowGetForeignPlan - */ -static ForeignScan * -ArrowGetForeignPlan(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid, - ForeignPath *best_path, - List *tlist, - List *scan_clauses, - Plan *outer_plan) -{ - Bitmapset *referenced = NULL; - List *ref_list = NIL; - ListCell *lc; - int j, k; - - foreach (lc, baserel->baserestrictinfo) + hindex = arrowMetadataHashIndex(stat_buf); + dlist_foreach(iter, &arrow_metadata_cache->hash_slots[hindex]) { - RestrictInfo *rinfo = lfirst(lc); + mcache = dlist_container(arrowMetadataCache, chain, iter.cur); - pull_varattnos((Node *)rinfo->clause, baserel->relid, &referenced); - } - referenced = pgstrom_pullup_outer_refs(root, baserel, referenced); - - for (k = bms_next_member(referenced, -1); - k >= 0; - k = bms_next_member(referenced, k)) - { - j = k + FirstLowInvalidHeapAttributeNumber; - ref_list = 
lappend_int(ref_list, j);
+		if (stat_buf->st_dev == mcache->stat_buf.st_dev &&
+			stat_buf->st_ino == mcache->stat_buf.st_ino)
+		{
+			/*
+			 * Is the metadata cache still valid?
+			 */
+			if (stat_buf->st_mtim.tv_sec < mcache->stat_buf.st_mtim.tv_sec ||
+				(stat_buf->st_mtim.tv_sec == mcache->stat_buf.st_mtim.tv_sec &&
+				 stat_buf->st_mtim.tv_nsec <= mcache->stat_buf.st_mtim.tv_nsec))
+			{
+				/* ok, found */
+				SpinLockAcquire(&arrow_metadata_cache->lru_lock);
+				gettimeofday(&mcache->lru_tv, NULL);
+				dlist_move_head(&arrow_metadata_cache->lru_list,
+								&mcache->lru_chain);
+				SpinLockRelease(&arrow_metadata_cache->lru_lock);
+				return mcache;
+			}
+			else if (has_exclusive)
+			{
+				/*
+				 * Unfortunately, the metadata cache is already stale.
+				 * If the caller holds an exclusive lock, we can detach
+				 * and release the stale entry right now.
+				 */
+				SpinLockAcquire(&arrow_metadata_cache->lru_lock);
+				dlist_delete(&mcache->lru_chain);
+				memset(&mcache->lru_chain, 0, sizeof(dlist_node));
+				SpinLockRelease(&arrow_metadata_cache->lru_lock);
+				dlist_delete(&mcache->chain);
+				memset(&mcache->chain, 0, sizeof(dlist_node));
+
+				__releaseMetadataCache(mcache);
+			}
+		}
 	}
-	bms_free(referenced);
-
-	return make_foreignscan(tlist,
-							extract_actual_clauses(scan_clauses, false),
-							baserel->relid,
-							NIL,	/* no expressions to evaluate */
-							ref_list, /* list of referenced attnums */
-							NIL,	/* no custom tlist */
-							NIL,	/* no remote quals */
-							outer_plan);
+	return NULL;
 }

 /* ----------------------------------------------------------------
  *
- * Routines related to min/max statistics and scan hint
+ * buildArrowStatsBinary
  *
- * If mapped Apache Arrow files have custome-metadata of "min_values" and
- * "max_values" at the Field, arrow_fdw deals with this comma separated
- * integer values as min/max value for each field, if any.
- * Once we can know min/max value of the field, we can skip record batches
- * that shall not match with WHERE-clause.
+ * ...and routines related to Arrow min/max statistics
  *
- * This min/max array is expected to have as many integer elements or nulls
- * as there are record-batches.
  * ----------------------------------------------------------------
  */
-
-/*
- * buildArrowStatsBinary
- *
- * It reconstruct binary min/max statistics per record-batch
- * from the custom-metadata of ArrowField.
- */ typedef struct arrowFieldStatsBinary { uint32 nrooms; /* number of record-batches */ - int unitsz; /* unit size of min/max statistics */ - bool *isnull; - char *min_values; - char *max_values; + MinMaxStatDatum *stat_values; int nfields; /* if List/Struct data type */ struct arrowFieldStatsBinary *subfields; } arrowFieldStatsBinary; @@ -648,38 +549,30 @@ typedef struct arrowFieldStatsBinary typedef struct { int nitems; /* number of record-batches */ - int ncols; - arrowFieldStatsBinary columns[FLEXIBLE_ARRAY_MEMBER]; + int nfields; /* number of columns */ + arrowFieldStatsBinary fields[FLEXIBLE_ARRAY_MEMBER]; } arrowStatsBinary; static void __releaseArrowFieldStatsBinary(arrowFieldStatsBinary *bstats) { - int j; - if (bstats->subfields) { - for (j=0; j < bstats->nfields; j++) + for (int j=0; j < bstats->nfields; j++) __releaseArrowFieldStatsBinary(&bstats->subfields[j]); pfree(bstats->subfields); } - if (bstats->isnull) - pfree(bstats->isnull); - if (bstats->min_values) - pfree(bstats->min_values); - if (bstats->max_values) - pfree(bstats->max_values); + if (bstats->stat_values) + pfree(bstats->stat_values); } static void releaseArrowStatsBinary(arrowStatsBinary *arrow_bstats) { - int j; - if (arrow_bstats) { - for (j=0; j < arrow_bstats->ncols; j++) - __releaseArrowFieldStatsBinary(&arrow_bstats->columns[j]); + for (int j=0; j < arrow_bstats->nfields; j++) + __releaseArrowFieldStatsBinary(&arrow_bstats->fields[j]); pfree(arrow_bstats); } } @@ -718,116 +611,20 @@ __parseArrowFieldStatsBinary(arrowFieldStatsBinary *bstats, const char *min_tokens, const char *max_tokens) { - int unitsz = -1; + MinMaxStatDatum *stat_values; char *min_buffer; char *max_buffer; - char *min_values = NULL; - char *max_values = NULL; - bool *isnull = NULL; char *tok1, *pos1; char *tok2, *pos2; - uint32 index; - - /* determine the unitsz of datum */ - switch (field->type.node.tag) - { - case ArrowNodeTag__Int: - switch (field->type.Int.bitWidth) - { - case 8: - unitsz = sizeof(uint8_t); - break; - case 16: - unitsz = sizeof(uint16_t); - break; - case 32: - unitsz = sizeof(uint32_t); - break; - case 64: - unitsz = sizeof(uint64_t); - break; - default: - return false; - } - break; - - case ArrowNodeTag__FloatingPoint: - switch (field->type.FloatingPoint.precision) - { - case ArrowPrecision__Half: - unitsz = sizeof(uint16_t); - break; - case ArrowPrecision__Single: - unitsz = sizeof(uint32_t); - break; - case ArrowPrecision__Double: - unitsz = sizeof(uint64_t); - break; - default: - return false; - } - break; - - case ArrowNodeTag__Decimal: - unitsz = sizeof(int128_t); - break; - - case ArrowNodeTag__Date: - switch (field->type.Date.unit) - { - case ArrowDateUnit__Day: - unitsz = sizeof(uint32_t); - break; - case ArrowDateUnit__MilliSecond: - unitsz = sizeof(uint64_t); - break; - default: - return false; - } - break; - - case ArrowNodeTag__Time: - switch (field->type.Time.unit) - { - case ArrowTimeUnit__Second: - case ArrowTimeUnit__MilliSecond: - unitsz = sizeof(uint32_t); - break; - case ArrowTimeUnit__MicroSecond: - case ArrowTimeUnit__NanoSecond: - unitsz = sizeof(uint64_t); - break; - default: - return false; - } - break; + uint32_t index; - case ArrowNodeTag__Timestamp: - switch (field->type.Timestamp.unit) - { - case ArrowTimeUnit__Second: - case ArrowTimeUnit__MilliSecond: - case ArrowTimeUnit__MicroSecond: - case ArrowTimeUnit__NanoSecond: - unitsz = sizeof(uint64_t); - break; - default: - return false; - } - break; - default: - return false; - } - Assert(unitsz > 0); /* parse the min_tokens/max_tokens 
*/
 	min_buffer = alloca(strlen(min_tokens) + 1);
 	max_buffer = alloca(strlen(max_tokens) + 1);
 	strcpy(min_buffer, min_tokens);
 	strcpy(max_buffer, max_tokens);
-	min_values = palloc0(unitsz * bstats->nrooms);
-	max_values = palloc0(unitsz * bstats->nrooms);
-	isnull = palloc0(sizeof(bool) * bstats->nrooms);
+	stat_values = palloc0(sizeof(MinMaxStatDatum) * bstats->nrooms);
 	for (tok1 = strtok_r(min_buffer, ",", &pos1),
 		 tok2 = strtok_r(max_buffer, ",", &pos2), index = 0;
 		 tok1 != NULL && tok2 != NULL && index < bstats->nrooms;
@@ -839,26 +636,107 @@ __parseArrowFieldStatsBinary(arrowFieldStatsBinary *bstats,
 		int128_t	__max = __atoi128(__trim(tok2), &__isnull);

 		if (__isnull)
-			isnull[index] = true;
-		else
 		{
-			memcpy(min_values + unitsz * index, &__min, unitsz);
-			memcpy(max_values + unitsz * index, &__max, unitsz);
+			stat_values[index].isnull = true;
+			continue;
+		}
+
+		switch (field->type.node.tag)
+		{
+			case ArrowNodeTag__Int:
+			case ArrowNodeTag__FloatingPoint:
+				stat_values[index].min.datum = (Datum)__min;
+				stat_values[index].max.datum = (Datum)__max;
+				break;
+
+			case ArrowNodeTag__Decimal:
+				__xpu_numeric_to_varlena((char *)&stat_values[index].min.numeric,
+										 field->type.Decimal.scale,
+										 __min);
+				__xpu_numeric_to_varlena((char *)&stat_values[index].max.numeric,
+										 field->type.Decimal.scale,
+										 __max);
+				break;
+
+			case ArrowNodeTag__Date:
+				switch (field->type.Date.unit)
+				{
+					case ArrowDateUnit__Day:
+						stat_values[index].min.datum = __min
+							- (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
+						stat_values[index].max.datum = __max
+							- (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
+						break;
+					case ArrowDateUnit__MilliSecond:
+						stat_values[index].min.datum = __min / (SECS_PER_DAY * 1000)
+							- (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
+						stat_values[index].max.datum = __max / (SECS_PER_DAY * 1000)
+							- (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE);
+						break;
+					default:
+						goto bailout;
+				}
+				break;
+
+			case ArrowNodeTag__Time:
+				switch (field->type.Time.unit)
+				{
+					case ArrowTimeUnit__Second:
+						stat_values[index].min.datum = __min * 1000000L;
+						stat_values[index].max.datum = __max * 1000000L;
+						break;
+					case ArrowTimeUnit__MilliSecond:
+						stat_values[index].min.datum = __min * 1000L;
+						stat_values[index].max.datum = __max * 1000L;
+						break;
+					case ArrowTimeUnit__MicroSecond:
+						stat_values[index].min.datum = __min;
+						stat_values[index].max.datum = __max;
+						break;
+					case ArrowTimeUnit__NanoSecond:
+						stat_values[index].min.datum = __min / 1000;
+						stat_values[index].max.datum = __max / 1000;
+						break;
+					default:
+						goto bailout;
+				}
+				break;
+
+			case ArrowNodeTag__Timestamp:
+				switch (field->type.Timestamp.unit)
+				{
+					case ArrowTimeUnit__Second:
+						stat_values[index].min.datum = __min * 1000000L;
+						stat_values[index].max.datum = __max * 1000000L;
+						break;
+					case ArrowTimeUnit__MilliSecond:
+						stat_values[index].min.datum = __min * 1000L;
+						stat_values[index].max.datum = __max * 1000L;
+						break;
+					case ArrowTimeUnit__MicroSecond:
+						stat_values[index].min.datum = __min;
+						stat_values[index].max.datum = __max;
+						break;
+					case ArrowTimeUnit__NanoSecond:
+						stat_values[index].min.datum = __min / 1000;
+						stat_values[index].max.datum = __max / 1000;
+						break;
+					default:
+						goto bailout;
+				}
+				break;
+			default:
+				goto bailout;
 		}
 	}
 	/* sanity checks */
 	if (!tok1 && !tok2 && index == bstats->nrooms)
 	{
-		bstats->unitsz     = unitsz;
-		bstats->isnull     = isnull;
-		bstats->min_values = min_values;
-		bstats->max_values = max_values;
+		bstats->stat_values = stat_values;
 		return true;
 	}
-	/* elsewhere, something wrong */
-	pfree(min_values);
-	
pfree(max_values); - pfree(isnull); +bailout: + pfree(stat_values); return false; } @@ -883,7 +761,6 @@ __buildArrowFieldStatsBinary(arrowFieldStatsBinary *bstats, } bstats->nrooms = numRecordBatches; - bstats->unitsz = -1; if (min_tokens && max_tokens) { if (__parseArrowFieldStatsBinary(bstats, field, @@ -892,20 +769,6 @@ __buildArrowFieldStatsBinary(arrowFieldStatsBinary *bstats, { retval = true; } - else - { - /* parse error, ignore the stat */ - if (bstats->isnull) - pfree(bstats->isnull); - if (bstats->min_values) - pfree(bstats->min_values); - if (bstats->max_values) - pfree(bstats->max_values); - bstats->unitsz = -1; - bstats->isnull = NULL; - bstats->min_values = NULL; - bstats->max_values = NULL; - } } if (field->_num_children > 0) @@ -927,16 +790,16 @@ static arrowStatsBinary * buildArrowStatsBinary(const ArrowFooter *footer, Bitmapset **p_stat_attrs) { arrowStatsBinary *arrow_bstats; - int j, ncols = footer->schema._num_fields; + int nfields = footer->schema._num_fields; bool found = false; arrow_bstats = palloc0(offsetof(arrowStatsBinary, - columns[ncols])); + fields[nfields])); arrow_bstats->nitems = footer->_num_recordBatches; - arrow_bstats->ncols = ncols; - for (j=0; j < ncols; j++) + arrow_bstats->nfields = nfields; + for (int j=0; j < nfields; j++) { - if (__buildArrowFieldStatsBinary(&arrow_bstats->columns[j], + if (__buildArrowFieldStatsBinary(&arrow_bstats->fields[j], &footer->schema.fields[j], footer->_num_recordBatches)) { @@ -955,124 +818,44 @@ buildArrowStatsBinary(const ArrowFooter *footer, Bitmapset **p_stat_attrs) /* * applyArrowStatsBinary - * - * It applies the fetched min/max values on the cached record-batch metadata */ static void -__applyArrowFieldStatsBinary(RecordBatchFieldState *fstate, +__applyArrowFieldStatsBinary(RecordBatchFieldState *rb_field, arrowFieldStatsBinary *bstats, int rb_index) { int j; - if (bstats->unitsz > 0 && - bstats->isnull != NULL && - bstats->min_values != NULL && - bstats->max_values != NULL) + if (bstats->stat_values) { - size_t off = bstats->unitsz * rb_index; - - memcpy(&fstate->stat_min, - bstats->min_values + off, bstats->unitsz); - memcpy(&fstate->stat_max, - bstats->max_values + off, bstats->unitsz); - fstate->stat_isnull = false; + memcpy(&rb_field->stat_datum, + &bstats->stat_values[rb_index], sizeof(MinMaxStatDatum)); } else { - memset(&fstate->stat_min, 0, sizeof(SQLstat__datum)); - memset(&fstate->stat_max, 0, sizeof(SQLstat__datum)); - fstate->stat_isnull = true; + rb_field->stat_datum.isnull = true; } - - Assert(fstate->num_children == bstats->nfields); - for (j=0; j < fstate->num_children; j++) + Assert(rb_field->num_children == bstats->nfields); + for (j=0; j < rb_field->num_children; j++) { - RecordBatchFieldState *__fstate = &fstate->children[j]; + RecordBatchFieldState *__rb_field = &rb_field->children[j]; arrowFieldStatsBinary *__bstats = &bstats->subfields[j]; - __applyArrowFieldStatsBinary(__fstate, __bstats, rb_index); + __applyArrowFieldStatsBinary(__rb_field, __bstats, rb_index); } } static void applyArrowStatsBinary(RecordBatchState *rb_state, arrowStatsBinary *arrow_bstats) { - int j, ncols = rb_state->ncols; - - Assert(rb_state->ncols == arrow_bstats->ncols && + Assert(rb_state->nfields == arrow_bstats->nfields && rb_state->rb_index < arrow_bstats->nitems); - for (j=0; j < ncols; j++) - { - RecordBatchFieldState *fstate = &rb_state->columns[j]; - arrowFieldStatsBinary *bstats = &arrow_bstats->columns[j]; - - __applyArrowFieldStatsBinary(fstate, bstats, rb_state->rb_index); - } -} - -static SQLstat * 
-__buildArrowFieldStatsList(ArrowField *field, uint32 numRecordBatches) -{ - const char *min_tokens = NULL; - const char *max_tokens = NULL; - char *min_buffer; - char *max_buffer; - char *tok1, *pos1; - char *tok2, *pos2; - SQLstat *results = NULL; - int k, index; - - for (k=0; k < field->_num_custom_metadata; k++) - { - ArrowKeyValue *kv = &field->custom_metadata[k]; - - if (strcmp(kv->key, "min_values") == 0) - min_tokens = kv->value; - else if (strcmp(kv->key, "max_values") == 0) - max_tokens = kv->value; - } - if (!min_tokens || !max_tokens) - return NULL; - min_buffer = alloca(strlen(min_tokens) + 1); - max_buffer = alloca(strlen(max_tokens) + 1); - strcpy(min_buffer, min_tokens); - strcpy(max_buffer, max_tokens); - - for (tok1 = strtok_r(min_buffer, ",", &pos1), - tok2 = strtok_r(max_buffer, ",", &pos2), index = 0; - tok1 && tok2; - tok1 = strtok_r(NULL, ",", &pos1), - tok2 = strtok_r(NULL, ",", &pos2), index++) - { - bool __isnull = false; - int128_t __min = __atoi128(__trim(tok1), &__isnull); - int128_t __max = __atoi128(__trim(tok2), &__isnull); - - if (!__isnull) - { - SQLstat *item = palloc0(sizeof(SQLstat)); - - item->next = results; - item->rb_index = index; - item->is_valid = true; - item->min.i128 = __min; - item->max.i128 = __max; - results = item; - } - } - /* sanity checks */ - if (!tok1 && !tok2 && index == numRecordBatches) - return results; - /* ah, error... */ - while (results) + for (int j=0; j < rb_state->nfields; j++) { - SQLstat *next = results->next; - - pfree(results); - results = next; + __applyArrowFieldStatsBinary(&rb_state->fields[j], + &arrow_bstats->fields[j], + rb_state->rb_index); } - return NULL; } /* @@ -1081,7 +864,7 @@ __buildArrowFieldStatsList(ArrowField *field, uint32 numRecordBatches) * ... are executor routines for min/max statistics. */ static bool -__buildArrowStatsOper(arrowStatsHint *arange, +__buildArrowStatsOper(arrowStatsHint *as_hint, ScanState *ss, OpExpr *op, bool reverse) @@ -1109,9 +892,9 @@ __buildArrowStatsOper(arrowStatsHint *arange, arg = linitial(op->args); } /* Is it VAR ARG form? 
*/ - if (!IsA(var, Var) || var->varno != scanrelid) + if (!IsA(var, Var) || var->varno != scanrelid || !OidIsValid(opcode)) return false; - if (!bms_is_member(var->varattno, arange->stat_attrs)) + if (!bms_is_member(var->varattno, as_hint->stat_attrs)) return false; if (contain_var_clause(arg) || contain_volatile_functions(arg)) @@ -1135,10 +918,11 @@ __buildArrowStatsOper(arrowStatsHint *arange, if (strategy == BTLessStrategyNumber || strategy == BTLessEqualStrategyNumber) { - /* (VAR < ARG) --> (Min < ARG) */ - /* (VAR <= ARG) --> (Min <= ARG) */ - arange->load_attrs = bms_add_member(arange->load_attrs, - var->varattno); + /* if (VAR < ARG) --> (Min >= ARG), can be skipped */ + /* if (VAR <= ARG) --> (Min > ARG), can be skipped */ + opcode = get_negator(opcode); + if (!OidIsValid(opcode)) + return false; expr = make_opclause(opcode, op->opresulttype, op->opretset, @@ -1152,15 +936,16 @@ __buildArrowStatsOper(arrowStatsHint *arange, op->opcollid, op->inputcollid); set_opfuncid((OpExpr *)expr); - arange->eval_quals = lappend(arange->eval_quals, expr); + as_hint->eval_quals = lappend(as_hint->eval_quals, expr); } else if (strategy == BTGreaterEqualStrategyNumber || strategy == BTGreaterStrategyNumber) { - /* (VAR >= ARG) --> (Max >= ARG) */ - /* (VAR > ARG) --> (Max > ARG) */ - arange->load_attrs = bms_add_member(arange->load_attrs, - var->varattno); + /* if (VAR > ARG) --> (Max <= ARG), can be skipped */ + /* if (VAR >= ARG) --> (Max < ARG), can be skipped */ + opcode = get_negator(opcode); + if (!OidIsValid(opcode)) + return false; expr = make_opclause(opcode, op->opresulttype, op->opretset, @@ -1174,18 +959,18 @@ __buildArrowStatsOper(arrowStatsHint *arange, op->opcollid, op->inputcollid); set_opfuncid((OpExpr *)expr); - arange->eval_quals = lappend(arange->eval_quals, expr); + as_hint->eval_quals = lappend(as_hint->eval_quals, expr); } else if (strategy == BTEqualStrategyNumber) { - /* (VAR = ARG) --> (Max >= ARG && Min <= ARG) */ + /* (VAR = ARG) --> (Min > ARG) || (Max < ARG), can be skipped */ opcode = get_opfamily_member(opfamily, var->vartype, exprType((Node *)arg), - BTGreaterEqualStrategyNumber); + BTGreaterStrategyNumber); expr = make_opclause(opcode, op->opresulttype, op->opretset, - (Expr *)makeVar(OUTER_VAR, + (Expr *)makeVar(INNER_VAR, var->varattno, var->vartype, var->vartypmod, @@ -1195,7 +980,7 @@ __buildArrowStatsOper(arrowStatsHint *arange, op->opcollid, op->inputcollid); set_opfuncid((OpExpr *)expr); - arange->eval_quals = lappend(arange->eval_quals, expr); + as_hint->eval_quals = lappend(as_hint->eval_quals, expr); opcode = get_opfamily_member(opfamily, var->vartype, exprType((Node *)arg), @@ -1203,7 +988,7 @@ __buildArrowStatsOper(arrowStatsHint *arange, expr = make_opclause(opcode, op->opresulttype, op->opretset, - (Expr *)makeVar(INNER_VAR, + (Expr *)makeVar(OUTER_VAR, var->varattno, var->vartype, var->vartypmod, @@ -1213,169 +998,55 @@ __buildArrowStatsOper(arrowStatsHint *arange, op->opcollid, op->inputcollid); set_opfuncid((OpExpr *)expr); - arange->eval_quals = lappend(arange->eval_quals, expr); + as_hint->eval_quals = lappend(as_hint->eval_quals, expr); } else { return false; } - arange->load_attrs = bms_add_member(arange->load_attrs, - var->varattno); + as_hint->load_attrs = bms_add_member(as_hint->load_attrs, var->varattno); + return true; } static arrowStatsHint * -execInitArrowStatsHint(ScanState *ss, - Bitmapset *stat_attrs, - List *outer_quals) +execInitArrowStatsHint(ScanState *ss, List *outer_quals, Bitmapset *stat_attrs) { Relation relation = 
ss->ss_currentRelation; TupleDesc tupdesc = RelationGetDescr(relation); + arrowStatsHint *as_hint; ExprContext *econtext; - arrowStatsHint *result, temp; Expr *eval_expr; ListCell *lc; - memset(&temp, 0, sizeof(arrowStatsHint)); - temp.stat_attrs = stat_attrs; + as_hint = palloc0(sizeof(arrowStatsHint)); + as_hint->stat_attrs = stat_attrs; foreach (lc, outer_quals) { OpExpr *op = lfirst(lc); if (IsA(op, OpExpr) && list_length(op->args) == 2 && - (__buildArrowStatsOper(&temp, ss, op, false) || - __buildArrowStatsOper(&temp, ss, op, true))) + (__buildArrowStatsOper(as_hint, ss, op, false) || + __buildArrowStatsOper(as_hint, ss, op, true))) { - temp.orig_quals = lappend(temp.orig_quals, copyObject(op)); + as_hint->orig_quals = lappend(as_hint->orig_quals, op); } } - if (!temp.orig_quals) + if (as_hint->eval_quals == NIL) return NULL; - - Assert(list_length(temp.eval_quals) > 0); - if (list_length(temp.eval_quals) == 1) - eval_expr = linitial(temp.eval_quals); + if (list_length(as_hint->eval_quals) == 1) + eval_expr = linitial(as_hint->eval_quals); else - eval_expr = make_andclause(temp.eval_quals); + eval_expr = make_orclause(as_hint->eval_quals); econtext = CreateExprContext(ss->ps.state); econtext->ecxt_innertuple = MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual); econtext->ecxt_outertuple = MakeSingleTupleTableSlot(tupdesc, &TTSOpsVirtual); - result = palloc0(sizeof(arrowStatsHint)); - result->orig_quals = temp.orig_quals; - result->eval_quals = temp.eval_quals; - result->eval_state = ExecInitExpr(eval_expr, &ss->ps); - result->stat_attrs = bms_copy(stat_attrs); - result->load_attrs = temp.load_attrs; - result->econtext = econtext; - - return result; -} - -static bool -__fetchArrowStatsDatum(RecordBatchFieldState *fstate, - SQLstat__datum *sval, - Datum *p_datum, bool *p_isnull) -{ - Datum datum; - int64 shift; - - switch (fstate->atttypid) - { - case INT1OID: - datum = Int8GetDatum(sval->i8); - break; - case INT2OID: - case FLOAT2OID: - datum = Int16GetDatum(sval->i16); - break; - case INT4OID: - case FLOAT4OID: - datum = Int32GetDatum(sval->i32); - break; - case INT8OID: - case FLOAT8OID: - datum = Int64GetDatum(sval->i64); - break; - case NUMERICOID: - { - Int128_t decimal; - int dscale = fstate->attopts.decimal.scale; - char *result = palloc0(sizeof(struct NumericData)); - - decimal.ival = sval->i128; - while (dscale > 0 && decimal.ival % 10 == 0) - { - decimal.ival /= 10; - dscale--; - } - pg_numeric_to_varlena(result, dscale, decimal); - - datum = PointerGetDatum(result); - } - break; - case DATEOID: - shift = POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE; - switch (fstate->attopts.date.unit) - { - case ArrowDateUnit__Day: - datum = DateADTGetDatum((DateADT)sval->i32 - shift); - break; - case ArrowDateUnit__MilliSecond: - datum = DateADTGetDatum((DateADT)sval->i64 / 1000L - shift); - break; - default: - return false; - } - break; + as_hint->eval_state = ExecInitExpr(eval_expr, &ss->ps); + as_hint->econtext = econtext; - case TIMEOID: - switch (fstate->attopts.time.unit) - { - case ArrowTimeUnit__Second: - datum = TimeADTGetDatum((TimeADT)sval->u32 * 1000000L); - break; - case ArrowTimeUnit__MilliSecond: - datum = TimeADTGetDatum((TimeADT)sval->u32 * 1000L); - break; - case ArrowTimeUnit__MicroSecond: - datum = TimeADTGetDatum((TimeADT)sval->u64); - break; - case ArrowTimeUnit__NanoSecond: - datum = TimeADTGetDatum((TimeADT)sval->u64 / 1000L); - break; - default: - return false; - } - break; - case TIMESTAMPOID: - case TIMESTAMPTZOID: - shift = (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * 
USECS_PER_DAY;
-			switch (fstate->attopts.timestamp.unit)
-			{
-				case ArrowTimeUnit__Second:
-					datum = TimestampGetDatum((Timestamp)sval->i64 * 1000000L - shift);
-					break;
-				case ArrowTimeUnit__MilliSecond:
-					datum = TimestampGetDatum((Timestamp)sval->i64 * 1000L - shift);
-					break;
-				case ArrowTimeUnit__MicroSecond:
-					datum = TimestampGetDatum((Timestamp)sval->i64 - shift);
-					break;
-				case ArrowTimeUnit__NanoSecond:
-					datum = TimestampGetDatum((Timestamp)sval->i64 / 1000L - shift);
-					break;
-				default:
-					return false;
-			}
-			break;
-		default:
-			return false;
-	}
-	*p_datum = datum;
-	*p_isnull = false;
-	return true;
+	return as_hint;
 }

 static bool
@@ -1396,32 +1067,32 @@ execCheckArrowStatsHint(arrowStatsHint *stats_hint,
 		 anum >= 0;
 		 anum = bms_next_member(stats_hint->load_attrs, anum))
 	{
-		RecordBatchFieldState *fstate = &rb_state->columns[anum-1];
+		RecordBatchFieldState *rb_field = &rb_state->fields[anum-1];

-		Assert(anum > 0 && anum <= rb_state->ncols);
-		/*
-		 * In case when min/max statistics are missing, we cannot determine
-		 * whether we can skip the current record-batch.
-		 */
-		if (fstate->stat_isnull)
-			return false;
-
-		if (!__fetchArrowStatsDatum(fstate, &fstate->stat_min,
-									&min_values->tts_values[anum-1],
-									&min_values->tts_isnull[anum-1]))
-			return false;
-
-		if (!__fetchArrowStatsDatum(fstate, &fstate->stat_max,
-									&max_values->tts_values[anum-1],
-									&max_values->tts_isnull[anum-1]))
-			return false;
+		Assert(anum > 0 && anum <= rb_state->nfields);
+		if (!rb_field->stat_datum.isnull)
+		{
+			min_values->tts_isnull[anum-1] = false;
+			max_values->tts_isnull[anum-1] = false;
+			if (rb_field->atttypid == NUMERICOID)
+			{
+				min_values->tts_values[anum-1]
+					= PointerGetDatum(&rb_field->stat_datum.min.numeric);
+				max_values->tts_values[anum-1]
+					= PointerGetDatum(&rb_field->stat_datum.max.numeric);
+			}
+			else
+			{
+				min_values->tts_values[anum-1] = rb_field->stat_datum.min.datum;
+				max_values->tts_values[anum-1] = rb_field->stat_datum.max.datum;
+			}
+		}
 	}
 	datum = ExecEvalExprSwitchContext(stats_hint->eval_state, econtext, &isnull);

-//	elog(INFO, "file [%s] rb_index=%u datum=%lu isnull=%d",
 //		 FilePathName(rb_state->fdesc), rb_state->rb_index, datum, (int)isnull);
 	if (!isnull && DatumGetBool(datum))
-		return true;
+		return true;	/* ok, skip this record-batch */
 	return false;
 }

@@ -1438,450 +1109,1085 @@ execEndArrowStatsHint(arrowStatsHint *stats_hint)
 	FreeExprContext(econtext, true);
 }
+
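The rewritten stats-hint path above turns each qualifier into a "skip test" over per-batch min/max statistics: the negator of `<` applied to the batch minimum tells whether every row fails `VAR < ARG`, and equality is skippable when either bound excludes ARG. A minimal standalone sketch of the same decision logic, using plain `long` values in place of Datums and operator OIDs (names here are illustrative, not PG-Strom's):

    #include <stdbool.h>

    typedef struct { long min; long max; } batch_stats;

    /* (VAR < ARG): skip when even the batch minimum fails, i.e. (min >= ARG) */
    static bool
    batch_skippable_lt(const batch_stats *b, long arg)
    {
        return b->min >= arg;
    }

    /* (VAR = ARG): skip when (min > ARG) || (max < ARG) */
    static bool
    batch_skippable_eq(const batch_stats *b, long arg)
    {
        return b->min > arg || b->max < arg;
    }

The executor builds the OR of such tests once (make_orclause above) and evaluates it per record-batch, with the minimum values loaded into the inner tuple slot and the maximum values into the outer one.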
+/* ----------------------------------------------------------------
+ *
+ * BuildArrowFileState
+ *
+ * It builds RecordBatchState based on the metadata-cache, or raw Arrow files.
+ * ----------------------------------------------------------------
+ */
+static void
+__buildRecordBatchFieldStateByCache(RecordBatchFieldState *rb_field,
+									arrowMetadataFieldCache *fcache)
+{
+	rb_field->atttypid = fcache->atttypid;
+	rb_field->atttypmod = fcache->atttypmod;
+	rb_field->attopts = fcache->attopts;
+	rb_field->nitems = fcache->nitems;
+	rb_field->null_count = fcache->null_count;
+	rb_field->nullmap_offset = fcache->nullmap_offset;
+	rb_field->nullmap_length = fcache->nullmap_length;
+	rb_field->values_offset = fcache->values_offset;
+	rb_field->values_length = fcache->values_length;
+	rb_field->extra_offset = fcache->extra_offset;
+	rb_field->extra_length = fcache->extra_length;
+	memcpy(&rb_field->stat_datum,
+		   &fcache->stat_datum, sizeof(MinMaxStatDatum));
+	if (fcache->num_children > 0)
+	{
+		dlist_iter	iter;
+		int			j = 0;
+
+		rb_field->num_children = fcache->num_children;
+		rb_field->children = palloc0(sizeof(RecordBatchFieldState) *
+									 fcache->num_children);
+		dlist_foreach(iter, &fcache->children)
+		{
+			arrowMetadataFieldCache *__fcache
+				= dlist_container(arrowMetadataFieldCache, chain, iter.cur);
+			__buildRecordBatchFieldStateByCache(&rb_field->children[j++], __fcache);
+		}
+		Assert(j == rb_field->num_children);
+	}
+	else
+	{
+		Assert(dlist_is_empty(&fcache->children));
+	}
+}
+
+static ArrowFileState *
+__buildArrowFileStateByCache(const char *filename,
+							 arrowMetadataCache *mcache,
+							 Bitmapset **p_stat_attrs)
+{
+	ArrowFileState *af_state;
+
+	af_state = palloc0(sizeof(ArrowFileState));
+	af_state->filename = pstrdup(filename);
+	memcpy(&af_state->stat_buf, &mcache->stat_buf, sizeof(struct stat));
+
+	while (mcache)
+	{
+		RecordBatchState *rb_state;
+		dlist_iter	iter;
+		int			j = 0;
+
+		rb_state = palloc0(offsetof(RecordBatchState,
+									fields[mcache->nfields]));
+		rb_state->af_state = af_state;
+		rb_state->rb_index = mcache->rb_index;
+		rb_state->rb_offset = mcache->rb_offset;
+		rb_state->rb_length = mcache->rb_length;
+		rb_state->rb_nitems = mcache->rb_nitems;
+		rb_state->nfields = mcache->nfields;
+		dlist_foreach(iter, &mcache->fields)
+		{
+			arrowMetadataFieldCache *fcache;
+
+			fcache = dlist_container(arrowMetadataFieldCache, chain, iter.cur);
+			if (p_stat_attrs && !fcache->stat_datum.isnull)
+				*p_stat_attrs = bms_add_member(*p_stat_attrs, j+1);
+			__buildRecordBatchFieldStateByCache(&rb_state->fields[j++], fcache);
+		}
+		Assert(j == rb_state->nfields);
+		af_state->rb_list = lappend(af_state->rb_list, rb_state);
+
+		mcache = mcache->next;
+	}
+	return af_state;
+}
+
 /*
- * Routines to setup record-batches
+ * Routines to setup RecordBatchState by raw-file
  */
 typedef struct
 {
-	ArrowBuffer	   *buffer_curr;
-	ArrowBuffer	   *buffer_tail;
+	ArrowBuffer	   *buffer_curr;
+	ArrowBuffer	   *buffer_tail;
 	ArrowFieldNode *fnode_curr;
 	ArrowFieldNode *fnode_tail;
 } setupRecordBatchContext;
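__buildArrowFileStateByCache above only ever sees entries that lookupArrowMetadataCache accepted, and that routine keys the cache by (st_dev, st_ino) and treats an entry as valid only while the file's modification time has not advanced past the cached one. A standalone restatement of that test (the function name is illustrative; the st_mtim fields match struct stat on Linux):

    #include <stdbool.h>
    #include <sys/stat.h>

    static bool
    mcache_is_valid(const struct stat *cur, const struct stat *cached)
    {
        if (cur->st_dev != cached->st_dev || cur->st_ino != cached->st_ino)
            return false;               /* different file */
        /* valid while cur mtime <= cached mtime (not modified since caching) */
        return (cur->st_mtim.tv_sec < cached->st_mtim.tv_sec ||
                (cur->st_mtim.tv_sec == cached->st_mtim.tv_sec &&
                 cur->st_mtim.tv_nsec <= cached->st_mtim.tv_nsec));
    }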
-static void
-assignArrowTypeOptions(ArrowTypeOptions *attopts, const ArrowType *atype)
-{
-	memset(attopts, 0, sizeof(ArrowTypeOptions));
-	switch (atype->node.tag)
+static Oid
+__lookupCompositePGType(int nattrs, Oid *type_oids, Oid hint_oid)
+{
+	Relation	rel;
+	ScanKeyData	skeys[3];
+	SysScanDesc	sscan;
+	Oid			comp_oid = InvalidOid;
+
+	rel = table_open(RelationRelationId, AccessShareLock);
+	ScanKeyInit(&skeys[0],
+				Anum_pg_class_relkind,
+				BTEqualStrategyNumber, F_CHAREQ,
+				CharGetDatum(RELKIND_COMPOSITE_TYPE));
+	ScanKeyInit(&skeys[1],
+				Anum_pg_class_relnatts,
+				BTEqualStrategyNumber, F_INT2EQ,
+				Int16GetDatum(nattrs));
+	ScanKeyInit(&skeys[2],
+				Anum_pg_class_oid,
+				BTEqualStrategyNumber, F_OIDNE,
+				ObjectIdGetDatum(hint_oid));
+	sscan = systable_beginscan(rel, InvalidOid, false, NULL,
+							   OidIsValid(hint_oid) ? 3 : 2, skeys);
+	for (;;)
 	{
-		case ArrowNodeTag__Decimal:
-			if (atype->Decimal.precision < SHRT_MIN ||
-				atype->Decimal.precision > SHRT_MAX)
-				elog(ERROR, "Decimal precision is out of range");
-			if (atype->Decimal.scale < SHRT_MIN ||
-				atype->Decimal.scale > SHRT_MAX)
-				elog(ERROR, "Decimal scale is out of range");
-			attopts->decimal.precision = atype->Decimal.precision;
-			attopts->decimal.scale = atype->Decimal.scale;
-			break;
-		case ArrowNodeTag__Date:
-			if (atype->Date.unit == ArrowDateUnit__Day ||
-				atype->Date.unit == ArrowDateUnit__MilliSecond)
-				attopts->date.unit = atype->Date.unit;
-			else
-				elog(ERROR, "unknown unit of Date");
-			break;
-		case ArrowNodeTag__Time:
-			if (atype->Time.unit == ArrowTimeUnit__Second ||
-				atype->Time.unit == ArrowTimeUnit__MilliSecond ||
-				atype->Time.unit == ArrowTimeUnit__MicroSecond ||
-				atype->Time.unit == ArrowTimeUnit__NanoSecond)
-				attopts->time.unit = atype->Time.unit;
-			else
-				elog(ERROR, "unknown unit of Time");
-			break;
-		case ArrowNodeTag__Timestamp:
-			if (atype->Timestamp.unit == ArrowTimeUnit__Second ||
-				atype->Timestamp.unit == ArrowTimeUnit__MilliSecond ||
-				atype->Timestamp.unit == ArrowTimeUnit__MicroSecond ||
-				atype->Timestamp.unit == ArrowTimeUnit__NanoSecond)
-				attopts->timestamp.unit = atype->Timestamp.unit;
-			else
-				elog(ERROR, "unknown unit of Timestamp");
-			break;
-		case ArrowNodeTag__Interval:
-			if (atype->Interval.unit == ArrowIntervalUnit__Year_Month ||
-				atype->Interval.unit == ArrowIntervalUnit__Day_Time)
-				attopts->interval.unit = atype->Interval.unit;
-			else
-				elog(ERROR, "unknown unit of Interval");
-			break;
-		case ArrowNodeTag__FixedSizeBinary:
-			attopts->fixed_size_binary.byteWidth = atype->FixedSizeBinary.byteWidth;
-			break;
-		default:
-			/* no extra attributes */
-			break;
+		HeapTuple	htup;
+		TupleDesc	tupdesc;
+		int			j;
+
+		if (OidIsValid(hint_oid))
+		{
+			comp_oid = hint_oid;
+			hint_oid = InvalidOid;
+		}
+		else
+		{
+			htup = systable_getnext(sscan);
+			if (!HeapTupleIsValid(htup))
+				break;
+			comp_oid = ((Form_pg_class) GETSTRUCT(htup))->oid;
+		}
+
+		if (pg_type_aclcheck(comp_oid,
+							 GetUserId(),
+							 ACL_USAGE) != ACLCHECK_OK)
+			continue;
+
+		tupdesc = lookup_rowtype_tupdesc_noerror(comp_oid, -1, true);
+		if (!tupdesc)
+			continue;
+		if (tupdesc->natts == nattrs)
+		{
+			for (j=0; j < tupdesc->natts; j++)
+			{
+				Form_pg_attribute attr = TupleDescAttr(tupdesc, j);
+
+				if (attr->atttypid != type_oids[j])
+					break;
+			}
+			if (j == tupdesc->natts)
+			{
+				ReleaseTupleDesc(tupdesc);
+				goto found;
+			}
+		}
+		ReleaseTupleDesc(tupdesc);
 	}
+	comp_oid = InvalidOid;	/* not found */
+found:
+	systable_endscan(sscan);
+	table_close(rel, AccessShareLock);
+
+	return comp_oid;
 }

 static void
-setupRecordBatchField(setupRecordBatchContext *con,
-					  RecordBatchFieldState *fstate,
-					  ArrowField *field,
-					  int depth)
-{
-	ArrowBuffer *buffer_curr;
-	ArrowFieldNode *fnode;
-
-	if (con->fnode_curr >= con->fnode_tail)
-		elog(ERROR, "RecordBatch has less ArrowFieldNode than expected");
-	fnode = con->fnode_curr++;
-	fstate->atttypid = arrowTypeToPGTypeOid(field, &fstate->atttypmod);
-	fstate->nitems = fnode->length;
-	fstate->null_count = fnode->null_count;
-	fstate->stat_isnull = true;
+__arrowFieldTypeToPGType(const ArrowField *field,
+						 Oid *p_type_oid,
+						 int32_t *p_type_mod,
+						 ArrowTypeOptions *p_attopts)
+{
+	const ArrowType *t = &field->type;
+	Oid			type_oid = InvalidOid;
+	int32_t		type_mod = -1;
+	Oid			hint_oid = 
arrowFieldGetPGTypeHint(field); + ArrowTypeOptions attopts; - switch (field->type.node.tag) + memset(&attopts, 0, sizeof(ArrowTypeOptions)); + switch (t->node.tag) { case ArrowNodeTag__Int: - case ArrowNodeTag__FloatingPoint: - case ArrowNodeTag__Bool: - case ArrowNodeTag__Decimal: - case ArrowNodeTag__Date: - case ArrowNodeTag__Time: - case ArrowNodeTag__Timestamp: - case ArrowNodeTag__Interval: - case ArrowNodeTag__FixedSizeBinary: - /* fixed length values */ - if (con->buffer_curr + 2 > con->buffer_tail) - elog(ERROR, "RecordBatch has less buffers than expected"); - buffer_curr = con->buffer_curr++; - if (fstate->null_count > 0) + attopts.tag = ArrowType__Int; + switch (t->Int.bitWidth) { - fstate->nullmap_offset = buffer_curr->offset; - fstate->nullmap_length = buffer_curr->length; - if (fstate->nullmap_length < BITMAPLEN(fstate->nitems)) - elog(ERROR, "nullmap length is smaller than expected"); - if ((fstate->nullmap_offset & (MAXIMUM_ALIGNOF - 1)) != 0) - elog(ERROR, "nullmap is not aligned well"); + case 8: + attopts.unitsz = sizeof(int8_t); + type_oid = + GetSysCacheOid2(TYPENAMENSP, + Anum_pg_type_oid, + CStringGetDatum("int1"), + ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); + break; + case 16: + attopts.unitsz = sizeof(int16_t); + type_oid = INT2OID; + break; + case 32: + attopts.unitsz = sizeof(int32_t); + type_oid = INT4OID; + break; + case 64: + attopts.unitsz = sizeof(int64_t); + type_oid = INT8OID; + break; + default: + elog(ERROR, "Arrow::Int bitWidth=%d is not supported", + t->Int.bitWidth); } - buffer_curr = con->buffer_curr++; - fstate->values_offset = buffer_curr->offset; - fstate->values_length = buffer_curr->length; - if (fstate->values_length < arrowFieldLength(field,fstate->nitems)) - elog(ERROR, "values array is smaller than expected"); - if ((fstate->values_offset & (MAXIMUM_ALIGNOF - 1)) != 0) - elog(ERROR, "values array is not aligned well"); + attopts.integer.bitWidth = t->Int.bitWidth; + attopts.integer.is_signed = t->Int.is_signed; break; - case ArrowNodeTag__List: - if (field->_num_children != 1) - elog(ERROR, "Bug? 
List of arrow type is corrupted"); - if (depth > 0) - elog(ERROR, "nested array type is not supported"); - /* nullmap */ - if (con->buffer_curr + 1 > con->buffer_tail) - elog(ERROR, "RecordBatch has less buffers than expected"); - buffer_curr = con->buffer_curr++; - if (fstate->null_count > 0) + case ArrowNodeTag__FloatingPoint: + attopts.tag = ArrowType__FloatingPoint; + switch (t->FloatingPoint.precision) { - fstate->nullmap_offset = buffer_curr->offset; - fstate->nullmap_length = buffer_curr->length; - if (fstate->nullmap_length < BITMAPLEN(fstate->nitems)) - elog(ERROR, "nullmap length is smaller than expected"); - if ((fstate->nullmap_offset & (MAXIMUM_ALIGNOF - 1)) != 0) - elog(ERROR, "nullmap is not aligned well"); + case ArrowPrecision__Half: + attopts.unitsz = sizeof(float2_t); + type_oid = + GetSysCacheOid2(TYPENAMENSP, + Anum_pg_type_oid, + CStringGetDatum("float2"), + ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); + break; + case ArrowPrecision__Single: + attopts.unitsz = sizeof(float4_t); + type_oid = FLOAT4OID; + break; + case ArrowPrecision__Double: + attopts.unitsz = sizeof(float8_t); + type_oid = FLOAT8OID; + break; + default: + elog(ERROR, "Arrow::FloatingPoint unknown precision (%d)", + (int)t->FloatingPoint.precision); } - /* offset values */ - buffer_curr = con->buffer_curr++; - fstate->values_offset = buffer_curr->offset; - fstate->values_length = buffer_curr->length; - if (fstate->values_length < arrowFieldLength(field,fstate->nitems)) - elog(ERROR, "offset array is smaller than expected"); - if ((fstate->values_offset & (MAXIMUM_ALIGNOF - 1)) != 0) - elog(ERROR, "offset array is not aligned well"); - /* setup array element */ - fstate->children = palloc0(sizeof(RecordBatchFieldState)); - setupRecordBatchField(con, - &fstate->children[0], - &field->children[0], - depth+1); - fstate->num_children = 1; + attopts.floating_point.precision = t->FloatingPoint.precision; break; - case ArrowNodeTag__Utf8: - case ArrowNodeTag__Binary: - /* variable length values */ - if (con->buffer_curr + 3 > con->buffer_tail) - elog(ERROR, "RecordBatch has less buffers than expected"); - buffer_curr = con->buffer_curr++; - if (fstate->null_count > 0) - { - fstate->nullmap_offset = buffer_curr->offset; - fstate->nullmap_length = buffer_curr->length; - if (fstate->nullmap_length < BITMAPLEN(fstate->nitems)) - elog(ERROR, "nullmap length is smaller than expected"); - if ((fstate->nullmap_offset & (MAXIMUM_ALIGNOF - 1)) != 0) - elog(ERROR, "nullmap is not aligned well"); - } + case ArrowNodeTag__Bool: + attopts.tag = ArrowType__Bool; + attopts.unitsz = -1; /* values is bitmap */ + type_oid = BOOLOID; + break; - buffer_curr = con->buffer_curr++; - fstate->values_offset = buffer_curr->offset; - fstate->values_length = buffer_curr->length; - if (fstate->values_length < arrowFieldLength(field,fstate->nitems)) - elog(ERROR, "offset array is smaller than expected"); - if ((fstate->values_offset & (MAXIMUM_ALIGNOF - 1)) != 0) - elog(ERROR, "offset array is not aligned well (%lu %lu)", fstate->values_offset, fstate->values_length); - - buffer_curr = con->buffer_curr++; - fstate->extra_offset = buffer_curr->offset; - fstate->extra_length = buffer_curr->length; - if ((fstate->extra_offset & (MAXIMUM_ALIGNOF - 1)) != 0) - elog(ERROR, "extra buffer is not aligned well"); + case ArrowNodeTag__Decimal: + if (t->Decimal.bitWidth != 128) + elog(ERROR, "Arrow::Decimal%u is not supported", t->Decimal.bitWidth); + attopts.tag = ArrowType__Decimal; + attopts.unitsz = sizeof(int128_t); + attopts.decimal.precision = 
t->Decimal.precision;
+			attopts.decimal.scale = t->Decimal.scale;
+			attopts.decimal.bitWidth = t->Decimal.bitWidth;
+			type_oid = NUMERICOID;
 			break;

 		case ArrowNodeTag__Date:
+			attopts.tag = ArrowType__Date;
 			switch (t->Date.unit)
 			{
 				case ArrowDateUnit__Day:
+					attopts.unitsz = sizeof(int32_t);
+					break;
+				case ArrowDateUnit__MilliSecond:
+					attopts.unitsz = sizeof(int64_t);
+					break;
+				default:
+					elog(ERROR, "Arrow::Date unknown unit (%d)",
+						 (int)t->Date.unit);
+			}
+			attopts.date.unit = t->Date.unit;
+			type_oid = DATEOID;
+			break;
+
+		case ArrowNodeTag__Time:
+			attopts.tag = ArrowType__Time;
+			switch (t->Time.unit)
+			{
+				case ArrowTimeUnit__Second:
+				case ArrowTimeUnit__MilliSecond:
+					attopts.unitsz = sizeof(int32_t);
+					break;
+				case ArrowTimeUnit__MicroSecond:
+				case ArrowTimeUnit__NanoSecond:
+					attopts.unitsz = sizeof(int64_t);
+					break;
+				default:
+					elog(ERROR, "unknown Time::unit (%d)",
+						 (int)t->Time.unit);
+			}
+			attopts.time.unit = t->Time.unit;
+			type_oid = TIMEOID;
+			break;
+
+		case ArrowNodeTag__Timestamp:
+			attopts.tag = ArrowType__Timestamp;
+			switch (t->Timestamp.unit)
+			{
+				case ArrowTimeUnit__Second:
+				case ArrowTimeUnit__MilliSecond:
+				case ArrowTimeUnit__MicroSecond:
+				case ArrowTimeUnit__NanoSecond:
+					attopts.unitsz = sizeof(int64_t);
+					break;
+				default:
+					elog(ERROR, "unknown Timestamp::unit (%d)",
+						 (int)t->Timestamp.unit);
+			}
+			attopts.timestamp.unit = t->Timestamp.unit;
+			type_oid = (t->Timestamp.timezone
+						? 
TIMESTAMPTZOID + : TIMESTAMPOID); + break; + + case ArrowNodeTag__Interval: + attopts.tag = ArrowType__Interval; + switch (t->Interval.unit) + { + case ArrowIntervalUnit__Year_Month: + attopts.unitsz = sizeof(int32_t); + break; + case ArrowIntervalUnit__Day_Time: + attopts.unitsz = sizeof(int64_t); + break; + default: + elog(ERROR, "unknown Interval::unit (%d)", + (int)t->Interval.unit); + } + attopts.interval.unit = t->Interval.unit; + type_oid = INTERVALOID; + break; + + case ArrowNodeTag__FixedSizeBinary: + attopts.tag = ArrowType__FixedSizeBinary; + attopts.unitsz = t->FixedSizeBinary.byteWidth; + attopts.fixed_size_binary.byteWidth = t->FixedSizeBinary.byteWidth; + if (t->FixedSizeBinary.byteWidth <= 0 || + t->FixedSizeBinary.byteWidth > BLCKSZ) + elog(ERROR, "arrow_fdw: %s with byteWidth=%d is not supported", + t->node.tagName, + t->FixedSizeBinary.byteWidth); + if (hint_oid == MACADDROID && + t->FixedSizeBinary.byteWidth == 6) + { + type_oid = MACADDROID; + } + else if (hint_oid == INETOID && + (t->FixedSizeBinary.byteWidth == 4 || + t->FixedSizeBinary.byteWidth == 16)) + { + type_oid = INETOID; + } + else + { + type_oid = BPCHAROID; + type_mod = VARHDRSZ + t->FixedSizeBinary.byteWidth; + } + break; + + case ArrowNodeTag__Utf8: + attopts.tag = ArrowType__Utf8; + attopts.unitsz = sizeof(uint32_t); + type_oid = TEXTOID; + break; + + case ArrowNodeTag__LargeUtf8: + attopts.tag = ArrowType__LargeUtf8; + attopts.unitsz = sizeof(uint64_t); + type_oid = TEXTOID; + break; + + case ArrowNodeTag__Binary: + attopts.tag = ArrowType__Binary; + attopts.unitsz = sizeof(uint32_t); + type_oid = BYTEAOID; + break; + + case ArrowNodeTag__LargeBinary: + attopts.tag = ArrowType__LargeBinary; + attopts.unitsz = sizeof(uint64_t); + type_oid = BYTEAOID; + break; + + case ArrowNodeTag__List: + case ArrowNodeTag__LargeList: + if (field->_num_children != 1) + elog(ERROR, "Bug? List of arrow type is corrupted"); + else + { + Oid __type_oid = InvalidOid; + + attopts.tag = ArrowType__List; + attopts.unitsz = (t->node.tag == ArrowNodeTag__List + ? sizeof(uint32_t) + : sizeof(uint64_t)); + __arrowFieldTypeToPGType(&field->children[0], + &__type_oid, + NULL, + NULL); + type_oid = get_array_type(__type_oid); + if (!OidIsValid(type_oid)) + elog(ERROR, "arrow_fdw: no array type for '%s'", + format_type_be(__type_oid)); } + break; - if (field->_num_children > 0) + case ArrowNodeTag__Struct: { - int i; + Oid *__type_oids; - fstate->children = palloc0(sizeof(RecordBatchFieldState) * - field->_num_children); - for (i=0; i < field->_num_children; i++) + attopts.tag = ArrowType__Struct; + attopts.unitsz = 0; /* only nullmap */ + __type_oids = alloca(sizeof(Oid) * (field->_num_children + 1)); + for (int j=0; j < field->_num_children; j++) { - setupRecordBatchField(con, - &fstate->children[i], - &field->children[i], - depth+1); + __arrowFieldTypeToPGType(&field->children[j], + &__type_oids[j], + NULL, + NULL); } + type_oid = __lookupCompositePGType(field->_num_children, + __type_oids, + hint_oid); + if (!OidIsValid(type_oid)) + elog(ERROR, "arrow_fdw: no suitable composite type"); } - fstate->num_children = field->_num_children; + break; + + default: + elog(ERROR, "Bug? 
ArrowSchema contains unsupported types"); + } + + if (p_type_oid) + *p_type_oid = type_oid; + if (p_type_mod) + *p_type_mod = type_mod; + if (p_attopts) + memcpy(p_attopts, &attopts, sizeof(ArrowTypeOptions)); +} + +static void +__buildRecordBatchFieldState(setupRecordBatchContext *con, + RecordBatchFieldState *rb_field, + ArrowField *field, int depth) +{ + ArrowFieldNode *fnode; + ArrowBuffer *buffer_curr; + size_t least_values_length = 0; + bool has_extra_buffer = false; + + if (con->fnode_curr >= con->fnode_tail) + elog(ERROR, "RecordBatch has less ArrowFieldNode than expected"); + fnode = con->fnode_curr++; + rb_field->atttypid = InvalidOid; + rb_field->atttypmod = -1; + rb_field->nitems = fnode->length; + rb_field->null_count = fnode->null_count; + rb_field->stat_datum.isnull = true; + __arrowFieldTypeToPGType(field, + &rb_field->atttypid, + &rb_field->atttypmod, + &rb_field->attopts); + /* assign buffers */ + switch (field->type.node.tag) + { + case ArrowNodeTag__Bool: + least_values_length = BITMAPLEN(rb_field->nitems); + break; + case ArrowNodeTag__Int: + case ArrowNodeTag__FloatingPoint: + case ArrowNodeTag__Decimal: + case ArrowNodeTag__Date: + case ArrowNodeTag__Time: + case ArrowNodeTag__Timestamp: + case ArrowNodeTag__Interval: + case ArrowNodeTag__FixedSizeBinary: + least_values_length = rb_field->attopts.unitsz * rb_field->nitems; + break; + + case ArrowNodeTag__Utf8: + case ArrowNodeTag__LargeUtf8: + case ArrowNodeTag__Binary: + case ArrowNodeTag__LargeBinary: + least_values_length = rb_field->attopts.unitsz * (rb_field->nitems + 1); + has_extra_buffer = true; + break; + + case ArrowNodeTag__List: + case ArrowNodeTag__LargeList: + if (depth > 0) + elog(ERROR, "nested array type is not supported"); + least_values_length = rb_field->attopts.unitsz * (rb_field->nitems + 1); + break; + + case ArrowNodeTag__Struct: + if (depth > 0) + elog(ERROR, "nested composite type is not supported"); + /* no values and extra buffer, only nullmap */ break; default: elog(ERROR, "Bug? ArrowSchema contains unsupported types"); } - /* assign extra attributes (precision, unitsz, ...) 
*/
-	assignArrowTypeOptions(&fstate->attopts, &field->type);
+
+	/* setup nullmap buffer */
+	buffer_curr = con->buffer_curr++;
+	if (buffer_curr >= con->buffer_tail)
+		elog(ERROR, "RecordBatch has less buffers than expected");
+	if (rb_field->null_count > 0)
+	{
+		rb_field->nullmap_offset = buffer_curr->offset;
+		rb_field->nullmap_length = buffer_curr->length;
+		if (rb_field->nullmap_length < BITMAPLEN(rb_field->nitems))
+			elog(ERROR, "nullmap length is smaller than expected");
+		if (rb_field->nullmap_offset != MAXALIGN(rb_field->nullmap_offset))
+			elog(ERROR, "nullmap is not aligned well");
+	}
+
+	/* setup values buffer */
+	if (least_values_length > 0)
+	{
+		buffer_curr = con->buffer_curr++;
+		if (buffer_curr >= con->buffer_tail)
+			elog(ERROR, "RecordBatch has less buffers than expected");
+		rb_field->values_offset = buffer_curr->offset;
+		rb_field->values_length = buffer_curr->length;
+		if (rb_field->values_length < least_values_length)
+			elog(ERROR, "values array is smaller than expected");
+		if (rb_field->values_offset != MAXALIGN(rb_field->values_offset))
+			elog(ERROR, "values array is not aligned well");
+	}
+
+	/* setup extra buffer */
+	if (has_extra_buffer)
+	{
+		Assert(least_values_length > 0);
+		buffer_curr = con->buffer_curr++;
+		if (buffer_curr >= con->buffer_tail)
+			elog(ERROR, "RecordBatch has less buffers than expected");
+		rb_field->extra_offset = buffer_curr->offset;
+		rb_field->extra_length = buffer_curr->length;
+		if (rb_field->extra_offset != MAXALIGN(rb_field->extra_offset))
+			elog(ERROR, "extra buffer is not aligned well");
+	}
+
+	/* child fields, if any */
+	if (field->_num_children > 0)
+	{
+		rb_field->children = palloc0(sizeof(RecordBatchFieldState) *
+									 field->_num_children);
+		for (int j=0; j < field->_num_children; j++)
+		{
+			__buildRecordBatchFieldState(con,
+										 &rb_field->children[j],
+										 &field->children[j],
+										 depth+1);
+		}
+	}
+	rb_field->num_children = field->_num_children;
 }
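__buildRecordBatchFieldState above walks the RecordBatch buffers in a fixed order: every field consumes a nullmap buffer, fixed-width and varlena fields add a values (or offsets) buffer, and varlena fields add one extra buffer for the payload. A standalone sketch of that layout rule (the enum and names are illustrative, not PG-Strom's types):

    typedef enum { FK_BOOL, FK_FIXED, FK_VARLENA, FK_LIST, FK_STRUCT } field_kind;

    static int
    expected_buffer_count(field_kind kind)
    {
        switch (kind)
        {
            case FK_STRUCT:   return 1;   /* nullmap only */
            case FK_BOOL:
            case FK_FIXED:
            case FK_LIST:     return 2;   /* nullmap + values (or offsets) */
            case FK_VARLENA:  return 3;   /* nullmap + offsets + extra payload */
        }
        return -1;                        /* unreachable for valid input */
    }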

 static RecordBatchState *
-makeRecordBatchState(ArrowSchema *schema,
-					 ArrowBlock *block,
-					 ArrowRecordBatch *rbatch)
+__buildRecordBatchStateOne(ArrowSchema *schema,
+						   ArrowFileState *af_state,
+						   int rb_index,
+						   ArrowBlock *block,
+						   ArrowRecordBatch *rbatch)
 {
 	setupRecordBatchContext con;
-	RecordBatchState *result;
-	int			j, ncols = schema->_num_fields;
+	RecordBatchState *rb_state;
+	int			nfields = schema->_num_fields;

-	/*
-	 * Right now, we have no support for compressed RecordBatches
-	 */
 	if (rbatch->compression)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("arrow_fdw: compressed record-batches are not supported")));
-
-	result = palloc0(offsetof(RecordBatchState, columns[ncols]));
-	result->ncols = ncols;
-	result->rb_offset = block->offset + block->metaDataLength;
-	result->rb_length = block->bodyLength;
-	result->rb_nitems = rbatch->length;
+		elog(ERROR, "arrow_fdw: right now, compressed record-batches are not supported");
+
+	rb_state = palloc0(offsetof(RecordBatchState, fields[nfields]));
+	rb_state->af_state = af_state;
+	rb_state->rb_index = rb_index;
+	rb_state->rb_offset = block->offset + block->metaDataLength;
+	rb_state->rb_length = block->bodyLength;
+	rb_state->rb_nitems = rbatch->length;
+	rb_state->nfields = nfields;
 	memset(&con, 0, sizeof(setupRecordBatchContext));
 	con.buffer_curr = rbatch->buffers;
 	con.buffer_tail = rbatch->buffers + rbatch->_num_buffers;
 	con.fnode_curr = rbatch->nodes;
 	con.fnode_tail = rbatch->nodes + rbatch->_num_nodes;
-
-	for (j=0; j < ncols; j++)
+	for (int j=0; j < nfields; j++)
 	{
-		RecordBatchFieldState *fstate = &result->columns[j];
+		RecordBatchFieldState *rb_field = &rb_state->fields[j];
 		ArrowField *field = &schema->fields[j];

-		setupRecordBatchField(&con, fstate, field, 0);
+		__buildRecordBatchFieldState(&con, rb_field, field, 0);
 	}
 	if (con.buffer_curr != con.buffer_tail ||
 		con.fnode_curr != con.fnode_tail)
-		elog(ERROR, "arrow_fdw: RecordBatch may have corruption.");
-
-	return result;
+		elog(ERROR, "arrow_fdw: RecordBatch may be corrupted");
+	return rb_state;
 }

 /*
- * ExecInitArrowFdw
+ * readArrowFile
  */
-ArrowFdwState *
-ExecInitArrowFdw(ScanState *ss,
-				 GpuContext *gcontext,
-				 List *outer_quals,
-				 Bitmapset *outer_refs)
+static bool
+readArrowFile(const char *filename, ArrowFileInfo *af_info, bool missing_ok)
 {
-	Relation	relation = ss->ss_currentRelation;
-	TupleDesc	tupdesc = RelationGetDescr(relation);
-	ForeignTable *ft = GetForeignTable(RelationGetRelid(relation));
-	List	   *filesList = NIL;
-	List	   *fdescList = NIL;
-	List	   *gpuDirectFileDescList = NIL;
-	Bitmapset  *referenced = NULL;
-	Bitmapset  *stat_attrs = NULL;
-	bool		whole_row_ref = false;
-	ArrowFdwState *af_state;
-	List	   *rb_state_list = NIL;
-	ListCell   *lc;
-	bool		writable;
-	int			i, num_rbatches;
+	File	filp = PathNameOpenFile(filename, O_RDONLY | PG_BINARY);

-	Assert(RelationGetForm(relation)->relkind == RELKIND_FOREIGN_TABLE &&
-		   memcmp(GetFdwRoutineForRelation(relation, false),
-				  &pgstrom_arrow_fdw_routine, sizeof(FdwRoutine)) == 0);
-	/* expand 'referenced' if it has whole-row reference */
-	if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, outer_refs))
-		whole_row_ref = true;
-	for (i=0; i < tupdesc->natts; i++)
+	if (filp < 0)
 	{
-		Form_pg_attribute attr = tupleDescAttr(tupdesc, i);
-		int		k = attr->attnum - FirstLowInvalidHeapAttributeNumber;
+		if (missing_ok && errno == ENOENT)
+			return false;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", filename)));
+	}
+	readArrowFileDesc(FileGetRawDesc(filp), af_info);
+	FileClose(filp);
+	if (af_info->dictionaries != NULL)
+		elog(ERROR, "DictionaryBatch is not supported at '%s'", filename);
+	Assert(af_info->footer._num_dictionaries == 0);
+	return true;
+}
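readArrowFile above returns false (rather than raising an error) only for ENOENT when missing_ok is set, which lets a foreign table reference files that have not been created yet. The same open-or-skip protocol in a standalone form, with plain open(2) and stderr standing in for PathNameOpenFile and ereport():

    #include <errno.h>
    #include <fcntl.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    static bool
    open_or_skip(const char *filename, bool missing_ok, int *p_fdesc)
    {
        int fdesc = open(filename, O_RDONLY);

        if (fdesc < 0)
        {
            if (missing_ok && errno == ENOENT)
                return false;           /* silently skip missing files */
            fprintf(stderr, "could not open file \"%s\"\n", filename);
            exit(1);
        }
        *p_fdesc = fdesc;
        return true;
    }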
-		if (attr->attisdropped)
-			continue;
-		if (whole_row_ref || bms_is_member(k, outer_refs))
-			referenced = bms_add_member(referenced, k);
+static ArrowFileState *
+__buildArrowFileStateByFile(const char *filename, Bitmapset **p_stat_attrs)
+{
+	ArrowFileInfo	af_info;
+	ArrowFileState *af_state;
+	arrowStatsBinary *arrow_bstats;
+
+	if (!readArrowFile(filename, &af_info, true))
+	{
+		elog(DEBUG2, "file '%s' is missing: %m", filename);
+		return NULL;
+	}
+	if (af_info.recordBatches == NULL)
+	{
+		elog(DEBUG2, "arrow file '%s' contains no RecordBatch", filename);
+		return NULL;
+	}
+	/* allocate ArrowFileState */
+	af_state = palloc0(sizeof(ArrowFileState));
+	af_state->filename = pstrdup(filename);
+	memcpy(&af_state->stat_buf, &af_info.stat_buf, sizeof(struct stat));

-	filesList = __arrowFdwExtractFilesList(ft->options,
-										   NULL,
-										   &writable);
-	foreach (lc, filesList)
+	arrow_bstats = buildArrowStatsBinary(&af_info.footer, p_stat_attrs);
+	for (int i=0; i < af_info.footer._num_recordBatches; i++)
 	{
-		char	   *fname = strVal(lfirst(lc));
-		File		fdesc;
-		List	   *rb_cached = NIL;
-		ListCell   *cell;
-		GPUDirectFileDesc *dfile = NULL;
-
-		fdesc = PathNameOpenFile(fname, O_RDONLY | PG_BINARY);
-		if (fdesc < 0)
-		{
-			if (writable && errno == ENOENT)
-				continue;
-			elog(ERROR, "failed to open '%s' on behalf of '%s'",
-				 fname, RelationGetRelationName(relation));
-		}
-		fdescList = lappend_int(fdescList, fdesc);
-		/*
-		 * Open file for GPUDirect I/O
-		 */
-		if (gcontext)
-		{
-			dfile = palloc0(sizeof(GPUDirectFileDesc));
+		ArrowBlock	   *block = &af_info.footer.recordBatches[i];
+		ArrowRecordBatch *rbatch = &af_info.recordBatches[i].body.recordBatch;
+		RecordBatchState *rb_state;

-			gpuDirectFileDescOpen(dfile, fdesc);
-			if (!trackRawFileDesc(gcontext, dfile, __FILE__, __LINE__))
-			{
-				gpuDirectFileDescClose(dfile);
-				elog(ERROR, "out of memory");
-			}
-			gpuDirectFileDescList = lappend(gpuDirectFileDescList, dfile);
-		}
+		rb_state = __buildRecordBatchStateOne(&af_info.footer.schema,
+											  af_state, i, block, rbatch);
+		if (arrow_bstats)
+			applyArrowStatsBinary(rb_state, arrow_bstats);
+		af_state->rb_list = lappend(af_state->rb_list, rb_state);
+	}
+	releaseArrowStatsBinary(arrow_bstats);

-		rb_cached = arrowLookupOrBuildMetadataCache(fdesc, &stat_attrs);
-		/* check schema compatibility */
-		foreach (cell, rb_cached)
-		{
-			RecordBatchState *rb_state = lfirst(cell);
+	return af_state;
+}

-			if (!arrowSchemaCompatibilityCheck(tupdesc, rb_state))
-				elog(ERROR, "arrow file '%s' on behalf of foreign table '%s' has incompatible schema definition",
-					 fname, RelationGetRelationName(relation));
-			/* GPUDirect I/O state, if any */
-			rb_state->dfile = dfile;
+static arrowMetadataFieldCache *
+__buildArrowMetadataFieldCache(RecordBatchFieldState *rb_field)
+{
+	arrowMetadataFieldCache *fcache;
+
+	fcache = __allocMetadataFieldCache();
+	if (!fcache)
+		return NULL;
+	fcache->atttypid = rb_field->atttypid;
+	fcache->atttypmod = rb_field->atttypmod;
+	memcpy(&fcache->attopts, &rb_field->attopts, sizeof(ArrowTypeOptions));
+	fcache->nitems = rb_field->nitems;
+	fcache->null_count = rb_field->null_count;
+	fcache->nullmap_offset = rb_field->nullmap_offset;
+	fcache->nullmap_length = rb_field->nullmap_length;
+	fcache->values_offset = rb_field->values_offset;
+	fcache->values_length = rb_field->values_length;
+	fcache->extra_offset = rb_field->extra_offset;
+	fcache->extra_length = rb_field->extra_length;
+	memcpy(&fcache->stat_datum,
+		   &rb_field->stat_datum, sizeof(MinMaxStatDatum));
+	fcache->num_children = rb_field->num_children;
+	dlist_init(&fcache->children);
+	for (int j=0; j < rb_field->num_children; j++)
+	{
+		arrowMetadataFieldCache *__fcache;
+
+		__fcache = __buildArrowMetadataFieldCache(&rb_field->children[j]);
+		if (!__fcache)
+		{
+			__releaseMetadataFieldCache(fcache);
+			return NULL;
 		}
-		rb_state_list = list_concat(rb_state_list, rb_cached);
+		dlist_push_tail(&fcache->children, &__fcache->chain);
 	}
+	return fcache;
 }
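Because __allocMetadataFieldCache draws from a fixed-size shared pool, any allocation above may fail, and __buildArrowMetadataFieldCache then releases everything it has built so far instead of leaving a partial tree behind. A standalone sketch of that all-or-nothing construction pattern, with malloc standing in for the shared-memory allocator:

    #include <stdlib.h>

    typedef struct node { struct node *next; } node;

    static void
    release_chain(node *head)
    {
        while (head)
        {
            node *next = head->next;
            free(head);
            head = next;
        }
    }

    /* returns NULL (and frees everything) if any allocation fails */
    static node *
    build_chain(int nitems)
    {
        node *head = NULL;

        for (int i = 0; i < nitems; i++)
        {
            node *n = malloc(sizeof(node));

            if (!n)
            {
                release_chain(head);
                return NULL;    /* all-or-nothing: no partial chains leak */
            }
            n->next = head;
            head = n;
        }
        return head;
    }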

 /*
- * ArrowBeginForeignScan
+ * __buildArrowMetadataCacheNoLock
+ *
+ * It builds arrowMetadataCache entries according to the supplied
+ * ArrowFileState.
  */
 static void
-ArrowBeginForeignScan(ForeignScanState *node, int eflags)
+__buildArrowMetadataCacheNoLock(ArrowFileState *af_state)
 {
-	Relation	relation = node->ss.ss_currentRelation;
-	TupleDesc	tupdesc = RelationGetDescr(relation);
-	ForeignScan *fscan = (ForeignScan *) node->ss.ps.plan;
-	ListCell   *lc;
-	Bitmapset  *referenced = NULL;
+	arrowMetadataCache *mcache_head = NULL;
+	arrowMetadataCache *mcache_prev = NULL;
+	arrowMetadataCache *mcache;
+	uint32_t	hindex;
+	ListCell   *lc;

-	foreach (lc, fscan->fdw_private)
+	foreach (lc, af_state->rb_list)
 	{
-		int		j = lfirst_int(lc);
+		RecordBatchState *rb_state = lfirst(lc);
+
+		mcache = __allocMetadataCache();
+		if (!mcache)
+		{
+			__releaseMetadataCache(mcache_head);
+			return;
+		}
+		memcpy(&mcache->stat_buf,
+			   &af_state->stat_buf, sizeof(struct stat));
+		mcache->rb_index = rb_state->rb_index;
+		mcache->rb_offset = rb_state->rb_offset;
+		mcache->rb_length = rb_state->rb_length;
+		mcache->rb_nitems = rb_state->rb_nitems;
+		mcache->nfields = rb_state->nfields;
+		dlist_init(&mcache->fields);
+		if (!mcache_head)
+			mcache_head = mcache;
+		else
+			mcache_prev->next = mcache;

-		if (j >= 0 && j <= tupdesc->natts)
-			referenced = bms_add_member(referenced, j -
-										FirstLowInvalidHeapAttributeNumber);
+		for (int j=0; j < rb_state->nfields; j++)
+		{
+			arrowMetadataFieldCache *fcache;
+
+			fcache = __buildArrowMetadataFieldCache(&rb_state->fields[j]);
+			if (!fcache)
+			{
+				__releaseMetadataCache(mcache_head);
+				return;
+			}
+			dlist_push_tail(&mcache->fields, &fcache->chain);
+		}
+		mcache_prev = mcache;
 	}
-	node->fdw_state = ExecInitArrowFdw(&node->ss,
-									   NULL,
-									   fscan->scan.plan.qual,
-									   referenced);
+	/* chain to the list */
+	hindex = arrowMetadataHashIndex(&af_state->stat_buf);
+	dlist_push_tail(&arrow_metadata_cache->hash_slots[hindex],
+					&mcache_head->chain);
+	SpinLockAcquire(&arrow_metadata_cache->lru_lock);
+	gettimeofday(&mcache_head->lru_tv, NULL);
+	dlist_push_head(&arrow_metadata_cache->lru_list, &mcache_head->lru_chain);
+	SpinLockRelease(&arrow_metadata_cache->lru_lock);
 }
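BuildArrowFileState below takes the metadata lock in shared mode first and, on a cache miss, releases it and re-acquires it exclusively; after the upgrade it must look up the cache again, because another backend may have built the entry in the window where no lock was held. A standalone sketch of that double-checked pattern with a pthread rwlock (names and the int payload are illustrative):

    #include <pthread.h>
    #include <stddef.h>

    static pthread_rwlock_t cache_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int  cache_value;             /* stand-in for the metadata cache */
    static int *cache_entry = NULL;

    static int *
    lookup_or_build(void)
    {
        int *entry;

        pthread_rwlock_rdlock(&cache_lock);
        entry = cache_entry;
        if (!entry)
        {
            /* upgrade: drop the shared lock, take the exclusive one */
            pthread_rwlock_unlock(&cache_lock);
            pthread_rwlock_wrlock(&cache_lock);
            /* re-check: another backend may have built it while unlocked */
            entry = cache_entry;
            if (!entry)
            {
                cache_value = 42;        /* build the cache entry */
                cache_entry = &cache_value;
                entry = cache_entry;
            }
        }
        pthread_rwlock_unlock(&cache_lock);
        return entry;
    }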
- if (f_pos == con->f_offset) + LWLockAcquire(&arrow_metadata_cache->mutex, LW_EXCLUSIVE); + mcache = lookupArrowMetadataCache(&af_state->stat_buf, true); + if (!mcache) + __buildArrowMetadataCacheNoLock(af_state); + } + LWLockRelease(&arrow_metadata_cache->mutex); + + /* compatibility checks */ + rb_state = linitial(af_state->rb_list); + tupdesc = RelationGetDescr(frel); + if (tupdesc->natts != rb_state->nfields) + elog(ERROR, "arrow_fdw: foreign table '%s' is not compatible to '%s'", + RelationGetRelationName(frel), filename); + for (int j=0; j < tupdesc->natts; j++) { - /* good, buffer is fully continuous */ - *p_cmeta_offset = __kds_packed(con->m_offset); - *p_cmeta_length = __kds_packed(__length); + Form_pg_attribute attr = TupleDescAttr(tupdesc, j); + RecordBatchFieldState *rb_field = &rb_state->fields[j]; - con->m_offset += __length; - con->f_offset += __length; + if (attr->atttypid != rb_field->atttypid) + elog(ERROR, "arrow_fdw: foreign table '%s' column '%s' (%s) is not compatible to the arrow field (%s) in the '%s'", + RelationGetRelationName(frel), + NameStr(attr->attname), + format_type_be(attr->atttypid), + format_type_be(rb_field->atttypid), + filename); } - else if (f_pos > con->f_offset && - (f_pos & ~PAGE_MASK) == (con->f_offset & ~PAGE_MASK) && - ((f_pos - con->f_offset) & (MAXIMUM_ALIGNOF-1)) == 0) - { - /* + return af_state; +} + +/* + * baseRelIsArrowFdw + */ +bool +baseRelIsArrowFdw(RelOptInfo *baserel) +{ + if ((baserel->reloptkind == RELOPT_BASEREL || + baserel->reloptkind == RELOPT_OTHER_MEMBER_REL) && + baserel->rtekind == RTE_RELATION && + OidIsValid(baserel->serverid) && + baserel->fdwroutine && + memcmp(baserel->fdwroutine, + &pgstrom_arrow_fdw_routine, + sizeof(FdwRoutine)) == 0) + return true; + + return false; +} + +/* + * RelationIsArrowFdw + */ +bool +RelationIsArrowFdw(Relation frel) +{ + if (RelationGetForm(frel)->relkind == RELKIND_FOREIGN_TABLE) + { + FdwRoutine *routine = GetFdwRoutineForRelation(frel, false); + + if (memcmp(routine, &pgstrom_arrow_fdw_routine, sizeof(FdwRoutine)) == 0) + return true; + } + return false; +} + +/* + * GetOptimalGpusForArrowFdw + */ +const Bitmapset * +GetOptimalGpusForArrowFdw(PlannerInfo *root, RelOptInfo *baserel) +{ + List *priv_list = (List *)baserel->fdw_private; + Bitmapset *optimal_gpus = NULL; + + if (baseRelIsArrowFdw(baserel) && + IsA(priv_list, List) && list_length(priv_list) == 2) + { + List *af_list = linitial(priv_list); + ListCell *lc; + + foreach (lc, af_list) + { + ArrowFileState *af_state = lfirst(lc); + const Bitmapset *__optimal_gpus; + + __optimal_gpus = GetOptimalGpuForFile(af_state->filename); + if (lc == list_head(af_list)) + optimal_gpus = bms_copy(__optimal_gpus); + else + optimal_gpus = bms_intersect(optimal_gpus, __optimal_gpus); + } + } + return optimal_gpus; +} + +/* + * GetOptimalDpuForArrowFdw + */ +const DpuStorageEntry * +GetOptimalDpuForArrowFdw(PlannerInfo *root, RelOptInfo *baserel) +{ + const DpuStorageEntry *ds_entry = NULL; + List *priv_list = (List *)baserel->fdw_private; + + if (baseRelIsArrowFdw(baserel) && + IsA(priv_list, List) && list_length(priv_list) == 2) + { + List *af_list = linitial(priv_list); + ListCell *lc; + + foreach (lc, af_list) + { + ArrowFileState *af_state = lfirst(lc); + const DpuStorageEntry *__ds_entry; + + __ds_entry = GetOptimalDpuForFile(af_state->filename, NULL); + if (lc == list_head(af_list)) + ds_entry = __ds_entry; + else if (ds_entry && ds_entry != __ds_entry) + ds_entry = NULL; + } + } + return ds_entry; +}
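The two planner helpers above fold per-file answers into one: the GPU set shrinks with bms_intersect(), and the DPU entry collapses to NULL on any mismatch. A minimal stand-alone sketch of that bitmap fold, assuming only PostgreSQL's nodes/bitmapset.h and nodes/pg_list.h (common_members is a hypothetical name, not part of this patch):

#include "postgres.h"
#include "nodes/bitmapset.h"
#include "nodes/pg_list.h"

/* Fold a List of Bitmapset pointers into their common members. */
static Bitmapset *
common_members(List *sets)
{
	Bitmapset  *result = NULL;
	ListCell   *lc;

	foreach (lc, sets)
	{
		Bitmapset  *curr = (Bitmapset *) lfirst(lc);

		if (lc == list_head(sets))
			result = bms_copy(curr);	/* seed with the first set */
		else
			result = bms_intersect(result, curr);	/* keep common bits only */
	}
	return result;		/* NULL means the intersection is empty */
}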
+ +/* + * arrowFdwExtractFilesList + */ +static List * +arrowFdwExtractFilesList(List *options_list, + int *p_parallel_nworkers) +{ + + ListCell *lc; + List *filesList = NIL; + char *dir_path = NULL; + char *dir_suffix = NULL; + int parallel_nworkers = -1; + + foreach (lc, options_list) + { + DefElem *defel = lfirst(lc); + + Assert(IsA(defel->arg, String)); + if (strcmp(defel->defname, "file") == 0) + { + char *temp = strVal(defel->arg); + + if (access(temp, R_OK) != 0) + elog(ERROR, "arrow_fdw: unable to access '%s': %m", temp); + filesList = lappend(filesList, makeString(pstrdup(temp))); + } + else if (strcmp(defel->defname, "files") == 0) + { + char *temp = pstrdup(strVal(defel->arg)); + char *saveptr; + char *tok; + + while ((tok = strtok_r(temp, ",", &saveptr)) != NULL) + { + tok = __trim(tok); + + if (*tok != '/') + elog(ERROR, "arrow_fdw: file '%s' must be absolute path", tok); + if (access(tok, R_OK) != 0) + elog(ERROR, "arrow_fdw: unable to access '%s': %m", tok); + filesList = lappend(filesList, makeString(pstrdup(tok))); + } + pfree(temp); + } + else if (strcmp(defel->defname, "dir") == 0) + { + dir_path = strVal(defel->arg); + if (*dir_path != '/') + elog(ERROR, "arrow_fdw: dir '%s' must be absolute path", dir_path); + } + else if (strcmp(defel->defname, "suffix") == 0) + { + dir_suffix = strVal(defel->arg); + } + else if (strcmp(defel->defname, "parallel_workers") == 0) + { + if (parallel_nworkers >= 0) + elog(ERROR, "'parallel_workers' appeared twice"); + parallel_nworkers = atoi(strVal(defel->arg)); + } + else + elog(ERROR, "arrow: unknown option (%s)", defel->defname); + } + if (dir_suffix && !dir_path) + elog(ERROR, "arrow: cannot use 'suffix' option without 'dir'"); + + if (dir_path) + { + struct dirent *dentry; + DIR *dir; + char *temp; + + dir = AllocateDir(dir_path); + while ((dentry = ReadDir(dir, dir_path)) != NULL) + { + if (strcmp(dentry->d_name, ".") == 0 || + strcmp(dentry->d_name, "..") == 0) + continue; + if (dir_suffix) + { + char *pos = strrchr(dentry->d_name, '.'); + + if (!pos || strcmp(pos+1, dir_suffix) != 0) + continue; + } + temp = psprintf("%s/%s", dir_path, dentry->d_name); + if (access(temp, R_OK) != 0) + { + elog(DEBUG1, "arrow_fdw: unable to read '%s', so skipped", temp); + continue; + } + filesList = lappend(filesList, makeString(temp)); + } + FreeDir(dir); + } + + if (p_parallel_nworkers) + *p_parallel_nworkers = parallel_nworkers; + return filesList; +} + +/* ---------------------------------------------------------------- + * + * arrowFdwLoadRecordBatch() and related routines + * + * it sets up KDS (ARROW format) with IOvec according to RecordBatchState + * + * ---------------------------------------------------------------- + */ + +/* + * arrowFdwSetupIOvector + */ +typedef struct +{ + off_t rb_offset; + off_t f_offset; + off_t m_offset; + size_t kds_head_sz; + int32_t depth; + int32_t io_index; + strom_io_chunk ioc[FLEXIBLE_ARRAY_MEMBER]; +} arrowFdwSetupIOContext; + +static void +__setupIOvectorField(arrowFdwSetupIOContext *con, + off_t chunk_offset, + size_t chunk_length, + uint32_t *p_cmeta_offset, + uint32_t *p_cmeta_length) +{ + off_t f_pos = con->rb_offset + chunk_offset; + size_t __length = MAXALIGN(chunk_length); + + Assert(con->m_offset == MAXALIGN(con->m_offset)); + + if (f_pos == con->f_offset) + { + /* good, buffer is fully continuous */ + *p_cmeta_offset = __kds_packed(con->kds_head_sz + + con->m_offset); + *p_cmeta_length = __kds_packed(__length); + + con->m_offset += __length; + con->f_offset += __length; + } + else if (f_pos > con->f_offset && + (f_pos &
~PAGE_MASK) == (con->f_offset & ~PAGE_MASK) && + (f_pos - con->f_offset) == MAXALIGN(f_pos - con->f_offset)) + { + /* * we can also consolidate the i/o of two chunks, if file position * of the next chunk (f_pos) and the current file tail position * (con->f_offset) locate within the same file page, and if gap bytes @@ -1894,7 +2200,8 @@ __setupIOvectorField(arrowFdwSetupIOContext *con, con->m_offset += __gap; con->f_offset += __gap; - *p_cmeta_offset = __kds_packed(con->m_offset); + *p_cmeta_offset = __kds_packed(con->kds_head_sz + + con->m_offset); *p_cmeta_length = __kds_packed(__length); con->m_offset += __length; @@ -1907,70 +2214,68 @@ __setupIOvectorField(arrowFdwSetupIOContext *con, * the previous i/o-chunk. So, make a new i/o-chunk. */ off_t f_base = TYPEALIGN_DOWN(PAGE_SIZE, f_pos); - off_t f_tail; - off_t shift = f_pos - f_base; + off_t gap = f_pos - f_base; strom_io_chunk *ioc; if (con->io_index < 0) - con->io_index = 0; /* no previous i/o chunks */ + con->io_index = 0; /* no previous i/o chunks */ else { - ioc = &con->ioc[con->io_index++]; + off_t f_tail = PAGE_ALIGN(con->f_offset); - f_tail = TYPEALIGN(PAGE_SIZE, con->f_offset); + ioc = &con->ioc[con->io_index++]; ioc->nr_pages = f_tail / PAGE_SIZE - ioc->fchunk_id; - con->m_offset += (f_tail - con->f_offset); //safety margin; + con->m_offset += (f_tail - con->f_offset); /* margin for alignment */ } + Assert(con->m_offset == PAGE_ALIGN(con->m_offset)); ioc = &con->ioc[con->io_index]; - /* adjust position if con->m_offset is not aligned well */ - if (con->m_offset + shift != MAXALIGN(con->m_offset + shift)) - con->m_offset = MAXALIGN(con->m_offset + shift) - shift; ioc->m_offset = con->m_offset; ioc->fchunk_id = f_base / PAGE_SIZE; - *p_cmeta_offset = __kds_packed(con->m_offset + shift); + con->m_offset += gap; + *p_cmeta_offset = __kds_packed(con->kds_head_sz + + con->m_offset); *p_cmeta_length = __kds_packed(__length); - - con->m_offset += shift + __length; - con->f_offset = f_pos + __length; + con->m_offset += __length; + con->f_offset = f_pos + __length; } } static void arrowFdwSetupIOvectorField(arrowFdwSetupIOContext *con, - RecordBatchFieldState *fstate, + RecordBatchFieldState *rb_field, kern_data_store *kds, kern_colmeta *cmeta) { //int index = cmeta - kds->colmeta; - if (fstate->nullmap_length > 0) + if (rb_field->nullmap_length > 0) { - Assert(fstate->null_count > 0); + Assert(rb_field->null_count > 0); __setupIOvectorField(con, - fstate->nullmap_offset, - fstate->nullmap_length, + rb_field->nullmap_offset, + rb_field->nullmap_length, &cmeta->nullmap_offset, &cmeta->nullmap_length); - //elog(INFO, "D%d att[%d] nullmap=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, fstate->nullmap_offset, fstate->nullmap_length, con->m_offset, con->f_offset); + //elog(INFO, "D%d att[%d] nullmap=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, rb_field->nullmap_offset, rb_field->nullmap_length, con->m_offset, con->f_offset); } - if (fstate->values_length > 0) + if (rb_field->values_length > 0) { __setupIOvectorField(con, - fstate->values_offset, - fstate->values_length, + rb_field->values_offset, + rb_field->values_length, &cmeta->values_offset, &cmeta->values_length); - //elog(INFO, "D%d att[%d] values=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, fstate->values_offset, fstate->values_length, con->m_offset, con->f_offset); + //elog(INFO, "D%d att[%d] values=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, rb_field->values_offset, rb_field->values_length, con->m_offset, con->f_offset); } - if 
(fstate->extra_length > 0) + if (rb_field->extra_length > 0) { __setupIOvectorField(con, - fstate->extra_offset, - fstate->extra_length, + rb_field->extra_offset, + rb_field->extra_length, &cmeta->extra_offset, &cmeta->extra_length); - //elog(INFO, "D%d att[%d] extra=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, fstate->extra_offset, fstate->extra_length, con->m_offset, con->f_offset); + //elog(INFO, "D%d att[%d] extra=%lu,%lu m_offset=%lu f_offset=%lu", con->depth, index, rb_field->extra_offset, rb_field->extra_length, con->m_offset, con->f_offset); } /* nested sub-fields if composite types */ @@ -1980,13 +2285,13 @@ arrowFdwSetupIOvectorField(arrowFdwSetupIOContext *con, kern_colmeta *subattr; int j; - Assert(fstate->num_children == cmeta->num_subattrs); + Assert(rb_field->num_children == cmeta->num_subattrs); con->depth++; for (j=0, subattr = &kds->colmeta[cmeta->idx_subattrs]; j < cmeta->num_subattrs; j++, subattr++) { - RecordBatchFieldState *child = &fstate->children[j]; + RecordBatchFieldState *child = &rb_field->children[j]; arrowFdwSetupIOvectorField(con, child, kds, subattr); } @@ -1994,1981 +2299,538 @@ arrowFdwSetupIOvectorField(arrowFdwSetupIOContext *con, } } -/* - * arrowFdwSetupIOvector - */ static strom_io_vector * -arrowFdwSetupIOvector(kern_data_store *kds, - RecordBatchState *rb_state, - Bitmapset *referenced) +arrowFdwSetupIOvector(RecordBatchState *rb_state, + Bitmapset *referenced, + kern_data_store *kds) { arrowFdwSetupIOContext *con; - strom_io_vector *iovec = NULL; - int j, nr_chunks = 0; + strom_io_vector *iovec; - Assert(kds->nr_colmeta >= kds->ncols); + Assert(kds->ncols <= kds->nr_colmeta && + kds->ncols == rb_state->nfields); con = alloca(offsetof(arrowFdwSetupIOContext, ioc[3 * kds->nr_colmeta])); con->rb_offset = rb_state->rb_offset; con->f_offset = ~0UL; /* invalid offset */ - con->m_offset = TYPEALIGN(PAGE_SIZE, KERN_DATA_STORE_HEAD_LENGTH(kds)); - con->io_index = -1; - for (j=0; j < kds->ncols; j++) + con->m_offset = 0; + con->kds_head_sz = KDS_HEAD_LENGTH(kds); + con->depth = 0; + con->io_index = -1; /* invalid index */ + for (int j=0; j < kds->ncols; j++) { - RecordBatchFieldState *fstate = &rb_state->columns[j]; + RecordBatchFieldState *rb_field = &rb_state->fields[j]; kern_colmeta *cmeta = &kds->colmeta[j]; int attidx = j + 1 - FirstLowInvalidHeapAttributeNumber; - if (referenced && bms_is_member(attidx, referenced)) - arrowFdwSetupIOvectorField(con, fstate, kds, cmeta); + if (bms_is_member(attidx, referenced) || + bms_is_member(-FirstLowInvalidHeapAttributeNumber, referenced)) + arrowFdwSetupIOvectorField(con, rb_field, kds, cmeta); else cmeta->atttypkind = TYPE_KIND__NULL; /* unreferenced */ } if (con->io_index >= 0) { /* close the last I/O chunks */ - strom_io_chunk *ioc = &con->ioc[con->io_index]; + strom_io_chunk *ioc = &con->ioc[con->io_index++]; ioc->nr_pages = (TYPEALIGN(PAGE_SIZE, con->f_offset) / PAGE_SIZE - ioc->fchunk_id); con->m_offset = ioc->m_offset + PAGE_SIZE * ioc->nr_pages; - nr_chunks = con->io_index + 1; } kds->length = con->m_offset; - iovec = palloc0(offsetof(strom_io_vector, ioc[nr_chunks])); - iovec->nr_chunks = nr_chunks; - if (nr_chunks > 0) - memcpy(iovec->ioc, con->ioc, sizeof(strom_io_chunk) * nr_chunks); - return iovec; -} - -/* - * __dump_kds_and_iovec - just for debug - */ -static inline void -__dump_kds_and_iovec(kern_data_store *kds, strom_io_vector *iovec) -{ + iovec = palloc0(offsetof(strom_io_vector, ioc[con->io_index])); + iovec->nr_chunks = con->io_index; + if (iovec->nr_chunks > 0) + 
memcpy(iovec->ioc, con->ioc, sizeof(strom_io_chunk) * con->io_index); #if 0 - int j; - - elog(INFO, "nchunks = %d", iovec->nr_chunks); - for (j=0; j < iovec->nr_chunks; j++) + /* for debug - dump the i/o vector */ { - strom_io_chunk *ioc = &iovec->ioc[j]; - - elog(INFO, "io[%d] [ m_offset=%lu, f_read=%lu...%lu, nr_pages=%u}", - j, - ioc->m_offset, - ioc->fchunk_id * PAGE_SIZE, - (ioc->fchunk_id + ioc->nr_pages) * PAGE_SIZE, - ioc->nr_pages); - } - - elog(INFO, "kds {length=%zu nitems=%u typeid=%u typmod=%u table_oid=%u}", - kds->length, kds->nitems, - kds->tdtypeid, kds->tdtypmod, kds->table_oid); - for (j=0; j < kds->nr_colmeta; j++) - { - kern_colmeta *cmeta = &kds->colmeta[j]; - - elog(INFO, "%ccol[%d] nullmap=%lu,%lu values=%lu,%lu extra=%lu,%lu", - j < kds->ncols ? ' ' : '*', j, - __kds_unpack(cmeta->nullmap_offset), - __kds_unpack(cmeta->nullmap_length), - __kds_unpack(cmeta->values_offset), - __kds_unpack(cmeta->values_length), - __kds_unpack(cmeta->extra_offset), - __kds_unpack(cmeta->extra_length)); + elog(INFO, "nchunks = %d", iovec->nr_chunks); + for (int j=0; j < iovec->nr_chunks; j++) + { + strom_io_chunk *ioc = &iovec->ioc[j]; + + elog(INFO, "io[%d] [ m_offset=%lu, f_read=%lu...%lu, nr_pages=%u}", + j, + ioc->m_offset, + ioc->fchunk_id * PAGE_SIZE, + (ioc->fchunk_id + ioc->nr_pages) * PAGE_SIZE, + ioc->nr_pages); + } + elog(INFO, "kds {length=%zu nitems=%u typeid=%u typmod=%u table_oid=%u}", + kds->length, kds->nitems, + kds->tdtypeid, kds->tdtypmod, kds->table_oid); + for (int j=0; j < kds->nr_colmeta; j++) + { + kern_colmeta *cmeta = &kds->colmeta[j]; + + elog(INFO, "%ccol[%d] nullmap=%lu,%lu values=%lu,%lu extra=%lu,%lu", + j < kds->ncols ? ' ' : '*', j, + __kds_unpack(cmeta->nullmap_offset), + __kds_unpack(cmeta->nullmap_length), + __kds_unpack(cmeta->values_offset), + __kds_unpack(cmeta->values_length), + __kds_unpack(cmeta->extra_offset), + __kds_unpack(cmeta->extra_length)); + } } #endif + return iovec; } /* * arrowFdwLoadRecordBatch */ static void -__arrowFdwAssignTypeOptions(kern_data_store *kds, - int base, int ncols, - RecordBatchFieldState *rb_fstate) +__arrowKdsAssignAttrOptions(kern_data_store *kds, + kern_colmeta *cmeta, + RecordBatchFieldState *rb_field) { - int i; - - for (i=0; i < ncols; i++) + memcpy(&cmeta->attopts, + &rb_field->attopts, sizeof(ArrowTypeOptions)); + if (cmeta->atttypkind == TYPE_KIND__ARRAY) { - kern_colmeta *cmeta = &kds->colmeta[base+i]; - - cmeta->attopts = rb_fstate[i].attopts; - if (cmeta->atttypkind == TYPE_KIND__ARRAY) - { - Assert(cmeta->idx_subattrs >= kds->ncols && - cmeta->num_subattrs == 1 && - cmeta->idx_subattrs + cmeta->num_subattrs <= kds->nr_colmeta); - Assert(rb_fstate[i].num_children == 1); - __arrowFdwAssignTypeOptions(kds, - cmeta->idx_subattrs, - cmeta->num_subattrs, - rb_fstate[i].children); - } - else if (cmeta->atttypkind == TYPE_KIND__COMPOSITE) + Assert(cmeta->idx_subattrs >= kds->ncols && + cmeta->num_subattrs == 1 && + cmeta->idx_subattrs + cmeta->num_subattrs <= kds->nr_colmeta && + rb_field->num_children == 1); + __arrowKdsAssignAttrOptions(kds, + &kds->colmeta[cmeta->idx_subattrs], + &rb_field->children[0]); + } + else if (cmeta->atttypkind == TYPE_KIND__COMPOSITE) + { + Assert(cmeta->idx_subattrs >= kds->ncols && + cmeta->num_subattrs == rb_field->num_children && + cmeta->idx_subattrs + cmeta->num_subattrs <= kds->nr_colmeta); + for (int j=0; j < cmeta->num_subattrs; j++) { - Assert(cmeta->idx_subattrs >= kds->ncols && - cmeta->idx_subattrs + cmeta->num_subattrs <= kds->nr_colmeta); - 
Assert(rb_fstate[i].num_children == cmeta->num_subattrs); - __arrowFdwAssignTypeOptions(kds, - cmeta->idx_subattrs, - cmeta->num_subattrs, - rb_fstate[i].children); + __arrowKdsAssignAttrOptions(kds, + &kds->colmeta[cmeta->idx_subattrs + j], + &rb_field->children[j]); } } } -static pgstrom_data_store * -__arrowFdwLoadRecordBatch(RecordBatchState *rb_state, - Relation relation, - Bitmapset *referenced, - GpuContext *gcontext, - MemoryContext mcontext, - const Bitmapset *optimal_gpus) +static strom_io_vector * +arrowFdwLoadRecordBatch(Relation relation, + Bitmapset *referenced, + RecordBatchState *rb_state, + StringInfo chunk_buffer) { - TupleDesc tupdesc = RelationGetDescr(relation); - pgstrom_data_store *pds; - kern_data_store *kds; - strom_io_vector *iovec; - size_t head_sz; - CUresult rc; + TupleDesc tupdesc = RelationGetDescr(relation); + size_t head_sz = estimate_kern_data_store(tupdesc); + kern_data_store *kds; /* setup KDS and I/O-vector */ - head_sz = KDS_calculateHeadSize(tupdesc); - kds = alloca(head_sz); - init_kernel_data_store(kds, tupdesc, 0, KDS_FORMAT_ARROW, 0); + enlargeStringInfo(chunk_buffer, head_sz); + kds = (kern_data_store *)(chunk_buffer->data + + chunk_buffer->len); + setup_kern_data_store(kds, tupdesc, 0, KDS_FORMAT_ARROW); kds->nitems = rb_state->rb_nitems; - kds->nrooms = rb_state->rb_nitems; kds->table_oid = RelationGetRelid(relation); - Assert(head_sz == KERN_DATA_STORE_HEAD_LENGTH(kds)); - Assert(kds->ncols == rb_state->ncols); - __arrowFdwAssignTypeOptions(kds, 0, kds->ncols, rb_state->columns); - iovec = arrowFdwSetupIOvector(kds, rb_state, referenced); - __dump_kds_and_iovec(kds, iovec); + Assert(head_sz == KDS_HEAD_LENGTH(kds)); + Assert(kds->ncols == rb_state->nfields); + for (int j=0; j < kds->ncols; j++) + __arrowKdsAssignAttrOptions(kds, + &kds->colmeta[j], + &rb_state->fields[j]); + chunk_buffer->len += head_sz; - /* - * If SSD-to-GPU Direct SQL is available on the arrow file, setup a small - * PDS on host-pinned memory, with strom_io_vector. 
- */ - if (gcontext && - bms_is_member(gcontext->cuda_dindex, optimal_gpus) && - iovec->nr_chunks > 0 && - kds->length <= gpuMemAllocIOMapMaxLength() && - rb_state->dfile != NULL) - { - size_t iovec_sz = offsetof(strom_io_vector, ioc[iovec->nr_chunks]); - - rc = gpuMemAllocHost(gcontext, (void **)&pds, - offsetof(pgstrom_data_store, kds) + - head_sz + iovec_sz); - if (rc != CUDA_SUCCESS) - elog(ERROR, "failed on gpuMemAllocHost: %s", errorText(rc)); - - pds->gcontext = gcontext; - pg_atomic_init_u32(&pds->refcnt, 1); - pds->nblocks_uncached = 0; - memcpy(&pds->filedesc, rb_state->dfile, sizeof(GPUDirectFileDesc)); - pds->iovec = (strom_io_vector *)((char *)&pds->kds + head_sz); - memcpy(&pds->kds, kds, head_sz); - memcpy(pds->iovec, iovec, iovec_sz); - } - else - { - /* Elsewhere, load RecordBatch by filesystem */ - int fdesc = FileGetRawDesc(rb_state->fdesc); + return arrowFdwSetupIOvector(rb_state, referenced, kds); +} - if (gcontext) - { - rc = gpuMemAllocManaged(gcontext, - (CUdeviceptr *)&pds, - offsetof(pgstrom_data_store, - kds) + kds->length, - CU_MEM_ATTACH_GLOBAL); - if (rc != CUDA_SUCCESS) - elog(ERROR, "failed on gpuMemAllocManaged: %s", errorText(rc)); - } - else +static kern_data_store * +arrowFdwFillupRecordBatch(Relation relation, + Bitmapset *referenced, + RecordBatchState *rb_state, + StringInfo chunk_buffer) +{ + ArrowFileState *af_state = rb_state->af_state; + kern_data_store *kds; + strom_io_vector *iovec; + char *base; + File filp; + + resetStringInfo(chunk_buffer); + iovec = arrowFdwLoadRecordBatch(relation, + referenced, + rb_state, + chunk_buffer); + kds = (kern_data_store *)chunk_buffer->data; + enlargeStringInfo(chunk_buffer, kds->length); + kds = (kern_data_store *)chunk_buffer->data; + filp = PathNameOpenFile(af_state->filename, O_RDONLY | PG_BINARY); + base = (char *)kds + KDS_HEAD_LENGTH(kds); + for (int i=0; i < iovec->nr_chunks; i++) + { + strom_io_chunk *ioc = &iovec->ioc[i]; + char *dest = base + ioc->m_offset; + off_t f_pos = (size_t)ioc->fchunk_id * PAGE_SIZE; + size_t len = (size_t)ioc->nr_pages * PAGE_SIZE; + ssize_t sz; + + while (len > 0) + { + CHECK_FOR_INTERRUPTS(); + + sz = FileRead(filp, dest, len, f_pos, + WAIT_EVENT_REORDER_BUFFER_READ); + if (sz > 0) + { + Assert(sz <= len); + dest += sz; + f_pos += sz; + len -= sz; + } + else if (sz == 0) + { + /* + * Due to the page_sz alignment, we may try to read the file + * over its tail, so pread(2) may report that nothing more can + * be read. This is expected only when the remaining length is + * less than PAGE_SIZE. + */ + memset(dest, 0, len); + break; + } + else if (errno != EINTR) + { + assert(false); + elog(ERROR, "failed on FileRead('%s', pos=%lu, len=%lu): %m", + af_state->filename, f_pos, len); + } } - } + chunk_buffer->len += kds->length; + FileClose(filp); + pfree(iovec); - - return pds; + + return kds; }
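The read loop in arrowFdwFillupRecordBatch() above has to handle three outcomes of a positioned read: a short read, a read past EOF caused by rounding I/O chunks up to PAGE_SIZE, and EINTR. The same control flow against plain pread(2), so it compiles stand-alone (read_fully is a hypothetical name, not part of this patch):

#include <errno.h>
#include <string.h>
#include <unistd.h>

/* Read exactly 'len' bytes at 'f_pos'; zero-fill whatever lies past EOF. */
static int
read_fully(int fdesc, char *dest, size_t len, off_t f_pos)
{
	while (len > 0)
	{
		ssize_t		sz = pread(fdesc, dest, len, f_pos);

		if (sz > 0)
		{
			/* short read: advance the cursor and continue */
			dest  += sz;
			f_pos += sz;
			len   -= sz;
		}
		else if (sz == 0)
		{
			/* hit EOF because of page-size rounding: pad with zeros */
			memset(dest, 0, len);
			break;
		}
		else if (errno != EINTR)
			return -1;		/* hard error; EINTR simply retries */
	}
	return 0;
}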
-static pgstrom_data_store * -arrowFdwLoadRecordBatch(ArrowFdwState *af_state, - Relation relation, - EState *estate, - GpuContext *gcontext, - const Bitmapset *optimal_gpus) +/* + * ArrowGetForeignRelSize + */ +static size_t +__recordBatchFieldLength(RecordBatchFieldState *rb_field) { - RecordBatchState *rb_state; - uint32 rb_index; + size_t len = 0; -retry: - /* fetch next RecordBatch */ - rb_index = pg_atomic_fetch_add_u32(af_state->rbatch_index, 1); - if (rb_index >= af_state->num_rbatches) - return NULL; /* no more RecordBatch to read */ - rb_state = af_state->rbatches[rb_index]; - - if (af_state->stats_hint) - { - if (execCheckArrowStatsHint(af_state->stats_hint, rb_state)) - pg_atomic_fetch_add_u32(af_state->rbatch_nload, 1); - else - { - pg_atomic_fetch_add_u32(af_state->rbatch_nskip, 1); - goto retry; - } - } - return __arrowFdwLoadRecordBatch(rb_state, - relation, - af_state->referenced, - gcontext, - estate->es_query_cxt, - optimal_gpus); -} - -/* - * ExecScanChunkArrowFdw - */ -pgstrom_data_store * -ExecScanChunkArrowFdw(GpuTaskState *gts) -{ - pgstrom_data_store *pds; - - InstrStartNode(&gts->outer_instrument); - pds = arrowFdwLoadRecordBatch(gts->af_state, - gts->css.ss.ss_currentRelation, - gts->css.ss.ps.state, - gts->gcontext, - gts->optimal_gpus); - InstrStopNode(&gts->outer_instrument, - !pds ? 0.0 : (double)pds->kds.nitems); - return pds; -} - -/* - * ArrowIterateForeignScan - */ -static TupleTableSlot * -ArrowIterateForeignScan(ForeignScanState *node) -{ - ArrowFdwState *af_state = node->fdw_state; - Relation relation = node->ss.ss_currentRelation; - TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; - pgstrom_data_store *pds; - - while ((pds = af_state->curr_pds) == NULL || - af_state->curr_index >= pds->kds.nitems) - { - EState *estate = node->ss.ps.state; - - /* unload the previous RecordBatch, if any */ - if (pds) - PDS_release(pds); - af_state->curr_index = 0; - af_state->curr_pds = arrowFdwLoadRecordBatch(af_state, - relation, - estate, - NULL, - NULL); - if (!af_state->curr_pds) - return NULL; - } - Assert(pds && af_state->curr_index < pds->kds.nitems); - if (KDS_fetch_tuple_arrow(slot, &pds->kds, af_state->curr_index++)) - return slot; - return NULL; -} - -/* - * ArrowReScanForeignScan - */ -void -ExecReScanArrowFdw(ArrowFdwState *af_state) -{ - /* rewind the current scan state */ - pg_atomic_write_u32(af_state->rbatch_index, 0); - if (af_state->curr_pds) - PDS_release(af_state->curr_pds); - af_state->curr_pds = NULL; - af_state->curr_index = 0; -} - -static void -ArrowReScanForeignScan(ForeignScanState *node) -{ - ExecReScanArrowFdw((ArrowFdwState *)node->fdw_state); -} - -/* - * ArrowEndForeignScan - */ -void -ExecEndArrowFdw(ArrowFdwState *af_state) -{ - ListCell *lc; - - foreach (lc, af_state->fdescList) - FileClose((File)lfirst_int(lc)); - foreach (lc, af_state->gpuDirectFileDescList) - { - GPUDirectFileDesc *dfile = lfirst(lc); - - untrackRawFileDesc(af_state->gcontext, dfile); - gpuDirectFileDescClose(dfile); - } - if (af_state->stats_hint) - execEndArrowStatsHint(af_state->stats_hint); -} - -static void -ArrowEndForeignScan(ForeignScanState *node) -{ - ExecEndArrowFdw((ArrowFdwState *)node->fdw_state); -} - 
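The deleted arrowFdwLoadRecordBatch() above also documents the coordination trick that survives into v5.0: concurrent workers claim record-batches through an atomic fetch-and-add on a shared counter, so each batch is handed out exactly once without any lock. A minimal sketch, assuming PostgreSQL's port/atomics.h (SharedCursor and next_work_item are hypothetical names, not part of this patch):

#include "postgres.h"
#include "port/atomics.h"

typedef struct
{
	pg_atomic_uint32 next_index;	/* placed in shared memory (DSM);
									 * set up once with pg_atomic_init_u32() */
	uint32		num_items;
} SharedCursor;

/* Every backend calls this; each index in [0, num_items) is returned once. */
static int
next_work_item(SharedCursor *cursor)
{
	uint32		index = pg_atomic_fetch_add_u32(&cursor->next_index, 1);

	if (index >= cursor->num_items)
		return -1;		/* no more work left */
	return (int) index;
}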
-/* - * ArrowExplainForeignScan - */ -void -ExplainArrowFdw(ArrowFdwState *af_state, - Relation frel, - ExplainState *es, - List *dcontext) -{ - TupleDesc tupdesc = RelationGetDescr(frel); - ListCell *lc; - int fcount = 0; - char label[80]; - size_t *chunk_sz = alloca(sizeof(size_t) * tupdesc->natts); - int i, j, k; - StringInfoData buf; - - /* shows referenced columns */ - initStringInfo(&buf); - for (k = bms_next_member(af_state->referenced, -1); - k >= 0; - k = bms_next_member(af_state->referenced, k)) - { - j = k + FirstLowInvalidHeapAttributeNumber - 1; - - if (j >= 0) - { - Form_pg_attribute attr = tupleDescAttr(tupdesc, j); - const char *attName = NameStr(attr->attname); - if (buf.len > 0) - appendStringInfoString(&buf, ", "); - appendStringInfoString(&buf, quote_identifier(attName)); - } - } - ExplainPropertyText("referenced", buf.data, es); - - /* shows stats hint if any */ - if (af_state->stats_hint) - { - arrowStatsHint *stats_hint = af_state->stats_hint; - - resetStringInfo(&buf); - - if (dcontext == NIL) - { - int anum; - - for (anum = bms_next_member(stats_hint->load_attrs, -1); - anum >= 0; - anum = bms_next_member(stats_hint->load_attrs, anum)) - { - Form_pg_attribute attr = tupleDescAttr(tupdesc, anum-1); - const char *attName = NameStr(attr->attname); - - if (buf.len > 0) - appendStringInfoString(&buf, ", "); - appendStringInfoString(&buf, quote_identifier(attName)); - } - } - else - { - ListCell *lc; - - foreach (lc, stats_hint->orig_quals) - { - Node *qual = lfirst(lc); - char *temp; - - temp = deparse_expression(qual, dcontext, es->verbose, false); - if (buf.len > 0) - appendStringInfoString(&buf, ", "); - appendStringInfoString(&buf, temp); - pfree(temp); - } - } - if (es->analyze) - appendStringInfo(&buf, " [loaded: %u, skipped: %u]", - pg_atomic_read_u32(af_state->rbatch_nload), - pg_atomic_read_u32(af_state->rbatch_nskip)); - ExplainPropertyText("Stats-Hint", buf.data, es); - } - - /* shows files on behalf of the foreign table */ - foreach (lc, af_state->fdescList) - { - File fdesc = (File)lfirst_int(lc); - const char *fname = FilePathName(fdesc); - int rbcount = 0; - size_t read_sz = 0; - char *pos = label; - struct stat st_buf; - - pos += snprintf(label, sizeof(label), "files%d", fcount++); - if (fstat(FileGetRawDesc(fdesc), &st_buf) != 0) - memset(&st_buf, 0, sizeof(struct stat)); - - /* size count per chunk */ - memset(chunk_sz, 0, sizeof(size_t) * tupdesc->natts); - for (i=0; i < af_state->num_rbatches; i++) - { - RecordBatchState *rb_state = af_state->rbatches[i]; - size_t sz; - - if (rb_state->fdesc != fdesc) - continue; - - for (k = bms_next_member(af_state->referenced, -1); - k >= 0; - k = bms_next_member(af_state->referenced, k)) - { - j = k + FirstLowInvalidHeapAttributeNumber - 1; - if (j < 0 || j >= tupdesc->natts) - continue; - sz = RecordBatchFieldLength(&rb_state->columns[j]); - read_sz += sz; - chunk_sz[j] += sz; - } - rbcount++; - } - - /* file size and read size */ - if (es->format == EXPLAIN_FORMAT_TEXT) - { - resetStringInfo(&buf); - if (st_buf.st_size == 0) - appendStringInfoString(&buf, fname); - else - appendStringInfo(&buf, "%s (read: %s, size: %s)", - fname, - format_bytesz(read_sz), - format_bytesz(st_buf.st_size)); - ExplainPropertyText(label, buf.data, es); - } - else - { - ExplainPropertyText(label, fname, es); - - sprintf(pos, "-size"); - ExplainPropertyText(label, format_bytesz(st_buf.st_size), es); - - sprintf(pos, "-read"); - ExplainPropertyText(label, format_bytesz(read_sz), es); - } - - /* read-size per column (verbose mode only) 
*/ - if (es->verbose && rbcount >= 0) - { - for (k = bms_next_member(af_state->referenced, -1); - k >= 0; - k = bms_next_member(af_state->referenced, k)) - { - Form_pg_attribute attr; - - j = k + FirstLowInvalidHeapAttributeNumber - 1; - if (j < 0 || j >= tupdesc->natts) - continue; - attr = tupleDescAttr(tupdesc, j); - snprintf(label, sizeof(label), - " %s", NameStr(attr->attname)); - ExplainPropertyText(label, format_bytesz(chunk_sz[j]), es); - } - } - } - pfree(buf.data); -} - -static void -ArrowExplainForeignScan(ForeignScanState *node, ExplainState *es) -{ - Relation frel = node->ss.ss_currentRelation; - - ExplainArrowFdw((ArrowFdwState *)node->fdw_state, frel, es, NIL); -} - -/* - * readArrowFile - */ -static bool -readArrowFile(const char *pathname, ArrowFileInfo *af_info, bool missing_ok) -{ - File filp = PathNameOpenFile(pathname, O_RDONLY | PG_BINARY); - - if (filp < 0) - { - if (missing_ok && errno == ENOENT) - return false; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", pathname))); - } - readArrowFileDesc(FileGetRawDesc(filp), af_info); - FileClose(filp); - return true; -} - -/* - * RecordBatchAcquireSampleRows - random sampling - */ -static int -RecordBatchAcquireSampleRows(Relation relation, - RecordBatchState *rb_state, - HeapTuple *rows, - int nsamples) -{ - TupleDesc tupdesc = RelationGetDescr(relation); - pgstrom_data_store *pds; - Bitmapset *referenced = NULL; - Datum *values; - bool *isnull; - int count; - int i, j, nwords; - - /* ANALYZE needs to fetch all the attributes */ - nwords = (tupdesc->natts - FirstLowInvalidHeapAttributeNumber + - BITS_PER_BITMAPWORD - 1) / BITS_PER_BITMAPWORD; - referenced = alloca(offsetof(Bitmapset, words[nwords])); - referenced->nwords = nwords; - memset(referenced->words, -1, sizeof(bitmapword) * nwords); - - pds = __arrowFdwLoadRecordBatch(rb_state, - relation, - referenced, - NULL, - CurrentMemoryContext, - NULL); - values = alloca(sizeof(Datum) * tupdesc->natts); - isnull = alloca(sizeof(bool) * tupdesc->natts); - for (count = 0; count < nsamples; count++) - { - /* fetch a row randomly */ - i = (double)pds->kds.nitems * drand48(); - Assert(i < pds->kds.nitems); - - for (j=0; j < pds->kds.ncols; j++) - { - kern_colmeta *cmeta = &pds->kds.colmeta[j]; - - pg_datum_arrow_ref(&pds->kds, - cmeta, - i, - values + j, - isnull + j); - } - rows[count] = heap_form_tuple(tupdesc, values, isnull); - } - PDS_release(pds); - - return count; -} - -/* - * ArrowAcquireSampleRows - */ -static int -ArrowAcquireSampleRows(Relation relation, - int elevel, - HeapTuple *rows, - int nrooms, - double *p_totalrows, - double *p_totaldeadrows) -{ - TupleDesc tupdesc = RelationGetDescr(relation); - ForeignTable *ft = GetForeignTable(RelationGetRelid(relation)); - List *filesList = NIL; - List *fdescList = NIL; - List *rb_state_list = NIL; - ListCell *lc; - bool writable; - int64 total_nrows = 0; - int64 count_nrows = 0; - int nsamples_min = nrooms / 100; - int nitems = 0; - - filesList = __arrowFdwExtractFilesList(ft->options, - NULL, - &writable); - foreach (lc, filesList) - { - char *fname = strVal(lfirst(lc)); - File fdesc; - List *rb_cached; - ListCell *cell; - - fdesc = PathNameOpenFile(fname, O_RDONLY | PG_BINARY); - if (fdesc < 0) - { - if (writable && errno == ENOENT) - continue; - elog(ERROR, "failed to open file '%s' on behalf of '%s'", - fname, RelationGetRelationName(relation)); - } - fdescList = lappend_int(fdescList, fdesc); - - rb_cached = arrowLookupOrBuildMetadataCache(fdesc, NULL); - foreach (cell, 
rb_cached) - { - RecordBatchState *rb_state = lfirst(cell); - - if (!arrowSchemaCompatibilityCheck(tupdesc, rb_state)) - elog(ERROR, "arrow file '%s' on behalf of foreign table '%s' has incompatible schema definition", - fname, RelationGetRelationName(relation)); - if (rb_state->rb_nitems == 0) - continue; /* not reasonable to sample, skipped */ - total_nrows += rb_state->rb_nitems; - - rb_state_list = lappend(rb_state_list, rb_state); - } - } - nrooms = Min(nrooms, total_nrows); - - /* fetch samples for each record-batch */ - foreach (lc, rb_state_list) - { - RecordBatchState *rb_state = lfirst(lc); - int nsamples; - - count_nrows += rb_state->rb_nitems; - nsamples = (double)nrooms * ((double)count_nrows / - (double)total_nrows) - nitems; - if (nitems + nsamples > nrooms) - nsamples = nrooms - nitems; - if (nsamples > nsamples_min) - nitems += RecordBatchAcquireSampleRows(relation, - rb_state, - rows + nitems, - nsamples); - } - foreach (lc, fdescList) - FileClose((File)lfirst_int(lc)); - - *p_totalrows = total_nrows; - *p_totaldeadrows = 0.0; - - return nitems; -} - -/* - * ArrowAnalyzeForeignTable - */ -static bool -ArrowAnalyzeForeignTable(Relation frel, - AcquireSampleRowsFunc *p_sample_rows_func, - BlockNumber *p_totalpages) -{ - ForeignTable *ft = GetForeignTable(RelationGetRelid(frel)); - List *filesList = arrowFdwExtractFilesList(ft->options); - ListCell *lc; - Size totalpages = 0; - - foreach (lc, filesList) - { - const char *fname = strVal(lfirst(lc)); - struct stat statbuf; - - if (stat(fname, &statbuf) != 0) - { - elog(NOTICE, "failed on stat('%s') on behalf of '%s', skipped", - fname, get_rel_name(ft->relid)); - continue; - } - totalpages += (statbuf.st_size + BLCKSZ - 1) / BLCKSZ; - } - - if (totalpages > MaxBlockNumber) - totalpages = MaxBlockNumber; - - *p_sample_rows_func = ArrowAcquireSampleRows; - *p_totalpages = totalpages; - - return true; -} - -/* - * ArrowImportForeignSchema - */ -static List * -ArrowImportForeignSchema(ImportForeignSchemaStmt *stmt, Oid serverOid) -{ - ArrowSchema schema; - List *filesList; - ListCell *lc; - int j; - StringInfoData cmd; - - /* sanity checks */ - switch (stmt->list_type) - { - case FDW_IMPORT_SCHEMA_ALL: - break; - case FDW_IMPORT_SCHEMA_LIMIT_TO: - elog(ERROR, "arrow_fdw does not support LIMIT TO clause"); - break; - case FDW_IMPORT_SCHEMA_EXCEPT: - elog(ERROR, "arrow_fdw does not support EXCEPT clause"); - break; - default: - elog(ERROR, "arrow_fdw: Bug? 
unknown list-type"); - break; - } - filesList = arrowFdwExtractFilesList(stmt->options); - if (filesList == NIL) - ereport(ERROR, - (errmsg("No valid apache arrow files are specified"), - errhint("Use 'file' or 'dir' option to specify apache arrow files on behalf of the foreign table"))); - - /* read the schema */ - memset(&schema, 0, sizeof(ArrowSchema)); - foreach (lc, filesList) - { - const char *fname = strVal(lfirst(lc)); - ArrowFileInfo af_info; - - readArrowFile(fname, &af_info, false); - if (lc == list_head(filesList)) - { - copyArrowNode(&schema.node, &af_info.footer.schema.node); - } - else - { - /* compatibility checks */ - ArrowSchema *stemp = &af_info.footer.schema; - - if (schema.endianness != stemp->endianness || - schema._num_fields != stemp->_num_fields) - elog(ERROR, "file '%s' has incompatible schema definition", fname); - for (j=0; j < schema._num_fields; j++) - { - if (!arrowFieldTypeIsEqual(&schema.fields[j], - &stemp->fields[j])) - elog(ERROR, "file '%s' has incompatible schema definition", fname); - } - } - } - - /* makes a command to define foreign table */ - initStringInfo(&cmd); - appendStringInfo(&cmd, "CREATE FOREIGN TABLE %s (\n", - quote_identifier(stmt->remote_schema)); - for (j=0; j < schema._num_fields; j++) - { - ArrowField *field = &schema.fields[j]; - const char *type_name = arrowTypeToPGTypeName(field); - - if (j > 0) - appendStringInfo(&cmd, ",\n"); - if (!field->name || field->_name_len == 0) - { - elog(NOTICE, "field %d has no name, so \"__col%02d\" is used", - j+1, j+1); - appendStringInfo(&cmd, " __col%02d %s", j+1, type_name); - } - else - appendStringInfo(&cmd, " %s %s", - quote_identifier(field->name), type_name); - } - appendStringInfo(&cmd, - "\n" - ") SERVER %s\n" - " OPTIONS (", stmt->server_name); - foreach (lc, stmt->options) - { - DefElem *defel = lfirst(lc); - - if (lc != list_head(stmt->options)) - appendStringInfo(&cmd, ",\n "); - appendStringInfo(&cmd, "%s '%s'", - defel->defname, - strVal(defel->arg)); - } - appendStringInfo(&cmd, ")"); - - return list_make1(cmd.data); -} - -/* - * pgstrom_arrow_fdw_import_file - * - * NOTE: Due to historical reason, PostgreSQL does not allow to define - * columns more than MaxHeapAttributeNumber (1600) for foreign-tables also, - * not only heap-tables. This restriction comes from NULL-bitmap length - * in HeapTupleHeaderData and width of t_hoff. - * However, it is not a reasonable restriction for foreign-table, because - * it does not use heap-format internally. - */ -static void -__insertPgAttributeTuple(Relation pg_attr_rel, - CatalogIndexState pg_attr_index, - Oid ftable_oid, - AttrNumber attnum, - ArrowField *field) -{ - Oid type_oid; - int32 type_mod; - int16 type_len; - bool type_byval; - char type_align; - int32 type_ndims; - char type_storage; - Datum values[Natts_pg_attribute]; - bool isnull[Natts_pg_attribute]; - HeapTuple tup; - ObjectAddress myself, referenced; - - type_oid = arrowTypeToPGTypeOid(field, &type_mod); - get_typlenbyvalalign(type_oid, - &type_len, - &type_byval, - &type_align); - type_ndims = (type_is_array(type_oid) ? 
1 : 0); - type_storage = get_typstorage(type_oid); - - memset(values, 0, sizeof(values)); - memset(isnull, 0, sizeof(isnull)); - - values[Anum_pg_attribute_attrelid - 1] = ObjectIdGetDatum(ftable_oid); - values[Anum_pg_attribute_attname - 1] = CStringGetDatum(field->name); - values[Anum_pg_attribute_atttypid - 1] = ObjectIdGetDatum(type_oid); - values[Anum_pg_attribute_attstattarget - 1] = Int32GetDatum(-1); - values[Anum_pg_attribute_attlen - 1] = Int16GetDatum(type_len); - values[Anum_pg_attribute_attnum - 1] = Int16GetDatum(attnum); - values[Anum_pg_attribute_attndims - 1] = Int32GetDatum(type_ndims); - values[Anum_pg_attribute_attcacheoff - 1] = Int32GetDatum(-1); - values[Anum_pg_attribute_atttypmod - 1] = Int32GetDatum(type_mod); - values[Anum_pg_attribute_attbyval - 1] = BoolGetDatum(type_byval); - values[Anum_pg_attribute_attstorage - 1] = CharGetDatum(type_storage); - values[Anum_pg_attribute_attalign - 1] = CharGetDatum(type_align); - values[Anum_pg_attribute_attnotnull - 1] = BoolGetDatum(!field->nullable); - values[Anum_pg_attribute_attislocal - 1] = BoolGetDatum(true); - isnull[Anum_pg_attribute_attacl - 1] = true; - isnull[Anum_pg_attribute_attoptions - 1] = true; - isnull[Anum_pg_attribute_attfdwoptions - 1] = true; - isnull[Anum_pg_attribute_attmissingval - 1] = true; - - tup = heap_form_tuple(RelationGetDescr(pg_attr_rel), values, isnull); - CatalogTupleInsertWithInfo(pg_attr_rel, tup, pg_attr_index); - - /* add dependency */ - myself.classId = RelationRelationId; - myself.objectId = ftable_oid; - myself.objectSubId = attnum; - referenced.classId = TypeRelationId; - referenced.objectId = type_oid; - referenced.objectSubId = 0; - recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL); - - heap_freetuple(tup); -} - -Datum -pgstrom_arrow_fdw_import_file(PG_FUNCTION_ARGS) -{ - CreateForeignTableStmt stmt; - ArrowSchema schema; - List *tableElts = NIL; - char *ftable_name; - char *file_name; - char *namespace_name; - DefElem *defel; - int j, nfields; - Oid ftable_oid; - Oid type_oid; - int type_mod; - ObjectAddress myself; - ArrowFileInfo af_info; - - /* read schema of the file */ - if (PG_ARGISNULL(0)) - elog(ERROR, "foreign table name is not supplied"); - ftable_name = text_to_cstring(PG_GETARG_TEXT_PP(0)); - - if (PG_ARGISNULL(1)) - elog(ERROR, "arrow filename is not supplied"); - file_name = text_to_cstring(PG_GETARG_TEXT_PP(1)); - defel = makeDefElem("file", (Node *)makeString(file_name), -1); - - if (PG_ARGISNULL(2)) - namespace_name = NULL; - else - namespace_name = text_to_cstring(PG_GETARG_TEXT_PP(2)); - - readArrowFile(file_name, &af_info, false); - copyArrowNode(&schema.node, &af_info.footer.schema.node); - if (schema._num_fields > SHRT_MAX) - Elog("Arrow file '%s' has too much fields: %d", - file_name, schema._num_fields); - - /* setup CreateForeignTableStmt */ - memset(&stmt, 0, sizeof(CreateForeignTableStmt)); - NodeSetTag(&stmt, T_CreateForeignTableStmt); - stmt.base.relation = makeRangeVar(namespace_name, ftable_name, -1); - - nfields = Min(schema._num_fields, 100); - for (j=0; j < nfields; j++) - { - ColumnDef *cdef; - - type_oid = arrowTypeToPGTypeOid(&schema.fields[j], &type_mod); - cdef = makeColumnDef(schema.fields[j].name, - type_oid, - type_mod, - InvalidOid); - tableElts = lappend(tableElts, cdef); - } - stmt.base.tableElts = tableElts; - stmt.base.oncommit = ONCOMMIT_NOOP; - stmt.servername = "arrow_fdw"; - stmt.options = list_make1(defel); - - myself = DefineRelation(&stmt.base, - RELKIND_FOREIGN_TABLE, - InvalidOid, - NULL, - __FUNCTION__); - 
ftable_oid = myself.objectId; - CreateForeignTable(&stmt, ftable_oid); - - if (nfields < schema._num_fields) - { - Relation c_rel = table_open(RelationRelationId, RowExclusiveLock); - Relation a_rel = table_open(AttributeRelationId, RowExclusiveLock); - CatalogIndexState c_index = CatalogOpenIndexes(c_rel); - CatalogIndexState a_index = CatalogOpenIndexes(a_rel); - HeapTuple tup; - - tup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(ftable_oid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for relation %u", ftable_oid); - - for (j=nfields; j < schema._num_fields; j++) - { - __insertPgAttributeTuple(a_rel, - a_index, - ftable_oid, - j+1, - &schema.fields[j]); - } - /* update relnatts also */ - ((Form_pg_class) GETSTRUCT(tup))->relnatts = schema._num_fields; - CatalogTupleUpdate(c_rel, &tup->t_self, tup); - - CatalogCloseIndexes(a_index); - CatalogCloseIndexes(c_index); - table_close(a_rel, RowExclusiveLock); - table_close(c_rel, RowExclusiveLock); - - CommandCounterIncrement(); - } - PG_RETURN_VOID(); -} -PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_import_file); - -/* - * ArrowIsForeignScanParallelSafe - */ -static bool -ArrowIsForeignScanParallelSafe(PlannerInfo *root, - RelOptInfo *rel, - RangeTblEntry *rte) -{ - return true; -} - -/* - * ArrowEstimateDSMForeignScan - */ -static Size -ArrowEstimateDSMForeignScan(ForeignScanState *node, - ParallelContext *pcxt) -{ - return MAXALIGN(sizeof(pg_atomic_uint32) * 3); -} - -/* - * ArrowInitializeDSMForeignScan - */ -static inline void -__ExecInitDSMArrowFdw(ArrowFdwState *af_state, - pg_atomic_uint32 *rbatch_index, - pg_atomic_uint32 *rbatch_nload, - pg_atomic_uint32 *rbatch_nskip) -{ - pg_atomic_init_u32(rbatch_index, 0); - af_state->rbatch_index = rbatch_index; - pg_atomic_init_u32(rbatch_nload, 0); - af_state->rbatch_nload = rbatch_nload; - pg_atomic_init_u32(rbatch_nskip, 0); - af_state->rbatch_nskip = rbatch_nskip; -} - -void -ExecInitDSMArrowFdw(ArrowFdwState *af_state, GpuTaskSharedState *gtss) -{ - __ExecInitDSMArrowFdw(af_state, - &gtss->af_rbatch_index, - &gtss->af_rbatch_nload, - &gtss->af_rbatch_nskip); -} - -static void -ArrowInitializeDSMForeignScan(ForeignScanState *node, - ParallelContext *pcxt, - void *coordinate) -{ - pg_atomic_uint32 *atomic_buffer = coordinate; - - __ExecInitDSMArrowFdw((ArrowFdwState *)node->fdw_state, - atomic_buffer, - atomic_buffer + 1, - atomic_buffer + 2); -} - -/* - * ArrowReInitializeDSMForeignScan - */ -static void -__ExecReInitDSMArrowFdw(ArrowFdwState *af_state) -{ - pg_atomic_write_u32(af_state->rbatch_index, 0); -} - -void -ExecReInitDSMArrowFdw(ArrowFdwState *af_state) -{ - __ExecReInitDSMArrowFdw(af_state); -} - - -static void -ArrowReInitializeDSMForeignScan(ForeignScanState *node, - ParallelContext *pcxt, - void *coordinate) -{ - __ExecReInitDSMArrowFdw((ArrowFdwState *)node->fdw_state); -} - -/* - * ArrowInitializeWorkerForeignScan - */ -static inline void -__ExecInitWorkerArrowFdw(ArrowFdwState *af_state, - pg_atomic_uint32 *rbatch_index, - pg_atomic_uint32 *rbatch_nload, - pg_atomic_uint32 *rbatch_nskip) -{ - af_state->rbatch_index = rbatch_index; - af_state->rbatch_nload = rbatch_nload; - af_state->rbatch_nskip = rbatch_nskip; -} - -void -ExecInitWorkerArrowFdw(ArrowFdwState *af_state, - GpuTaskSharedState *gtss) -{ - __ExecInitWorkerArrowFdw(af_state, - &gtss->af_rbatch_index, - &gtss->af_rbatch_nload, - &gtss->af_rbatch_nskip); -} - -static void -ArrowInitializeWorkerForeignScan(ForeignScanState *node, - shm_toc *toc, - void *coordinate) -{ - pg_atomic_uint32 *atomic_buffer =
coordinate; - - __ExecInitWorkerArrowFdw((ArrowFdwState *)node->fdw_state, - atomic_buffer, - atomic_buffer + 1, - atomic_buffer + 2); -} - -/* - * ArrowShutdownForeignScan - */ -static inline void -__ExecShutdownArrowFdw(ArrowFdwState *af_state) -{ - uint32 temp; - - temp = pg_atomic_read_u32(af_state->rbatch_index); - pg_atomic_write_u32(&af_state->__rbatch_index_local, temp); - af_state->rbatch_index = &af_state->__rbatch_index_local; - - temp = pg_atomic_read_u32(af_state->rbatch_nload); - pg_atomic_write_u32(&af_state->__rbatch_nload_local, temp); - af_state->rbatch_nload = &af_state->__rbatch_nload_local; - - temp = pg_atomic_read_u32(af_state->rbatch_nskip); - pg_atomic_write_u32(&af_state->__rbatch_nskip_local, temp); - af_state->rbatch_nskip = &af_state->__rbatch_nskip_local; -} - -void -ExecShutdownArrowFdw(ArrowFdwState *af_state) -{ - __ExecShutdownArrowFdw(af_state); -} - -static void -ArrowShutdownForeignScan(ForeignScanState *node) -{ - __ExecShutdownArrowFdw((ArrowFdwState *)node->fdw_state); -} - -/* - * ArrowPlanForeignModify - */ -static List * -ArrowPlanForeignModify(PlannerInfo *root, - ModifyTable *plan, - Index resultRelation, - int subplan_index) -{ - RangeTblEntry *rte = planner_rt_fetch(resultRelation, root); - ForeignTable *ft = GetForeignTable(rte->relid); - List *filesList __attribute__((unused)); - bool writable; - - if (plan->operation != CMD_INSERT) - elog(ERROR, "not a supported operation on arrow_fdw foreign tables"); - - filesList = __arrowFdwExtractFilesList(ft->options, - NULL, - &writable); - if (!writable) - elog(ERROR, "arrow_fdw: foreign table \"%s\" is not writable", - get_rel_name(rte->relid)); - Assert(list_length(filesList) == 1); - - return NIL; -} - -/* - * ArrowBeginForeignModify - */ -static void -__ArrowBeginForeignModify(ResultRelInfo *rrinfo, int eflags) -{ - Relation frel = rrinfo->ri_RelationDesc; - TupleDesc tupdesc = RelationGetDescr(frel); - ForeignTable *ft = GetForeignTable(RelationGetRelid(frel)); - List *filesList = arrowFdwExtractFilesList(ft->options); - const char *fname; - File filp; - struct stat stat_buf; - ArrowFileInfo *af_info = NULL; - arrowWriteState *aw_state; - SQLtable *table; - MetadataCacheKey key; - off_t f_pos; - - Assert(list_length(filesList) == 1); - fname = strVal(linitial(filesList)); - - LockRelation(frel, ShareRowExclusiveLock); - filp = PathNameOpenFile(fname, O_RDWR | PG_BINARY); - if (filp >= 0) - { - af_info = alloca(sizeof(ArrowFileInfo)); - readArrowFileDesc(FileGetRawDesc(filp), af_info); - f_pos = createArrowWriteRedoLog(filp, false); - } - else if (errno == ENOENT) - { - filp = PathNameOpenFile(fname, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); - if (filp < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", fname))); - PG_TRY(); - { - f_pos = createArrowWriteRedoLog(filp, true); - } - PG_CATCH(); - { - unlink(fname); - PG_RE_THROW(); - } - PG_END_TRY(); - } - else - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", fname))); - } - - if (fstat(FileGetRawDesc(filp), &stat_buf) != 0) - elog(ERROR, "failed on fstat('%s'): %m", FilePathName(filp)); - initMetadataCacheKey(&key, &stat_buf); - - aw_state = palloc0(offsetof(arrowWriteState, - sql_table.columns[tupdesc->natts])); - aw_state->memcxt = CurrentMemoryContext; - aw_state->file = filp; - memcpy(&aw_state->key, &key, sizeof(MetadataCacheKey)); - aw_state->hash = key.hash; - table = &aw_state->sql_table; - table->filename = FilePathName(filp); - table->fdesc = 
FileGetRawDesc(filp); - table->f_pos = f_pos; - if (af_info) - setupArrowSQLbufferBatches(table, af_info); - setupArrowSQLbufferSchema(table, tupdesc, af_info); - - rrinfo->ri_FdwState = aw_state; -} - -static void -ArrowBeginForeignModify(ModifyTableState *mtstate, - ResultRelInfo *rrinfo, - List *fdw_private, - int subplan_index, - int eflags) -{ - __ArrowBeginForeignModify(rrinfo, eflags); -} - -/* - * ArrowExecForeignInsert - */ -static TupleTableSlot * -ArrowExecForeignInsert(EState *estate, - ResultRelInfo *rrinfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot) -{ - Relation frel = rrinfo->ri_RelationDesc; - TupleDesc tupdesc = RelationGetDescr(frel); - arrowWriteState *aw_state = rrinfo->ri_FdwState; - SQLtable *table = &aw_state->sql_table; - MemoryContext oldcxt; - size_t usage = 0; - int j; - - slot_getallattrs(slot); - oldcxt = MemoryContextSwitchTo(aw_state->memcxt); - for (j=0; j < tupdesc->natts; j++) - { - Form_pg_attribute attr = tupleDescAttr(tupdesc, j); - SQLfield *column = &table->columns[j]; - Datum datum = slot->tts_values[j]; - bool isnull = slot->tts_isnull[j]; - - if (isnull) - { - usage += sql_field_put_value(column, NULL, 0); - } - else if (attr->attbyval) - { - Assert(column->sql_type.pgsql.typbyval); - usage += sql_field_put_value(column, (char *)&datum, attr->attlen); - } - else if (attr->attlen == -1) - { - int vl_len = VARSIZE_ANY_EXHDR(datum); - char *vl_ptr = VARDATA_ANY(datum); - - Assert(column->sql_type.pgsql.typlen == -1); - usage += sql_field_put_value(column, vl_ptr, vl_len); - } - else - { - elog(ERROR, "Bug? unsupported type format"); - } - } - table->usage = usage; - table->nitems++; - MemoryContextSwitchTo(oldcxt); - - /* - * If usage exceeds the threshold of record-batch size, make a redo-log - * on demand, and write out the buffer. - */ - if (usage > table->segment_sz) - writeOutArrowRecordBatch(aw_state, false); - - return slot; -} - -/* - * ArrowEndForeignModify - */ -static void -ArrowEndForeignModify(EState *estate, - ResultRelInfo *rrinfo) -{ - arrowWriteState *aw_state = rrinfo->ri_FdwState; - - writeOutArrowRecordBatch(aw_state, true); -} - -#if PG_VERSION_NUM >= 110000 -/* - * MEMO: executor begin/end routine, if arrow_fdw is partitioned-leaf - * relations. In this case, ArrowBeginForeignModify shall not be called. 
- */ -static void -ArrowBeginForeignInsert(ModifyTableState *mtstate, - ResultRelInfo *rrinfo) -{ - __ArrowBeginForeignModify(rrinfo, 0); -} - -static void -ArrowEndForeignInsert(EState *estate, ResultRelInfo *rrinfo) -{ - arrowWriteState *aw_state = rrinfo->ri_FdwState; - - writeOutArrowRecordBatch(aw_state, true); + if (rb_field->null_count > 0) + len += rb_field->nullmap_length; + len += (rb_field->values_length + + rb_field->extra_length); + for (int j=0; j < rb_field->num_children; j++) + len += __recordBatchFieldLength(&rb_field->children[j]); + return len; } -#endif -/* - * ArrowExplainForeignModify - */ static void -ArrowExplainForeignModify(ModifyTableState *mtstate, - ResultRelInfo *rinfo, - List *fdw_private, - int subplan_index, - struct ExplainState *es) -{ - /* print something */ -} - -/* - * handler of Arrow_Fdw - */ -Datum -pgstrom_arrow_fdw_handler(PG_FUNCTION_ARGS) -{ - PG_RETURN_POINTER(&pgstrom_arrow_fdw_routine); -} -PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_handler); - -/* - * arrowFieldGetPGTypeHint - */ -static Oid -arrowFieldGetPGTypeHint(ArrowField *field) +ArrowGetForeignRelSize(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid) { - Oid hint_oid = InvalidOid; - int i, j; + ForeignTable *ft = GetForeignTable(foreigntableid); + Relation frel = table_open(foreigntableid, NoLock); + List *filesList; + List *results = NIL; + Bitmapset *referenced = NULL; + ListCell *lc1, *lc2; + size_t totalLen = 0; + double ntuples = 0.0; + int parallel_nworkers; - for (i=0; i < field->_num_custom_metadata; i++) + /* columns to be referenced */ + foreach (lc1, baserel->baserestrictinfo) { - ArrowKeyValue *kv = &field->custom_metadata[i]; - char *namebuf, *pos; - Oid namespace_oid; - HeapTuple tup; - - if (kv->_key_len != 7 || strncmp(kv->key, "pg_type", 7) != 0) - continue; - namebuf = alloca(kv->_value_len + 10); - /* namespace name */ - pos = namebuf; - for (j=0; j < kv->_value_len; j++) - { - int c = kv->value[j]; - - if (c == '.') - break; - else if (c == '\\' && ++j < kv->_value_len) - c = kv->value[j]; - *pos++ = c; - } - *pos++ = '\0'; - - namespace_oid = get_namespace_oid(namebuf, true); - if (!OidIsValid(namespace_oid)) - continue; - - /* type name */ - pos = namebuf; - for (j++; j < kv->_value_len; j++) - { - int c = kv->value[j]; - - if (c == '\\' && ++j < kv->_value_len) - c = kv->value[j]; - *pos++ = c; - } - *pos++ = '\0'; - - tup = SearchSysCache2(TYPENAMENSP, - PointerGetDatum(namebuf), - ObjectIdGetDatum(namespace_oid)); - if (!HeapTupleIsValid(tup)) - continue; - hint_oid = PgTypeTupleGetOid(tup); + RestrictInfo *rinfo = lfirst(lc1); - ReleaseSysCache(tup); - - return hint_oid; + pull_varattnos((Node *)rinfo->clause, baserel->relid, &referenced); } - return InvalidOid; -} + referenced = pickup_outer_referenced(root, baserel, referenced); -static bool -__arrowStructTypeIsCompatible(ArrowField *field, Oid comp_oid) -{ - TupleDesc tupdesc; - int j; - bool compatible = false; - - if (pg_type_aclcheck(comp_oid, - GetUserId(), - ACL_USAGE) != ACLCHECK_OK) - return false; - - tupdesc = lookup_rowtype_tupdesc_noerror(comp_oid, -1, true); - if (tupdesc && tupdesc->natts == field->_num_children) + /* read arrow-file metadata */ + filesList = arrowFdwExtractFilesList(ft->options, &parallel_nworkers); + foreach (lc1, filesList) { - for (j=0; j < tupdesc->natts; j++) - { - Form_pg_attribute attr = tupleDescAttr(tupdesc, j); - ArrowField *child = &field->children[j]; - Oid typoid; - int typmod; - - typoid = arrowTypeToPGTypeOid(child, &typmod); - if (typoid != attr->atttypid)
- break; - } - if (j >= tupdesc->natts) - compatible = true; - } - if (tupdesc) - ReleaseTupleDesc(tupdesc); - - return compatible; -} - -static Oid -arrowTypeToPGTypeOid(ArrowField *field, int *p_type_mod) -{ - ArrowType *t = &field->type; - Oid hint_oid; - int i; + ArrowFileState *af_state; + char *fname = strVal(lfirst(lc1)); - hint_oid = arrowFieldGetPGTypeHint(field); - - /* extra module may provide own mapping */ - for (i=0; i < pgstrom_num_users_extra; i++) - { - pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; - Oid type_oid; + af_state = BuildArrowFileState(frel, fname, NULL); + if (!af_state) + continue; - if (extra->arrow_lookup_pgtype) + /* + * Size calculation based on the record-batch metadata + */ + foreach (lc2, af_state->rb_list) { - type_oid = extra->arrow_lookup_pgtype(field, hint_oid, p_type_mod); - if (OidIsValid(type_oid)) - return type_oid; - } - } + RecordBatchState *rb_state = lfirst(lc2); - *p_type_mod = -1; - switch (t->node.tag) - { - case ArrowNodeTag__Int: - switch (t->Int.bitWidth) - { - case 8: - return INT1OID; - case 16: - return INT2OID; - case 32: - return INT4OID; - case 64: - return INT8OID; - default: - elog(ERROR, "%s is not supported", - arrowNodeName(&t->node)); - break; - } - break; - case ArrowNodeTag__FloatingPoint: - switch (t->FloatingPoint.precision) + /* whole-row reference? */ + if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, referenced)) { - case ArrowPrecision__Half: - return FLOAT2OID; - case ArrowPrecision__Single: - return FLOAT4OID; - case ArrowPrecision__Double: - return FLOAT8OID; - default: - elog(ERROR, "%s is not supported", - arrowNodeName(&t->node)); + totalLen += rb_state->rb_length; } - break; - case ArrowNodeTag__Utf8: - return TEXTOID; - case ArrowNodeTag__Binary: - return BYTEAOID; - case ArrowNodeTag__Bool: - return BOOLOID; - case ArrowNodeTag__Decimal: - if (t->Decimal.bitWidth == 128) - return NUMERICOID; - break; - case ArrowNodeTag__Date: - return DATEOID; - case ArrowNodeTag__Time: - return TIMEOID; - case ArrowNodeTag__Timestamp: - if (t->Timestamp.timezone) - return TIMESTAMPTZOID; - return TIMESTAMPOID; - case ArrowNodeTag__Interval: - return INTERVALOID; - case ArrowNodeTag__List: - if (field->_num_children != 1) - elog(ERROR, "arrow_fdw: corrupted List type definition"); else { - ArrowField *child = &field->children[0]; - Oid type_oid; - Oid elem_oid; - int elem_mod; - - elem_oid = arrowTypeToPGTypeOid(child, &elem_mod); - type_oid = get_array_type(elem_oid); - if (!OidIsValid(type_oid)) - elog(ERROR, "array of %s type is not defined", - arrowNodeName(&t->node)); - return type_oid; - } - break; - - case ArrowNodeTag__Struct: - if (!OidIsValid(hint_oid) || - !__arrowStructTypeIsCompatible(field, hint_oid)) - { - Relation rel; - ScanKeyData skey[2]; - SysScanDesc sscan; - HeapTuple tup; + int j, k; - /* - * lookup composite type definition from pg_class - * At least, nattrs == _num_children - */ - rel = table_open(RelationRelationId, AccessShareLock); - ScanKeyInit(&skey[0], - Anum_pg_class_relkind, - BTEqualStrategyNumber, F_CHAREQ, - CharGetDatum(RELKIND_COMPOSITE_TYPE)); - ScanKeyInit(&skey[1], - Anum_pg_class_relnatts, - BTEqualStrategyNumber, F_INT2EQ, - Int16GetDatum(field->_num_children)); - - sscan = systable_beginscan(rel, InvalidOid, false, - NULL, 2, skey); - hint_oid = InvalidOid; - while (!OidIsValid(hint_oid) && - HeapTupleIsValid(tup = systable_getnext(sscan))) + for (k = bms_next_member(referenced, -1); + k >= 0; + k = bms_next_member(referenced, k)) { - Oid reltype =
- Oid reltype = ((Form_pg_class) GETSTRUCT(tup))->reltype; - - if (__arrowStructTypeIsCompatible(field, reltype)) - hint_oid = reltype; + j = k + FirstLowInvalidHeapAttributeNumber; + if (j <= 0 || j > rb_state->nfields) + continue; + totalLen += __recordBatchFieldLength(&rb_state->fields[j-1]); } - systable_endscan(sscan); - table_close(rel, AccessShareLock); - - if (!OidIsValid(hint_oid)) - elog(ERROR, "arrow::%s is not supported", - arrowNodeName(&t->node)); - } - return hint_oid; - - case ArrowNodeTag__FixedSizeBinary: - if (t->FixedSizeBinary.byteWidth < 1 || - t->FixedSizeBinary.byteWidth > BLCKSZ) - elog(ERROR, "arrow_fdw: %s with byteWidth=%d is not supported", - t->node.tagName, - t->FixedSizeBinary.byteWidth); - if (hint_oid == MACADDROID && - t->FixedSizeBinary.byteWidth == sizeof(macaddr)) - { - return MACADDROID; - } - else if (hint_oid == INETOID && - (t->FixedSizeBinary.byteWidth == 4 || - t->FixedSizeBinary.byteWidth == 16)) - { - return INETOID; } - *p_type_mod = t->FixedSizeBinary.byteWidth; - return BPCHAROID; - default: - elog(ERROR, "arrow_fdw: type '%s' is not supported", - field->type.node.tagName); + ntuples += rb_state->rb_nitems; + } + results = lappend(results, af_state); } - return InvalidOid; + table_close(frel, NoLock); + + /* setup baserel */ + baserel->rel_parallel_workers = parallel_nworkers; + baserel->fdw_private = list_make2(results, referenced); + baserel->pages = totalLen / BLCKSZ; + baserel->tuples = ntuples; + baserel->rows = ntuples * + clauselist_selectivity(root, + baserel->baserestrictinfo, + 0, + JOIN_INNER, + NULL); } -static const char * -arrowTypeToPGTypeName(ArrowField *field) +/* + * cost_arrow_fdw_seqscan + */ +static void +cost_arrow_fdw_seqscan(Path *path, + PlannerInfo *root, + RelOptInfo *baserel, + ParamPathInfo *param_info, + int num_workers) { - Oid typoid; - int typmod; - HeapTuple tup; - Form_pg_type type; - char *schema; - char *result; + Cost startup_cost = 0.0; + Cost disk_run_cost = 0.0; + Cost cpu_run_cost = 0.0; + QualCost qcost; + double nrows; + double spc_seq_page_cost; - typoid = arrowTypeToPGTypeOid(field, &typmod); - if (!OidIsValid(typoid)) - return NULL; - tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typoid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for type %u", typoid); - type = (Form_pg_type) GETSTRUCT(tup); - schema = get_namespace_name(type->typnamespace); - if (typmod < 0) - result = psprintf("%s.%s", - quote_identifier(schema), - quote_identifier(NameStr(type->typname))); + if (param_info) + nrows = param_info->ppi_rows; else - result = psprintf("%s.%s(%d)", - quote_identifier(schema), - quote_identifier(NameStr(type->typname)), - typmod); - ReleaseSysCache(tup); + nrows = baserel->rows; - return result; -} + /* arrow_fdw.enabled */ + if (!arrow_fdw_enabled) + startup_cost += disable_cost; -#if 0 -//no longer needed? + /* + * Storage costs + * + * XXX - reading fewer columns should mean less disk cost thanks to the + * columnar format. Right now, we don't discount the cost of the pages + * that are not read.
+ */ + get_tablespace_page_costs(baserel->reltablespace, + NULL, + &spc_seq_page_cost); + disk_run_cost = spc_seq_page_cost * baserel->pages; -/* - * arrowTypeIsConvertible - */ -static bool -arrowTypeIsConvertible(Oid type_oid, int typemod) -{ - HeapTuple tup; - Form_pg_type typeForm; - bool retval = false; + /* CPU costs */ + if (param_info) + { + cost_qual_eval(&qcost, param_info->ppi_clauses, root); + qcost.startup += baserel->baserestrictcost.startup; + qcost.per_tuple += baserel->baserestrictcost.per_tuple; + } + else + qcost = baserel->baserestrictcost; + startup_cost += qcost.startup; + cpu_run_cost = (cpu_tuple_cost + qcost.per_tuple) * baserel->tuples; + + /* tlist evaluation costs */ + startup_cost += path->pathtarget->cost.startup; + cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows; - switch (type_oid) + /* adjust cost for CPU parallelism */ + if (num_workers > 0) { - case INT1OID: /* Int8 */ - case INT2OID: /* Int16 */ - case INT4OID: /* Int32 */ - case INT8OID: /* Int64 */ - case FLOAT2OID: /* FP16 */ - case FLOAT4OID: /* FP32 */ - case FLOAT8OID: /* FP64 */ - case TEXTOID: /* Utf8 */ - case BYTEAOID: /* Binary */ - case BOOLOID: /* Bool */ - case NUMERICOID: /* Decimal */ - case DATEOID: /* Date */ - case TIMEOID: /* Time */ - case TIMESTAMPOID: /* Timestamp */ - case TIMESTAMPTZOID:/* TimestampTz */ - case INTERVALOID: /* Interval */ - case BPCHAROID: /* FixedSizeBinary */ - return true; - default: - tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type_oid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for type %u", type_oid); - typeForm = (Form_pg_type) GETSTRUCT(tup); + double leader_contribution; + double parallel_divisor = (double) num_workers; - if (OidIsValid(typeForm->typelem) && typeForm->typlen == -1) - { - retval = arrowTypeIsConvertible(typeForm->typelem, typemod); - } - else if (typeForm->typtype == TYPTYPE_COMPOSITE) - { - Relation rel; - TupleDesc tupdesc; - int j; + /* see get_parallel_divisor() */ + leader_contribution = 1.0 - (0.3 * (double)num_workers); + parallel_divisor += Max(leader_contribution, 0.0); - rel = relation_open(typeForm->typrelid, AccessShareLock); - tupdesc = RelationGetDescr(rel); - for (j=0; j < tupdesc->natts; j++) - { - Form_pg_attribute attr = tupleDescAttr(tupdesc, j); + /* The CPU cost is divided among all the workers. 
*/ + cpu_run_cost /= parallel_divisor; - if (!arrowTypeIsConvertible(attr->atttypid, - attr->atttypmod)) - break; - } - if (j >= tupdesc->natts) - retval = true; - relation_close(rel, AccessShareLock); - } - ReleaseSysCache(tup); + /* Estimated row count per background worker process */ + nrows = clamp_row_est(nrows / parallel_divisor); } - return retval; + path->rows = nrows; + path->startup_cost = startup_cost; + path->total_cost = startup_cost + cpu_run_cost + disk_run_cost; + path->parallel_workers = num_workers; } -#endif /* - * arrowFieldLength + * ArrowGetForeignPaths */ -static size_t -arrowFieldLength(ArrowField *field, int64 nitems) +static void +ArrowGetForeignPaths(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid) { - ArrowType *type = &field->type; - size_t length = 0; + ForeignPath *fpath; + ParamPathInfo *param_info; + Relids required_outer = baserel->lateral_relids; + + param_info = get_baserel_parampathinfo(root, baserel, required_outer); + fpath = create_foreignscan_path(root, + baserel, + NULL, /* default pathtarget */ + -1.0, /* dummy */ + -1.0, /* dummy */ + -1.0, /* dummy */ + NIL, /* no pathkeys */ + required_outer, + NULL, /* no extra plan */ + NIL); /* no particular private */ + cost_arrow_fdw_seqscan(&fpath->path, + root, + baserel, + param_info, 0); + add_path(baserel, &fpath->path); - switch (type->node.tag) + if (baserel->consider_parallel) { - case ArrowNodeTag__Int: - switch (type->Int.bitWidth) - { - case 8: - length = nitems; - break; - case 16: - length = 2 * nitems; - break; - case 32: - length = 4 * nitems; - break; - case 64: - length = 8 * nitems; - break; - default: - elog(ERROR, "Not a supported Int width: %d", - type->Int.bitWidth); - } - break; - case ArrowNodeTag__FloatingPoint: - switch (type->FloatingPoint.precision) - { - case ArrowPrecision__Half: - length = sizeof(cl_short) * nitems; - break; - case ArrowPrecision__Single: - length = sizeof(cl_float) * nitems; - break; - case ArrowPrecision__Double: - length = sizeof(cl_double) * nitems; - break; - default: - elog(ERROR, "Not a supported FloatingPoint precision"); - } - break; - case ArrowNodeTag__Utf8: - case ArrowNodeTag__Binary: - case ArrowNodeTag__List: - length = sizeof(cl_uint) * (nitems + 1); - break; - case ArrowNodeTag__Bool: - length = BITMAPLEN(nitems); - break; - case ArrowNodeTag__Decimal: - length = sizeof(int128) * nitems; - break; - case ArrowNodeTag__Date: - switch (type->Date.unit) - { - case ArrowDateUnit__Day: - length = sizeof(cl_int) * nitems; - break; - case ArrowDateUnit__MilliSecond: - length = sizeof(cl_long) * nitems; - break; - default: - elog(ERROR, "Not a supported Date unit"); - } - break; - case ArrowNodeTag__Time: - switch (type->Time.unit) - { - case ArrowTimeUnit__Second: - case ArrowTimeUnit__MilliSecond: - length = sizeof(cl_int) * nitems; - break; - case ArrowTimeUnit__MicroSecond: - case ArrowTimeUnit__NanoSecond: - length = sizeof(cl_long) * nitems; - break; - default: - elog(ERROR, "Not a supported Time unit"); - } - break; - case ArrowNodeTag__Timestamp: - length = sizeof(cl_long) * nitems; - break; - case ArrowNodeTag__Interval: - switch (type->Interval.unit) - { - case ArrowIntervalUnit__Year_Month: - length = sizeof(cl_uint) * nitems; - break; - case ArrowIntervalUnit__Day_Time: - length = sizeof(cl_long) * nitems; - break; - default: - elog(ERROR, "Not a supported Interval unit"); - } - break; - case ArrowNodeTag__Struct: //to be supported later - length = 0; /* only nullmap */ - break; - case ArrowNodeTag__FixedSizeBinary: - length 
= (size_t)type->FixedSizeBinary.byteWidth * nitems; - break; - default: - elog(ERROR, "Arrow Type '%s' is not supported now", - type->node.tagName); - break; + int num_workers = + compute_parallel_worker(baserel, + baserel->pages, -1.0, + max_parallel_workers_per_gather); + if (num_workers == 0) + return; + + fpath = create_foreignscan_path(root, + baserel, + NULL, /* default pathtarget */ + -1.0, /* dummy */ + -1.0, /* dummy */ + -1.0, /* dummy */ + NIL, /* no pathkeys */ + required_outer, + NULL, /* no extra plan */ + NIL); /* no particular private */ + fpath->path.parallel_aware = true; + cost_arrow_fdw_seqscan(&fpath->path, + root, + baserel, + param_info, + num_workers); + add_partial_path(baserel, (Path *)fpath); } - return length; } /* - * arrowSchemaCompatibilityCheck + * ArrowGetForeignPlan */ -static bool -__arrowSchemaCompatibilityCheck(TupleDesc tupdesc, - RecordBatchFieldState *rb_fstate) +static ForeignScan * +ArrowGetForeignPlan(PlannerInfo *root, + RelOptInfo *baserel, + Oid foreigntableid, + ForeignPath *best_path, + List *tlist, + List *scan_clauses, + Plan *outer_plan) { - int j; + Bitmapset *referenced = lsecond(baserel->fdw_private); + List *ref_list = NIL; + int k; - for (j=0; j < tupdesc->natts; j++) + for (k = bms_next_member(referenced, -1); + k >= 0; + k = bms_next_member(referenced, k)) { - RecordBatchFieldState *fstate = &rb_fstate[j]; - Form_pg_attribute attr = tupleDescAttr(tupdesc, j); - - if (!fstate->children) - { - /* shortcut, it should be a scalar built-in type */ - Assert(fstate->num_children == 0); - if (attr->atttypid != fstate->atttypid) - return false; - } - else - { - Form_pg_type typ; - HeapTuple tup; - bool type_is_ok = true; - - tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(attr->atttypid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for type %u", attr->atttypid); - typ = (Form_pg_type) GETSTRUCT(tup); - if (OidIsValid(typ->typelem) && typ->typlen == -1 && - fstate->num_children == 1) - { - /* Arrow::List */ - RecordBatchFieldState *cstate = &fstate->children[0]; - - if (typ->typelem == cstate->atttypid) - { - /* - * overwrite typoid / typmod because a same arrow file - * can be reused, and it may be on behalf of different - * user defined data type. 
- */ - fstate->atttypid = attr->atttypid; - fstate->atttypmod = attr->atttypmod; - } - else - { - type_is_ok = false; - } - } - else if (typ->typlen == -1 && OidIsValid(typ->typrelid)) - { - /* Arrow::Struct */ - TupleDesc sdesc = lookup_rowtype_tupdesc(attr->atttypid, - attr->atttypmod); - if (sdesc->natts == fstate->num_children && - __arrowSchemaCompatibilityCheck(sdesc, fstate->children)) - { - /* see comment above */ - fstate->atttypid = attr->atttypid; - fstate->atttypmod = attr->atttypmod; - } - else - { - type_is_ok = false; - } - DecrTupleDescRefCount(sdesc); - - } - else - { - /* unknown */ - type_is_ok = false; - } - ReleaseSysCache(tup); - if (!type_is_ok) - return false; - } } - return true; + return make_foreignscan(tlist, + extract_actual_clauses(scan_clauses, false), + baserel->relid, + NIL, /* no expressions to evaluate */ + ref_list, /* list of referenced attnums */ + NIL, /* no custom tlist */ + NIL, /* no remote quals */ + outer_plan); } -static bool -arrowSchemaCompatibilityCheck(TupleDesc tupdesc, RecordBatchState *rb_state) +/* ---------------------------------------------------------------- + * + * Routines related to Arrow datum fetch + * + * ---------------------------------------------------------------- + */ +static void pg_datum_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, + size_t index, + Datum *p_datum, + bool *p_isnull); + +static Datum +pg_varlena32_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) { - if (tupdesc->natts != rb_state->ncols) - return false; - return __arrowSchemaCompatibilityCheck(tupdesc, rb_state->columns); + uint32_t *offset = (uint32_t *)((char *)kds + + __kds_unpack(cmeta->values_offset)); + char *extra = (char *)kds + __kds_unpack(cmeta->extra_offset); + uint32_t len; + struct varlena *res; + + if (sizeof(uint32_t) * (index+2) > __kds_unpack(cmeta->values_length)) + elog(ERROR, "corruption? varlena index out of range"); + len = offset[index+1] - offset[index]; + if (offset[index] > offset[index+1] || + offset[index+1] > __kds_unpack(cmeta->extra_length)) + elog(ERROR, "corruption? varlena points out of extra buffer"); + if (len >= (1UL<<VARLENA_EXTSIZE_BITS)) + elog(ERROR, "variable length datum is too long"); + + res = palloc(VARHDRSZ + len); + SET_VARSIZE(res, VARHDRSZ + len); + memcpy(VARDATA(res), extra + offset[index], len); + + return PointerGetDatum(res); +}
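Both variable-length readers decode Arrow's varlena layout: an array of nitems+1 monotonically increasing offsets plus a separate "extra" buffer, where element i occupies bytes [offset[i], offset[i+1]) of the extra buffer; pg_varlena32_arrow_ref above handles Utf8/Binary (32-bit offsets) and pg_varlena64_arrow_ref below handles LargeUtf8/LargeBinary (64-bit offsets). A standalone sketch of the lookup itself, using plain hypothetical buffers in place of the kern_data_store fields:

    #include <stdint.h>
    #include <stddef.h>

    /* returns the element length; *p_data receives a pointer to its bytes */
    static uint32_t
    arrow_varlena_fetch(const uint32_t *offsets, const char *extra,
                        size_t index, const char **p_data)
    {
        uint32_t len = offsets[index + 1] - offsets[index];

        *p_data = extra + offsets[index];
        return len;
    }

The surrounding checks in the patch (offset monotonicity, offsets bounded by the extra buffer, and a maximum varlena length) validate data read from disk before this arithmetic is trusted.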
static Datum -pg_varlena_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) +pg_varlena64_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) { - cl_uint *offset = (cl_uint *)((char *)kds + - __kds_unpack(cmeta->values_offset)); + uint64_t *offset = (uint64_t *)((char *)kds + + __kds_unpack(cmeta->values_offset)); char *extra = (char *)kds + __kds_unpack(cmeta->extra_offset); - cl_uint len; + uint64_t len; struct varlena *res; - if (sizeof(uint32) * (index+2) > __kds_unpack(cmeta->values_length)) + if (sizeof(uint64_t) * (index+2) > __kds_unpack(cmeta->values_length)) elog(ERROR, "corruption? varlena index out of range"); len = offset[index+1] - offset[index]; if (offset[index] > offset[index+1] || offset[index+1] > __kds_unpack(cmeta->extra_length)) elog(ERROR, "corruption? varlena points out of extra buffer"); - + if (len >= (1UL<<VARLENA_EXTSIZE_BITS)) + elog(ERROR, "variable length datum is too long"); res = palloc(VARHDRSZ + len); SET_VARSIZE(res, VARHDRSZ + len); memcpy(VARDATA(res), extra + offset[index], len); return PointerGetDatum(res); } static Datum pg_bpchar_arrow_ref(kern_data_store *kds, kern_colmeta *cmeta, size_t index) { - cl_char *values = ((char *)kds + __kds_unpack(cmeta->values_offset)); + char *values = ((char *)kds + __kds_unpack(cmeta->values_offset)); size_t length = __kds_unpack(cmeta->values_length); - cl_int unitsz = cmeta->atttypmod - VARHDRSZ; + int32_t unitsz = cmeta->attopts.fixed_size_binary.byteWidth; struct varlena *res; if (unitsz <= 0) @@ -4000,62 +2862,29 @@ static Datum pg_bool_arrow_ref(kern_data_store *kds, kern_colmeta *cmeta, size_t index) { - uint8 *bitmap = (uint8 *)kds + __kds_unpack(cmeta->values_offset); - size_t length = __kds_unpack(cmeta->values_length); - uint8 mask = (1 << (index & 7)); + uint8_t *bitmap = (uint8_t *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + uint8_t mask = (1 << (index & 7)); - index >>= 3; - if (sizeof(uint8) * index >= length) + if (sizeof(uint8_t) * (index>>3) >= length) elog(ERROR, "corruption? bool points out of range"); - return BoolGetDatum((bitmap[index] & mask) != 0 ? true : false); -} - -static Datum -pg_int1_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) -{ - int8 *values = (int8 *)((char *)kds + __kds_unpack(cmeta->values_offset)); - size_t length = __kds_unpack(cmeta->values_length); - - if (sizeof(int8) * index >= length) - elog(ERROR, "corruption? int8 points out of range"); - return values[index]; -} - -static Datum -pg_int2_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) -{ - int16 *values = (int16 *)((char *)kds + __kds_unpack(cmeta->values_offset)); - size_t length = __kds_unpack(cmeta->values_length); - - if (sizeof(int16) * index >= length) - elog(ERROR, "corruption? int16 points out of range"); - return values[index]; -} - -static Datum -pg_int4_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) -{ - int32 *values = (int32 *)((char *)kds + __kds_unpack(cmeta->values_offset)); - size_t length = __kds_unpack(cmeta->values_length); - - if (sizeof(int32) * index >= length) - elog(ERROR, "corruption? int32 points out of range"); - return values[index]; + return BoolGetDatum((bitmap[index>>3] & mask) != 0 ? true : false); } static Datum -pg_int8_arrow_ref(kern_data_store *kds, - kern_colmeta *cmeta, size_t index) +pg_simple_arrow_ref(kern_data_store *kds, + kern_colmeta *cmeta, size_t index) { - int64 *values = (int64 *)((char *)kds + __kds_unpack(cmeta->values_offset)); - size_t length = __kds_unpack(cmeta->values_length); + int32_t unitsz = cmeta->attopts.unitsz; + char *values = (char *)kds + __kds_unpack(cmeta->values_offset); + size_t length = __kds_unpack(cmeta->values_length); + Datum retval = 0; - if (sizeof(int64) * index >= length) - elog(ERROR, "corruption? int64 points out of range"); - return values[index]; + Assert(unitsz > 0 && unitsz <= sizeof(Datum)); + if (unitsz * index >= length) + elog(ERROR, "corruption? simple datum points out of range"); + memcpy(&retval, values + unitsz * index, unitsz); + return retval; } static Datum @@ -4066,18 +2895,12 @@ pg_numeric_arrow_ref(kern_data_store *kds, char *base = (char *)kds + __kds_unpack(cmeta->values_offset); size_t length = __kds_unpack(cmeta->values_length); int dscale = cmeta->attopts.decimal.scale; - Int128_t decimal; + int128_t ival; - if (sizeof(int128) * index >= length) + if (sizeof(int128_t) * index >= length) elog(ERROR, "corruption?
numeric points out of range"); - decimal.ival = ((int128 *)base)[index]; - - while (dscale > 0 && decimal.ival % 10 == 0) - { - decimal.ival /= 10; - dscale--; - } - pg_numeric_to_varlena(result, dscale, decimal); + ival = ((int128_t *)base)[index]; + __xpu_numeric_to_varlena(result, dscale, ival); return PointerGetDatum(result); } @@ -4267,11 +3090,11 @@ pg_inet_arrow_ref(kern_data_store *kds, static Datum pg_array_arrow_ref(kern_data_store *kds, kern_colmeta *smeta, - cl_uint start, cl_uint end) + uint32_t start, uint32_t end) { ArrayType *res; size_t sz; - cl_uint i, nitems = end - start; + uint32_t i, nitems = end - start; bits8 *nullmap = NULL; size_t usage, __usage; @@ -4336,7 +3159,7 @@ pg_array_arrow_ref(kern_data_store *kds, } else if (smeta->attlen == -1) { - cl_int vl_len = VARSIZE(datum); + int32_t vl_len = VARSIZE(datum); if (nullmap) nullmap[i>>3] |= (1<<(i&7)); @@ -4372,7 +3195,7 @@ pg_datum_arrow_ref(kern_data_store *kds, bool *p_isnull) { Datum datum = 0; - bool isnull = true; + bool isnull = false; if (cmeta->nullmap_offset != 0) { @@ -4380,127 +3203,134 @@ pg_datum_arrow_ref(kern_data_store *kds, uint8 *nullmap = (uint8 *)kds + nullmap_offset; if (att_isnull(index, nullmap)) + { + isnull = true; goto out; + } } - - if (cmeta->atttypkind == TYPE_KIND__ARRAY) - { - /* array type */ - kern_colmeta *smeta; - uint32 *offset; - - if (cmeta->num_subattrs != 1 || - cmeta->idx_subattrs < kds->ncols || - cmeta->idx_subattrs >= kds->nr_colmeta) - elog(ERROR, "Bug? corrupted kernel column metadata"); - if (sizeof(uint32) * (index+2) > __kds_unpack(cmeta->values_length)) - elog(ERROR, "Bug? array index is out of range"); - smeta = &kds->colmeta[cmeta->idx_subattrs]; - offset = (uint32 *)((char *)kds + __kds_unpack(cmeta->values_offset)); - datum = pg_array_arrow_ref(kds, smeta, - offset[index], - offset[index+1]); - isnull = false; - } - else if (cmeta->atttypkind == TYPE_KIND__COMPOSITE) + + switch (cmeta->attopts.tag) { - /* composite type */ - TupleDesc tupdesc = lookup_rowtype_tupdesc(cmeta->atttypid, -1); - Datum *sub_values = alloca(sizeof(Datum) * tupdesc->natts); - bool *sub_isnull = alloca(sizeof(bool) * tupdesc->natts); - HeapTuple htup; - int j; - - if (tupdesc->natts != cmeta->num_subattrs) - elog(ERROR, "Struct definition is conrrupted?"); - if (cmeta->idx_subattrs < kds->ncols || - cmeta->idx_subattrs + cmeta->num_subattrs > kds->nr_colmeta) - elog(ERROR, "Bug? 
strange kernel column metadata"); - - for (j=0; j < tupdesc->natts; j++) - { - kern_colmeta *sub_meta = &kds->colmeta[cmeta->idx_subattrs + j]; - - pg_datum_arrow_ref(kds, sub_meta, index, - sub_values + j, - sub_isnull + j); - } - htup = heap_form_tuple(tupdesc, sub_values, sub_isnull); + case ArrowType__Int: + case ArrowType__FloatingPoint: + datum = pg_simple_arrow_ref(kds, cmeta, index); + break; + case ArrowType__Bool: + datum = pg_bool_arrow_ref(kds, cmeta, index); + break; + case ArrowType__Decimal: + datum = pg_numeric_arrow_ref(kds, cmeta, index); + break; + case ArrowType__Date: + datum = pg_date_arrow_ref(kds, cmeta, index); + break; + case ArrowType__Time: + datum = pg_time_arrow_ref(kds, cmeta, index); + break; + case ArrowType__Timestamp: + datum = pg_timestamp_arrow_ref(kds, cmeta, index); + break; + case ArrowType__Interval: + datum = pg_interval_arrow_ref(kds, cmeta, index); + break; + case ArrowType__Utf8: + case ArrowType__Binary: + datum = pg_varlena32_arrow_ref(kds, cmeta, index); + break; + case ArrowType__LargeUtf8: + case ArrowType__LargeBinary: + datum = pg_varlena64_arrow_ref(kds, cmeta, index); + break; - ReleaseTupleDesc(tupdesc); + case ArrowType__FixedSizeBinary: + switch (cmeta->atttypid) + { + case MACADDROID: + datum = pg_macaddr_arrow_ref(kds, cmeta, index); + break; + case INETOID: + datum = pg_inet_arrow_ref(kds, cmeta, index); + break; + case BPCHAROID: + datum = pg_bpchar_arrow_ref(kds, cmeta, index); + break; + default: + elog(ERROR, "unknown FixedSizeBinary mapping"); + break; + } + break; + + case ArrowType__List: + { + kern_colmeta *smeta; + uint32_t *offset; + + if (cmeta->num_subattrs != 1 || + cmeta->idx_subattrs < kds->ncols || + cmeta->idx_subattrs >= kds->nr_colmeta) + elog(ERROR, "Bug? corrupted kernel column metadata"); + if (sizeof(uint32_t) * (index+2) > __kds_unpack(cmeta->values_length)) + elog(ERROR, "Bug? array index is out of range"); + smeta = &kds->colmeta[cmeta->idx_subattrs]; + offset = (uint32_t *)((char *)kds + __kds_unpack(cmeta->values_offset)); + datum = pg_array_arrow_ref(kds, smeta, + offset[index], + offset[index+1]); + isnull = false; + } + break; - datum = PointerGetDatum(htup->t_data); - isnull = false; - } - else if (cmeta->atttypkind != TYPE_KIND__NULL) - { - /* anything else, except for unreferenced column */ - int i; + case ArrowType__LargeList: + { + kern_colmeta *smeta; + uint64_t *offset; + + if (cmeta->num_subattrs != 1 || + cmeta->idx_subattrs < kds->ncols || + cmeta->idx_subattrs >= kds->nr_colmeta) + elog(ERROR, "Bug? corrupted kernel column metadata"); + if (sizeof(uint64_t) * (index+2) > __kds_unpack(cmeta->values_length)) + elog(ERROR, "Bug? 
array index is out of range"); + smeta = &kds->colmeta[cmeta->idx_subattrs]; + offset = (uint64_t *)((char *)kds + __kds_unpack(cmeta->values_offset)); + datum = pg_array_arrow_ref(kds, smeta, + offset[index], + offset[index+1]); + isnull = false; + } + break; - switch (cmeta->atttypid) - { - case INT1OID: - datum = pg_int1_arrow_ref(kds, cmeta, index); - break; - case INT2OID: - case FLOAT2OID: - datum = pg_int2_arrow_ref(kds, cmeta, index); - break; - case INT4OID: - case FLOAT4OID: - datum = pg_int4_arrow_ref(kds, cmeta, index); - break; - case INT8OID: - case FLOAT8OID: - datum = pg_int8_arrow_ref(kds, cmeta, index); - break; - case TEXTOID: - case BYTEAOID: - datum = pg_varlena_arrow_ref(kds, cmeta, index); - break; - case BPCHAROID: - datum = pg_bpchar_arrow_ref(kds, cmeta, index); - break; - case BOOLOID: - datum = pg_bool_arrow_ref(kds, cmeta, index); - break; - case NUMERICOID: - datum = pg_numeric_arrow_ref(kds, cmeta, index); - break; - case DATEOID: - datum = pg_date_arrow_ref(kds, cmeta, index); - break; - case TIMEOID: - datum = pg_time_arrow_ref(kds, cmeta, index); - break; - case TIMESTAMPOID: - case TIMESTAMPTZOID: - datum = pg_timestamp_arrow_ref(kds, cmeta, index); - break; - case INTERVALOID: - datum = pg_interval_arrow_ref(kds, cmeta, index); - break; - case MACADDROID: - datum = pg_macaddr_arrow_ref(kds, cmeta, index); - break; - case INETOID: - datum = pg_inet_arrow_ref(kds, cmeta, index); - break; - default: - for (i=0; i < pgstrom_num_users_extra; i++) + case ArrowType__Struct: + { + TupleDesc tupdesc = lookup_rowtype_tupdesc(cmeta->atttypid, -1); + Datum *sub_values = alloca(sizeof(Datum) * tupdesc->natts); + bool *sub_isnull = alloca(sizeof(bool) * tupdesc->natts); + HeapTuple htup; + + if (tupdesc->natts != cmeta->num_subattrs) + elog(ERROR, "Struct definition is corrupted?"); + if (cmeta->idx_subattrs < kds->ncols || + cmeta->idx_subattrs + cmeta->num_subattrs > kds->nr_colmeta) + elog(ERROR, "Bug? strange kernel column metadata"); + for (int j=0; j < tupdesc->natts; j++) { - pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; + kern_colmeta *sub_meta = &kds->colmeta[cmeta->idx_subattrs + j]; - if (extra->arrow_datum_ref && - extra->arrow_datum_ref(kds, cmeta, index, &datum, &isnull)) - { - goto out; - } + pg_datum_arrow_ref(kds, sub_meta, index, + sub_values + j, + sub_isnull + j); } - elog(ERROR, "Bug?
unexpected datum type: %u", cmeta->atttypid); - break; - } - isnull = false; + htup = heap_form_tuple(tupdesc, sub_values, sub_isnull); + + ReleaseTupleDesc(tupdesc); + + datum = PointerGetDatum(htup->t_data); + isnull = false; + } + break; + default: + /* TODO: custom data type support here */ + elog(ERROR, "arrow_fdw: unknown or unsupported type"); } out: *p_datum = datum; @@ -4511,20 +3341,25 @@ pg_datum_arrow_ref(kern_data_store *kds, * KDS_fetch_tuple_arrow */ bool -KDS_fetch_tuple_arrow(TupleTableSlot *slot, +kds_arrow_fetch_tuple(TupleTableSlot *slot, kern_data_store *kds, - size_t index) + size_t index, + const Bitmapset *referenced) { - int j; + int j, k; if (index >= kds->nitems) return false; ExecStoreAllNullTuple(slot); - for (j=0; j < kds->ncols; j++) + for (k = bms_next_member(referenced, -1); + k >= 0; + k = bms_next_member(referenced, k)) { - kern_colmeta *cmeta = &kds->colmeta[j]; - - pg_datum_arrow_ref(kds, cmeta, + j = k + FirstLowInvalidHeapAttributeNumber - 1; + if (j < 0) + continue; + pg_datum_arrow_ref(kds, + &kds->colmeta[j], index, slot->tts_values + j, slot->tts_isnull + j); @@ -4532,1559 +3367,1151 @@ KDS_fetch_tuple_arrow(TupleTableSlot *slot, return true; } +/* ---------------------------------------------------------------- + * + * Executor callbacks + * + * ---------------------------------------------------------------- + */ + /* - * arrowFdwExtractFilesList + * __arrowFdwExecInit */ -static List * -__arrowFdwExtractFilesList(List *options_list, - int *p_parallel_nworkers, - bool *p_writable) -{ - ListCell *lc; - List *filesList = NIL; - char *dir_path = NULL; - char *dir_suffix = NULL; - int parallel_nworkers = -1; - bool writable = false; /* default: read-only */ +static ArrowFdwState * +__arrowFdwExecInit(ScanState *ss, + List *outer_quals, + const Bitmapset *outer_refs, + const Bitmapset **p_optimal_gpus, + const DpuStorageEntry **p_ds_entry) +{ + Relation frel = ss->ss_currentRelation; + TupleDesc tupdesc = RelationGetDescr(frel); + ForeignTable *ft = GetForeignTable(RelationGetRelid(frel)); + Bitmapset *referenced = NULL; + Bitmapset *stat_attrs = NULL; + Bitmapset *optimal_gpus = NULL; + const DpuStorageEntry *ds_entry = NULL; + bool whole_row_ref = false; + List *filesList; + List *af_states_list = NIL; + uint32_t rb_nrooms = 0; + uint32_t rb_nitems = 0; + ArrowFdwState *arrow_state; + ListCell *lc1, *lc2; - foreach (lc, options_list) + Assert(RelationIsArrowFdw(frel)); + /* expand 'referenced' if it has whole-row reference */ + if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, outer_refs)) + whole_row_ref = true; + for (int j=0; j < tupdesc->natts; j++) { - DefElem *defel = lfirst(lc); + Form_pg_attribute attr = TupleDescAttr(tupdesc, j); + int k = attr->attnum - FirstLowInvalidHeapAttributeNumber; - Assert(IsA(defel->arg, String)); - if (strcmp(defel->defname, "file") == 0) - { - char *temp = strVal(defel->arg); - filesList = lappend(filesList, makeString(pstrdup(temp))); - } - else if (strcmp(defel->defname, "files") == 0) - { - char *temp = pstrdup(strVal(defel->arg)); - char *saveptr; - char *tok, *pos; + if (attr->attisdropped) + continue; + if (whole_row_ref || bms_is_member(k, outer_refs)) + referenced = bms_add_member(referenced, k); + } - while ((tok = strtok_r(temp, ",", &saveptr)) != NULL) + /* setup ArrowFileState */ + filesList = arrowFdwExtractFilesList(ft->options, NULL); + foreach (lc1, filesList) + { + char *fname = strVal(lfirst(lc1)); + ArrowFileState *af_state; + + af_state = BuildArrowFileState(frel, fname, &stat_attrs); + 
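/* device routing: the set of GPUs that can directly read each file is intersected across files, and a DPU entry is kept only if every file resolves to the same one, so the chosen device can load all files of this foreign table */ +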
if (af_state) + { + rb_nrooms += list_length(af_state->rb_list); + if (p_optimal_gpus) { - while (isspace(*tok)) - tok++; - pos = tok + strlen(tok) - 1; - while (pos >= tok && isspace(*pos)) - *pos-- = '\0'; + const Bitmapset *__optimal_gpus = GetOptimalGpuForFile(fname); - filesList = lappend(filesList, makeString(pstrdup(tok))); + if (af_states_list == NIL) + optimal_gpus = bms_copy(__optimal_gpus); + else + optimal_gpus = bms_intersect(optimal_gpus, __optimal_gpus); + } + if (p_ds_entry) + { + const DpuStorageEntry *ds_temp; - temp = NULL; + if (af_states_list == NIL) + ds_entry = GetOptimalDpuForFile(fname, &af_state->dpu_path); + else if (ds_entry) + { + ds_temp = GetOptimalDpuForFile(fname, &af_state->dpu_path); + if (!DpuStorageEntryIsEqual(ds_entry, ds_temp)) + ds_entry = NULL; + } } + af_states_list = lappend(af_states_list, af_state); } - else if (strcmp(defel->defname, "dir") == 0) - { - dir_path = strVal(defel->arg); - } - else if (strcmp(defel->defname, "suffix") == 0) - { - dir_suffix = strVal(defel->arg); - } - else if (strcmp(defel->defname, "parallel_workers") == 0) - { - if (parallel_nworkers >= 0) - elog(ERROR, "'parallel_workers' appeared twice"); - parallel_nworkers = atoi(strVal(defel->arg)); - } - else if (strcmp(defel->defname, "writable") == 0) + } + + /* setup ArrowFdwState */ + arrow_state = palloc0(offsetof(ArrowFdwState, rb_states[rb_nrooms])); + arrow_state->referenced = referenced; + if (arrow_fdw_stats_hint_enabled) + arrow_state->stats_hint = execInitArrowStatsHint(ss, outer_quals, stat_attrs); + arrow_state->rbatch_index = &arrow_state->__rbatch_index_local; + arrow_state->rbatch_nload = &arrow_state->__rbatch_nload_local; + arrow_state->rbatch_nskip = &arrow_state->__rbatch_nskip_local; + initStringInfo(&arrow_state->chunk_buffer); + arrow_state->curr_filp = -1; + arrow_state->curr_kds = NULL; + arrow_state->curr_index = 0; + arrow_state->af_states_list = af_states_list; + foreach (lc1, af_states_list) + { + ArrowFileState *af_state = lfirst(lc1); + + foreach (lc2, af_state->rb_list) { - writable = defGetBoolean(defel); + RecordBatchState *rb_state = lfirst(lc2); + + arrow_state->rb_states[rb_nitems++] = rb_state; } - else - elog(ERROR, "arrow: unknown option (%s)", defel->defname); } - if (dir_suffix && !dir_path) - elog(ERROR, "arrow: cannot use 'suffix' option without 'dir'"); + Assert(rb_nrooms == rb_nitems); + arrow_state->rb_nitems = rb_nitems; + + if (p_optimal_gpus) + *p_optimal_gpus = optimal_gpus; + if (p_ds_entry) + *p_ds_entry = ds_entry; + + return arrow_state; +} + +/* + * pgstromArrowFdwExecInit + */ +bool +pgstromArrowFdwExecInit(pgstromTaskState *pts, + List *outer_quals, + const Bitmapset *outer_refs) +{ + Relation frel = pts->css.ss.ss_currentRelation; + ArrowFdwState *arrow_state = NULL; - if (writable) + if (RelationIsArrowFdw(frel)) { - if (dir_path) - elog(ERROR, "arrow: 'dir_path' and 'writable' options are exclusive"); - if (list_length(filesList) == 0) - elog(ERROR, "arrow: 'writable' needs a backend file specified by 'file' option"); - if (list_length(filesList) > 1) - elog(ERROR, "arrow: 'writable' cannot use multiple backend files"); + arrow_state = __arrowFdwExecInit(&pts->css.ss, + outer_quals, + outer_refs, + (pts->task_kind & DEVKIND__NVIDIA_GPU) != 0 + ? &pts->optimal_gpus : NULL, + (pts->task_kind & DEVKIND__NVIDIA_DPU) != 0 + ? 
&pts->ds_entry : NULL); } + pts->arrow_state = arrow_state; + return (pts->arrow_state != NULL); +} - if (dir_path) - { - struct dirent *dentry; - DIR *dir; - char *temp; +/* + * ArrowBeginForeignScan + */ +static void +ArrowBeginForeignScan(ForeignScanState *node, int eflags) +{ + ForeignScan *fscan = (ForeignScan *)node->ss.ps.plan; + Bitmapset *referenced = NULL; + ListCell *lc; - dir = AllocateDir(dir_path); - while ((dentry = ReadDir(dir, dir_path)) != NULL) - { - if (strcmp(dentry->d_name, ".") == 0 || - strcmp(dentry->d_name, "..") == 0) - continue; - if (dir_suffix) - { - int dlen = strlen(dentry->d_name); - int slen = strlen(dir_suffix); - int diff; + foreach (lc, fscan->fdw_private) + { + int k = lfirst_int(lc); - if (dlen < 2 + slen) - continue; - diff = dlen - slen; - if (dentry->d_name[diff-1] != '.' || - strcmp(dentry->d_name + diff, dir_suffix) != 0) - continue; - } - temp = psprintf("%s/%s", dir_path, dentry->d_name); - filesList = lappend(filesList, makeString(temp)); - } - FreeDir(dir); + referenced = bms_add_member(referenced, k); } + node->fdw_state = __arrowFdwExecInit(&node->ss, + fscan->scan.plan.qual, + referenced, + NULL, /* no GPU */ + NULL); /* no DPU */ +} - if (filesList == NIL) - elog(ERROR, "no files are configured on behalf of the arrow_fdw foreign table"); - foreach (lc, filesList) - { - const char *fname = strVal(lfirst(lc)); +/* + * ExecArrowScanChunk + */ +static inline RecordBatchState * +__arrowFdwNextRecordBatch(ArrowFdwState *arrow_state) +{ + RecordBatchState *rb_state; + uint32_t rb_index; - if (!writable) - { - if (access(fname, R_OK) != 0) - elog(ERROR, "unable to read '%s': %m", fname); - } - else +retry: + rb_index = pg_atomic_fetch_add_u32(arrow_state->rbatch_index, 1); + if (rb_index >= arrow_state->rb_nitems) + return NULL; /* no more chunks to load */ + rb_state = arrow_state->rb_states[rb_index]; + if (arrow_state->stats_hint) + { + if (execCheckArrowStatsHint(arrow_state->stats_hint, rb_state)) { - if (access(fname, R_OK | W_OK) != 0) - { - if (errno != ENOENT) - elog(ERROR, "unable to read/write '%s': %m", fname); - else - { - char *temp = pstrdup(fname); - char *dname = dirname(temp); - - if (access(dname, R_OK | W_OK | X_OK) != 0) - elog(ERROR, "unable to create '%s': %m", fname); - pfree(temp); - } - } + pg_atomic_fetch_add_u32(arrow_state->rbatch_nskip, 1); + goto retry; } + pg_atomic_fetch_add_u32(arrow_state->rbatch_nload, 1); } - /* other properties */ - if (p_parallel_nworkers) - *p_parallel_nworkers = parallel_nworkers; - if (p_writable) - *p_writable = writable; - - return filesList; + return rb_state; } -static List * -arrowFdwExtractFilesList(List *options_list) +/* + * pgstromScanChunkArrowFdw + */ +XpuCommand * +pgstromScanChunkArrowFdw(pgstromTaskState *pts, + struct iovec *xcmd_iov, int *xcmd_iovcnt) { - return __arrowFdwExtractFilesList(options_list, NULL, NULL); -} + ArrowFdwState *arrow_state = pts->arrow_state; + StringInfo chunk_buffer = &arrow_state->chunk_buffer; + RecordBatchState *rb_state; + ArrowFileState *af_state; + strom_io_vector *iovec; + XpuCommand *xcmd; + uint32_t kds_src_offset; + uint32_t kds_src_iovec; + uint32_t kds_src_pathname; + + rb_state = __arrowFdwNextRecordBatch(arrow_state); + if (!rb_state) + return NULL; + af_state = rb_state->af_state; + + /* XpuCommand header */ + resetStringInfo(chunk_buffer); + appendBinaryStringInfo(chunk_buffer, + pts->xcmd_buf.data, + pts->xcmd_buf.len); + /* kds_src + iovec */ + kds_src_offset = chunk_buffer->len; + iovec = 
arrowFdwLoadRecordBatch(pts->css.ss.ss_currentRelation, + arrow_state->referenced, + rb_state, + chunk_buffer); + kds_src_iovec = __appendBinaryStringInfo(chunk_buffer, + iovec, + offsetof(strom_io_vector, + ioc[iovec->nr_chunks])); + /* arrow filename */ + kds_src_pathname = chunk_buffer->len; + if (!pts->ds_entry) + appendStringInfoString(chunk_buffer, af_state->filename); + else + appendStringInfoString(chunk_buffer, af_state->dpu_path); + appendStringInfoChar(chunk_buffer, '\0'); + + /* assign offset of XpuCommand */ + xcmd = (XpuCommand *)chunk_buffer->data; + xcmd->length = chunk_buffer->len; + xcmd->u.task.kds_src_pathname = kds_src_pathname; + xcmd->u.task.kds_src_iovec = kds_src_iovec; + xcmd->u.task.kds_src_offset = kds_src_offset; + xcmd_iov->iov_base = xcmd; + xcmd_iov->iov_len = xcmd->length; + *xcmd_iovcnt = 1; + + return xcmd; +} /* - * validator of Arrow_Fdw + * ArrowIterateForeignScan */ -Datum -pgstrom_arrow_fdw_validator(PG_FUNCTION_ARGS) +static TupleTableSlot * +ArrowIterateForeignScan(ForeignScanState *node) { - List *options_list = untransformRelOptions(PG_GETARG_DATUM(0)); - Oid catalog = PG_GETARG_OID(1); + ArrowFdwState *arrow_state = node->fdw_state; + TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; + kern_data_store *kds; - if (catalog == ForeignTableRelationId) + while ((kds = arrow_state->curr_kds) == NULL || + arrow_state->curr_index >= kds->nitems) { - List *filesList; - ListCell *lc; + RecordBatchState *rb_state; - filesList = arrowFdwExtractFilesList(options_list); - foreach (lc, filesList) - { - ArrowFileInfo af_info; - const char *fname = strVal(lfirst(lc)); + arrow_state->curr_index = 0; + arrow_state->curr_kds = NULL; + rb_state = __arrowFdwNextRecordBatch(arrow_state); + if (!rb_state) + return NULL; + arrow_state->curr_kds + = arrowFdwFillupRecordBatch(node->ss.ss_currentRelation, + arrow_state->referenced, + rb_state, + &arrow_state->chunk_buffer); + } + Assert(kds && arrow_state->curr_index < kds->nitems); + if (kds_arrow_fetch_tuple(slot, kds, + arrow_state->curr_index++, + arrow_state->referenced)) + return slot; + return NULL; +} - readArrowFile(fname, &af_info, true); - } - } - else if (options_list != NIL) - { - const char *label; - char temp[80]; +/* + * ArrowReScanForeignScan + */ +void +pgstromArrowFdwExecReset(ArrowFdwState *arrow_state) +{ + pg_atomic_write_u32(arrow_state->rbatch_index, 0); + if (arrow_state->curr_kds) + pfree(arrow_state->curr_kds); + arrow_state->curr_kds = NULL; + arrow_state->curr_index = 0; +} - switch (catalog) - { - case ForeignDataWrapperRelationId: - label = "FOREIGN DATA WRAPPER"; - break; - case ForeignServerRelationId: - label = "SERVER"; - break; - case UserMappingRelationId: - label = "USER MAPPING"; - break; - case AttributeRelationId: - label = "attribute of FOREIGN TABLE"; - break; - default: - snprintf(temp, sizeof(temp), - "[unexpected object catalog=%u]", catalog); - label = temp; - break; - } - elog(ERROR, "Arrow_Fdw does not support any options for %s", label); - } - PG_RETURN_VOID(); +static void +ArrowReScanForeignScan(ForeignScanState *node) +{ + pgstromArrowFdwExecReset(node->fdw_state); +} + +/* + * ExecEndArrowScan + */ +void +pgstromArrowFdwExecEnd(ArrowFdwState *arrow_state) +{ + if (arrow_state->curr_filp >= 0) + FileClose(arrow_state->curr_filp); + if (arrow_state->stats_hint) + execEndArrowStatsHint(arrow_state->stats_hint); +} + +static void +ArrowEndForeignScan(ForeignScanState *node) +{ + pgstromArrowFdwExecEnd(node->fdw_state); } -PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_validator); /* 
- * pgstrom_arrow_fdw_precheck_schema + * ArrowIsForeignScanParallelSafe */ -static void -arrow_fdw_precheck_schema(Relation rel) +static bool +ArrowIsForeignScanParallelSafe(PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte) { - TupleDesc tupdesc = RelationGetDescr(rel); - ForeignTable *ft = GetForeignTable(RelationGetRelid(rel)); - List *filesList; - ListCell *lc; - bool writable; -#if 0 - int j; - - /* check schema definition is supported by Apache Arrow */ - for (j=0; j < tupdesc->natts; j++) - { - Form_pg_attribute attr = tupleDescAttr(tupdesc, j); - - if (!arrowTypeIsConvertible(attr->atttypid, - attr->atttypmod)) - elog(ERROR, "column %s of foreign table %s has %s type that is not convertible any supported Apache Arrow types", - NameStr(attr->attname), - RelationGetRelationName(rel), - format_type_be(attr->atttypid)); - } -#endif - filesList = __arrowFdwExtractFilesList(ft->options, - NULL, - &writable); - foreach (lc, filesList) - { - const char *fname = strVal(lfirst(lc)); - File filp; - List *rb_cached = NIL; - ListCell *cell; + return true; +} - filp = PathNameOpenFile(fname, O_RDONLY | PG_BINARY); - if (filp < 0) - { - if (writable && errno == ENOENT) - continue; - elog(ERROR, "failed to open '%s' on behalf of '%s': %m", - fname, RelationGetRelationName(rel)); - } - /* check schema compatibility */ - rb_cached = arrowLookupOrBuildMetadataCache(filp, NULL); - foreach (cell, rb_cached) - { - RecordBatchState *rb_state = lfirst(cell); +/* + * ArrowEstimateDSMForeignScan + */ +static Size +ArrowEstimateDSMForeignScan(ForeignScanState *node, + ParallelContext *pcxt) +{ + return offsetof(pgstromSharedState, inners); +} - if (!arrowSchemaCompatibilityCheck(tupdesc, rb_state)) - elog(ERROR, "arrow file '%s' on behalf of the foreign table '%s' has incompatible schema definition", - fname, RelationGetRelationName(rel)); - } - list_free(rb_cached); - } +/* + * ArrowInitializeDSMForeignScan + */ +void +pgstromArrowFdwInitDSM(ArrowFdwState *arrow_state, + pgstromSharedState *ps_state) +{ + arrow_state->rbatch_index = &ps_state->arrow_rbatch_index; + arrow_state->rbatch_nload = &ps_state->arrow_rbatch_nload; + arrow_state->rbatch_nskip = &ps_state->arrow_rbatch_nskip; } -Datum -pgstrom_arrow_fdw_precheck_schema(PG_FUNCTION_ARGS) +static void +ArrowInitializeDSMForeignScan(ForeignScanState *node, + ParallelContext *pcxt, + void *coordinate) { - EventTriggerData *trigdata; - - if (!CALLED_AS_EVENT_TRIGGER(fcinfo)) - elog(ERROR, "%s: must be called as EventTrigger", - __FUNCTION__); - trigdata = (EventTriggerData *) fcinfo->context; - if (strcmp(trigdata->event, "ddl_command_end") != 0) - elog(ERROR, "%s: must be called on ddl_command_end event", - __FUNCTION__); - if (strcmp(GetCommandTagName(trigdata->tag), - "CREATE FOREIGN TABLE") == 0) - { - CreateStmt *stmt = (CreateStmt *)trigdata->parsetree; - Relation rel; - - rel = relation_openrv_extended(stmt->relation, AccessShareLock, true); - if (!rel) - PG_RETURN_NULL(); - if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE && - GetFdwRoutineForRelation(rel, false) == &pgstrom_arrow_fdw_routine) - { - arrow_fdw_precheck_schema(rel); - } - relation_close(rel, AccessShareLock); - } - else if (strcmp(GetCommandTagName(trigdata->tag), - "ALTER FOREIGN TABLE") == 0 && - IsA(trigdata->parsetree, AlterTableStmt)) - { - AlterTableStmt *stmt = (AlterTableStmt *)trigdata->parsetree; - Relation rel; - ListCell *lc; - bool has_schema_change = false; - - rel = relation_openrv_extended(stmt->relation, AccessShareLock, true); - if (!rel) - 
PG_RETURN_NULL(); - if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE && - GetFdwRoutineForRelation(rel, false) == &pgstrom_arrow_fdw_routine) - { - foreach (lc, stmt->cmds) - { - AlterTableCmd *cmd = lfirst(lc); + pgstromSharedState *ps_state = (pgstromSharedState *)coordinate; - if (cmd->subtype == AT_AddColumn || - cmd->subtype == AT_DropColumn || - cmd->subtype == AT_AlterColumnType) - { - has_schema_change = true; - break; - } - } - if (has_schema_change) - arrow_fdw_precheck_schema(rel); - } - relation_close(rel, AccessShareLock); - } - PG_RETURN_NULL(); + memset(ps_state, 0, offsetof(pgstromSharedState, inners)); + pgstromArrowFdwInitDSM(node->fdw_state, ps_state); } -PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_precheck_schema); /* - * arrowInvalidateMetadataCache - * - * NOTE: caller must have lock_slots[] with EXCLUSIVE mode + * ArrowInitializeWorkerForeignScan */ -static uint64 -__arrowInvalidateMetadataCache(arrowMetadataCache *mcache, bool detach_lru) +void +pgstromArrowFdwAttachDSM(ArrowFdwState *arrow_state, + pgstromSharedState *ps_state) { - arrowMetadataCache *mtemp; - dlist_node *dnode; - uint64 released = 0; - - while (!dlist_is_empty(&mcache->siblings)) - { - dnode = dlist_pop_head_node(&mcache->siblings); - mtemp = dlist_container(arrowMetadataCache, chain, dnode); - Assert(dlist_is_empty(&mtemp->siblings) && - !mtemp->lru_chain.prev && !mtemp->lru_chain.next); - dlist_delete(&mtemp->chain); - released += MAXALIGN(offsetof(arrowMetadataCache, - fstate[mtemp->nfields])); - pfree(mtemp); - } - released += MAXALIGN(offsetof(arrowMetadataCache, - fstate[mcache->nfields])); - if (detach_lru) - { - SpinLockAcquire(&arrow_metadata_state->lru_lock); - dlist_delete(&mcache->lru_chain); - SpinLockRelease(&arrow_metadata_state->lru_lock); - } - dlist_delete(&mcache->chain); - pfree(mcache); - - return pg_atomic_sub_fetch_u64(&arrow_metadata_state->consumed, released); + arrow_state->rbatch_index = &ps_state->arrow_rbatch_index; + arrow_state->rbatch_nload = &ps_state->arrow_rbatch_nload; + arrow_state->rbatch_nskip = &ps_state->arrow_rbatch_nskip; } static void -arrowInvalidateMetadataCache(MetadataCacheKey *mkey, bool detach_lru) +ArrowInitializeWorkerForeignScan(ForeignScanState *node, + shm_toc *toc, + void *coordinate) { - dlist_mutable_iter miter; - int index = mkey->hash % ARROW_METADATA_HASH_NSLOTS; - - dlist_foreach_modify(miter, &arrow_metadata_state->hash_slots[index]) - { - arrowMetadataCache *mcache - = dlist_container(arrowMetadataCache, chain, miter.cur); + pgstromSharedState *ps_state = (pgstromSharedState *)coordinate; - if (mcache->stat_buf.st_dev == mkey->st_dev && - mcache->stat_buf.st_ino == mkey->st_ino) - { - elog(DEBUG2, "arrow_fdw: metadata cache invalidation for the file (st_dev=%lu/st_ino=%lu)", - mkey->st_dev, mkey->st_ino); - __arrowInvalidateMetadataCache(mcache, true); - } - } + pgstromArrowFdwAttachDSM(node->fdw_state, ps_state); } /* - * copyMetadataFieldCache - copy for nested structure + * ArrowShutdownForeignScan */ -static int -copyMetadataFieldCache(RecordBatchFieldState *dest_curr, - RecordBatchFieldState *dest_tail, - int nattrs, - RecordBatchFieldState *columns, - Bitmapset **p_stat_attrs) +void +pgstromArrowFdwShutdown(ArrowFdwState *arrow_state) { - RecordBatchFieldState *dest_next = dest_curr + nattrs; - int j, k, nslots = nattrs; + uint32 temp; - if (dest_next > dest_tail) - return -1; + temp = pg_atomic_read_u32(arrow_state->rbatch_index); + pg_atomic_write_u32(&arrow_state->__rbatch_index_local, temp); + arrow_state->rbatch_index = 
&arrow_state->__rbatch_index_local; - for (j=0; j < nattrs; j++) - { - RecordBatchFieldState *__dest = dest_curr + j; - RecordBatchFieldState *__orig = columns + j; + temp = pg_atomic_read_u32(arrow_state->rbatch_nload); + pg_atomic_write_u32(&arrow_state->__rbatch_nload_local, temp); + arrow_state->rbatch_nload = &arrow_state->__rbatch_nload_local; + + temp = pg_atomic_read_u32(arrow_state->rbatch_nskip); + pg_atomic_write_u32(&arrow_state->__rbatch_nskip_local, temp); + arrow_state->rbatch_nskip = &arrow_state->__rbatch_nskip_local; - memcpy(__dest, __orig, sizeof(RecordBatchFieldState)); - if (__dest->num_children == 0) - Assert(__dest->children == NULL); - else - { - __dest->children = dest_next; - k = copyMetadataFieldCache(dest_next, - dest_tail, - __orig->num_children, - __orig->children, - NULL); - if (k < 0) - return -1; - dest_next += k; - nslots += k; - } - if (p_stat_attrs && !__orig->stat_isnull) - *p_stat_attrs = bms_add_member(*p_stat_attrs, j+1); - } - return nslots; } -/* - * makeRecordBatchStateFromCache - * - setup RecordBatchState from arrowMetadataCache - */ -static RecordBatchState * -makeRecordBatchStateFromCache(arrowMetadataCache *mcache, - File fdesc, - Bitmapset **p_stat_attrs) +static void +ArrowShutdownForeignScan(ForeignScanState *node) { - RecordBatchState *rbstate; - - rbstate = palloc0(offsetof(RecordBatchState, - columns[mcache->nfields])); - rbstate->fdesc = fdesc; - memcpy(&rbstate->stat_buf, &mcache->stat_buf, sizeof(struct stat)); - rbstate->rb_index = mcache->rb_index; - rbstate->rb_offset = mcache->rb_offset; - rbstate->rb_length = mcache->rb_length; - rbstate->rb_nitems = mcache->rb_nitems; - rbstate->ncols = mcache->ncols; - copyMetadataFieldCache(rbstate->columns, - rbstate->columns + mcache->nfields, - mcache->ncols, - mcache->fstate, - p_stat_attrs); - return rbstate; + pgstromArrowFdwShutdown(node->fdw_state); } /* - * arrowReclaimMetadataCache + * ArrowExplainForeignScan */ -static void -arrowReclaimMetadataCache(void) +void +pgstromArrowFdwExplain(ArrowFdwState *arrow_state, + Relation frel, + ExplainState *es, + List *dcontext) { - arrowMetadataCache *mcache; - LWLock *lock = NULL; - dlist_node *dnode; - uint32 lru_hash; - uint32 lru_index; - uint64 consumed; - - consumed = pg_atomic_read_u64(&arrow_metadata_state->consumed); - if (consumed <= arrow_metadata_cache_size) - return; + TupleDesc tupdesc = RelationGetDescr(frel); + size_t *chunk_sz; + ListCell *lc1, *lc2; + int fcount = 0; + int j, k; + char label[100]; + StringInfoData buf; - SpinLockAcquire(&arrow_metadata_state->lru_lock); - if (dlist_is_empty(&arrow_metadata_state->lru_list)) + initStringInfo(&buf); + /* shows referenced columns */ + for (k = bms_next_member(arrow_state->referenced, -1); + k >= 0; + k = bms_next_member(arrow_state->referenced, k)) { - SpinLockRelease(&arrow_metadata_state->lru_lock); - return; - } - dnode = dlist_tail_node(&arrow_metadata_state->lru_list); - mcache = dlist_container(arrowMetadataCache, lru_chain, dnode); - lru_hash = mcache->hash; - SpinLockRelease(&arrow_metadata_state->lru_lock); - - do { - lru_index = lru_hash % ARROW_METADATA_HASH_NSLOTS; - lock = &arrow_metadata_state->lock_slots[lru_index]; - - LWLockAcquire(lock, LW_EXCLUSIVE); - SpinLockAcquire(&arrow_metadata_state->lru_lock); - if (dlist_is_empty(&arrow_metadata_state->lru_list)) - { - SpinLockRelease(&arrow_metadata_state->lru_lock); - LWLockRelease(lock); - break; - } - dnode = dlist_tail_node(&arrow_metadata_state->lru_list); - mcache = dlist_container(arrowMetadataCache, 
lru_chain, dnode); - if (mcache->hash == lru_hash) + j = k + FirstLowInvalidHeapAttributeNumber; + + if (j > 0) { - dlist_delete(&mcache->lru_chain); - memset(&mcache->lru_chain, 0, sizeof(dlist_node)); - SpinLockRelease(&arrow_metadata_state->lru_lock); - consumed = __arrowInvalidateMetadataCache(mcache, false); + Form_pg_attribute attr = TupleDescAttr(tupdesc, j-1); + const char *attname = NameStr(attr->attname); + + if (buf.len > 0) + appendStringInfoString(&buf, ", "); + appendStringInfoString(&buf, quote_identifier(attname)); } - else + } + ExplainPropertyText("referenced", buf.data, es); + + /* shows stats hint if any */ + if (arrow_state->stats_hint) + { + arrowStatsHint *stats_hint = arrow_state->stats_hint; + + resetStringInfo(&buf); + foreach (lc1, stats_hint->orig_quals) { - /* LRU-tail was referenced by someone, try again */ - lru_hash = mcache->hash; - SpinLockRelease(&arrow_metadata_state->lru_lock); - } - LWLockRelease(lock); - } while (consumed > arrow_metadata_cache_size); -} + Node *qual = lfirst(lc1); + char *temp; -/* - * __arrowBuildMetadataCache - * - * NOTE: caller must have exclusive lock on arrow_metadata_state->lock_slots[] - */ -static arrowMetadataCache * -__arrowBuildMetadataCache(List *rb_state_list, uint32 hash) -{ - arrowMetadataCache *mcache = NULL; - arrowMetadataCache *mtemp; - dlist_node *dnode; - Size sz, consumed = 0; - int nfields; - ListCell *lc; + temp = deparse_expression(qual, dcontext, es->verbose, false); + if (buf.len > 0) + appendStringInfoString(&buf, ", "); + appendStringInfoString(&buf, temp); + pfree(temp); + } + if (es->analyze) + appendStringInfo(&buf, " [loaded: %u, skipped: %u]", + pg_atomic_read_u32(arrow_state->rbatch_nload), + pg_atomic_read_u32(arrow_state->rbatch_nskip)); + ExplainPropertyText("Stats-Hint", buf.data, es); + } - foreach (lc, rb_state_list) + /* shows files on behalf of the foreign table */ + chunk_sz = alloca(sizeof(size_t) * tupdesc->natts); + memset(chunk_sz, 0, sizeof(size_t) * tupdesc->natts); + foreach (lc1, arrow_state->af_states_list) { - RecordBatchState *rbstate = lfirst(lc); - - if (!mcache) - nfields = RecordBatchFieldCount(rbstate); - else - Assert(nfields == RecordBatchFieldCount(rbstate)); + ArrowFileState *af_state = lfirst(lc1); + size_t total_sz = af_state->stat_buf.st_size; + size_t read_sz = 0; + size_t sz; - sz = offsetof(arrowMetadataCache, fstate[nfields]); - mtemp = MemoryContextAllocZero(TopSharedMemoryContext, sz); - if (!mtemp) + foreach (lc2, af_state->rb_list) { - /* !!out of memory!! 
*/ - if (mcache) + RecordBatchState *rb_state = lfirst(lc2); + + if (bms_is_member(-FirstLowInvalidHeapAttributeNumber, + arrow_state->referenced)) + { + /* whole-row reference */ + read_sz += rb_state->rb_length; + } + else { - while (!dlist_is_empty(&mcache->siblings)) + for (k = bms_next_member(arrow_state->referenced, -1); + k >= 0; + k = bms_next_member(arrow_state->referenced, k)) { - dnode = dlist_pop_head_node(&mcache->siblings); - mtemp = dlist_container(arrowMetadataCache, - chain, dnode); - pfree(mtemp); + j = k + FirstLowInvalidHeapAttributeNumber - 1; + if (j < 0 || j >= tupdesc->natts) + continue; + sz = __recordBatchFieldLength(&rb_state->fields[j]); + read_sz += sz; + chunk_sz[j] += sz; } - pfree(mcache); } - return NULL; } - dlist_init(&mtemp->siblings); - memcpy(&mtemp->stat_buf, &rbstate->stat_buf, sizeof(struct stat)); - mtemp->hash = hash; - mtemp->rb_index = rbstate->rb_index; - mtemp->rb_offset = rbstate->rb_offset; - mtemp->rb_length = rbstate->rb_length; - mtemp->rb_nitems = rbstate->rb_nitems; - mtemp->ncols = rbstate->ncols; - mtemp->nfields = - copyMetadataFieldCache(mtemp->fstate, - mtemp->fstate + nfields, - rbstate->ncols, - rbstate->columns, - NULL); - Assert(mtemp->nfields == nfields); - - if (!mcache) - mcache = mtemp; + /* file size and read size */ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + resetStringInfo(&buf); + appendStringInfo(&buf, "%s (read: %s, size: %s)", + af_state->filename, + format_bytesz(read_sz), + format_bytesz(total_sz)); + snprintf(label, sizeof(label), "file%d", fcount); + ExplainPropertyText(label, buf.data, es); + } else - dlist_push_tail(&mcache->siblings, &mtemp->chain); - consumed += MAXALIGN(sz); + { + snprintf(label, sizeof(label), "file%d", fcount); + ExplainPropertyText(label, af_state->filename, es); + + snprintf(label, sizeof(label), "file%d-read", fcount); + ExplainPropertyText(label, format_bytesz(read_sz), es); + + snprintf(label, sizeof(label), "file%d-size", fcount); + ExplainPropertyText(label, format_bytesz(total_sz), es); + } + fcount++; } - pg_atomic_add_fetch_u64(&arrow_metadata_state->consumed, consumed); - return mcache; + /* read-size per column (only verbose mode) */ + if (es->verbose && arrow_state->rb_nitems > 0 && + !bms_is_member(-FirstLowInvalidHeapAttributeNumber, + arrow_state->referenced)) + { + resetStringInfo(&buf); + for (k = bms_next_member(arrow_state->referenced, -1); + k >= 0; + k = bms_next_member(arrow_state->referenced, k)) + { + Form_pg_attribute attr; + + j = k + FirstLowInvalidHeapAttributeNumber - 1; + if (j < 0 || j >= tupdesc->natts) + continue; + attr = TupleDescAttr(tupdesc, j); + snprintf(label, sizeof(label), " %s", NameStr(attr->attname)); + ExplainPropertyText(label, format_bytesz(chunk_sz[j]), es); + } + } + pfree(buf.data); } +static void +ArrowExplainForeignScan(ForeignScanState *node, ExplainState *es) +{ + Relation frel = node->ss.ss_currentRelation; + List *dcontext; + + dcontext = set_deparse_context_plan(es->deparse_cxt, + node->ss.ps.plan, + NULL); + pgstromArrowFdwExplain(node->fdw_state, frel, es, dcontext); +} /* - * checkArrowRecordBatchIsVisible - * - * NOTE: It must be called under shared lock on lock_slots[] + * ArrowAnalyzeForeignTable */ -static bool -checkArrowRecordBatchIsVisible(RecordBatchState *rbstate, - dlist_head *mvcc_slot) +static int +RecordBatchAcquireSampleRows(Relation relation, + RecordBatchState *rb_state, + HeapTuple *rows, + int nsamples) { - dlist_iter iter; + TupleDesc tupdesc = RelationGetDescr(relation); + kern_data_store *kds; + 
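/* materialize the whole record-batch once, then draw the requested number of sample rows by random index (sampling with replacement) */ +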
Bitmapset *referenced = NULL; + StringInfoData buffer; + Datum *values; + bool *isnull; + int count; + uint32_t index; - dlist_foreach(iter, mvcc_slot) + /* ANALYZE needs to fetch all the attributes */ + referenced = bms_make_singleton(-FirstLowInvalidHeapAttributeNumber); + initStringInfo(&buffer); + kds = arrowFdwFillupRecordBatch(relation, + referenced, + rb_state, + &buffer); + values = alloca(sizeof(Datum) * tupdesc->natts); + isnull = alloca(sizeof(bool) * tupdesc->natts); + for (count = 0; count < nsamples; count++) { - arrowWriteMVCCLog *mvcc = dlist_container(arrowWriteMVCCLog, - chain, iter.cur); - if (mvcc->key.st_dev == rbstate->stat_buf.st_dev && - mvcc->key.st_ino == rbstate->stat_buf.st_ino && - mvcc->record_batch == rbstate->rb_index) + /* fetch a row randomly */ + index = (double)kds->nitems * drand48(); + Assert(index < kds->nitems); + + for (int j=0; j < kds->ncols; j++) { - if (TransactionIdIsCurrentTransactionId(mvcc->xid)) - return true; - else - return false; + kern_colmeta *cmeta = &kds->colmeta[j]; + + pg_datum_arrow_ref(kds, + cmeta, + index, + values + j, + isnull + j); } + rows[count] = heap_form_tuple(tupdesc, values, isnull); } - return true; + pfree(buffer.data); + + return count; } -/* - * arrowLookupOrBuildMetadataCache - */ -List * -arrowLookupOrBuildMetadataCache(File fdesc, Bitmapset **p_stat_attrs) +static int +ArrowAcquireSampleRows(Relation relation, + int elevel, + HeapTuple *rows, + int nrooms, + double *p_totalrows, + double *p_totaldeadrows) { - MetadataCacheKey key; - struct stat stat_buf; - uint32 index; - LWLock *lock; - dlist_head *hash_slot; - dlist_head *mvcc_slot; - dlist_iter iter1, iter2; - bool has_exclusive = false; - List *results = NIL; - - if (fstat(FileGetRawDesc(fdesc), &stat_buf) != 0) - elog(ERROR, "failed on fstat('%s'): %m", FilePathName(fdesc)); - - index = initMetadataCacheKey(&key, &stat_buf); - lock = &arrow_metadata_state->lock_slots[index]; - hash_slot = &arrow_metadata_state->hash_slots[index]; - mvcc_slot = &arrow_metadata_state->mvcc_slots[index]; - - LWLockAcquire(lock, LW_SHARED); -retry: - dlist_foreach(iter1, hash_slot) - { - arrowMetadataCache *mcache - = dlist_container(arrowMetadataCache, chain, iter1.cur); - if (mcache->stat_buf.st_dev == stat_buf.st_dev && - mcache->stat_buf.st_ino == stat_buf.st_ino) - { - RecordBatchState *rbstate; + ForeignTable *ft = GetForeignTable(RelationGetRelid(relation)); + List *filesList = arrowFdwExtractFilesList(ft->options, NULL); + List *rb_state_list = NIL; + ListCell *lc1, *lc2; + int64 total_nrows = 0; + int64 count_nrows = 0; + int nsamples_min = nrooms / 100; + int nitems = 0; - Assert(mcache->hash == key.hash); - if (timespec_comp(&mcache->stat_buf.st_mtim, - &stat_buf.st_mtim) < 0 || - timespec_comp(&mcache->stat_buf.st_ctim, - &stat_buf.st_ctim) < 0) - { - char buf1[80], buf2[80], buf3[80], buf4[80]; - char *tail; + foreach (lc1, filesList) + { + ArrowFileState *af_state; + char *fname = strVal(lfirst(lc1)); - if (!has_exclusive) - { - LWLockRelease(lock); - LWLockAcquire(lock, LW_EXCLUSIVE); - has_exclusive = true; - goto retry; - } - ctime_r(&mcache->stat_buf.st_mtime, buf1); - ctime_r(&mcache->stat_buf.st_ctime, buf2); - ctime_r(&stat_buf.st_mtime, buf3); - ctime_r(&stat_buf.st_ctime, buf4); - for (tail=buf1+strlen(buf1)-1; isspace(*tail); *tail--='\0'); - for (tail=buf2+strlen(buf2)-1; isspace(*tail); *tail--='\0'); - for (tail=buf3+strlen(buf3)-1; isspace(*tail); *tail--='\0'); - for (tail=buf4+strlen(buf4)-1; isspace(*tail); *tail--='\0'); - elog(DEBUG2, 
"arrow_fdw: metadata cache for '%s' (m:%s, c:%s) is older than the latest file (m:%s, c:%s), so invalidated", - FilePathName(fdesc), buf1, buf2, buf3, buf4); - __arrowInvalidateMetadataCache(mcache, true); - break; - } - /* - * Ok, arrow file metadata cache found and still valid - * - * NOTE: we currently support min/max statistics on the top- - * level variables only, not sub-field of the composite values. - */ - rbstate = makeRecordBatchStateFromCache(mcache, fdesc, - p_stat_attrs); - if (checkArrowRecordBatchIsVisible(rbstate, mvcc_slot)) - results = list_make1(rbstate); - dlist_foreach (iter2, &mcache->siblings) - { - arrowMetadataCache *__mcache - = dlist_container(arrowMetadataCache, chain, iter2.cur); - rbstate = makeRecordBatchStateFromCache(__mcache, fdesc, - p_stat_attrs); - if (checkArrowRecordBatchIsVisible(rbstate, mvcc_slot)) - results = lappend(results, rbstate); - } - SpinLockAcquire(&arrow_metadata_state->lru_lock); - dlist_move_head(&arrow_metadata_state->lru_list, - &mcache->lru_chain); - SpinLockRelease(&arrow_metadata_state->lru_lock); - LWLockRelease(lock); + af_state = BuildArrowFileState(relation, fname, NULL); + if (!af_state) + continue; + foreach (lc2, af_state->rb_list) + { + RecordBatchState *rb_state = lfirst(lc2); - return results; + if (rb_state->rb_nitems == 0) + continue; /* not reasonable to sample, skipped */ + total_nrows += rb_state->rb_nitems; + rb_state_list = lappend(rb_state_list, rb_state); } } + nrooms = Min(nrooms, total_nrows); - /* - * Hmm... no valid metadata cache was not found, so build a new entry - * under the exclusive lock on the arrow file. - */ - if (!has_exclusive) - { - LWLockRelease(lock); - LWLockAcquire(lock, LW_EXCLUSIVE); - has_exclusive = true; - goto retry; - } - else + /* fetch samples for each record-batch */ + foreach (lc1, rb_state_list) { - ArrowFileInfo af_info; - arrowMetadataCache *mcache; - arrowStatsBinary *arrow_bstats; - List *rb_state_any = NIL; - - readArrowFileDesc(FileGetRawDesc(fdesc), &af_info); - if (af_info.dictionaries != NULL) - elog(ERROR, "DictionaryBatch is not supported"); - Assert(af_info.footer._num_dictionaries == 0); - - if (af_info.recordBatches == NULL) - elog(DEBUG2, "arrow file '%s' contains no RecordBatch", - FilePathName(fdesc)); + RecordBatchState *rb_state = lfirst(lc1); + int nsamples; - arrow_bstats = buildArrowStatsBinary(&af_info.footer, p_stat_attrs); - for (index = 0; index < af_info.footer._num_recordBatches; index++) - { - RecordBatchState *rb_state; - ArrowBlock *block - = &af_info.footer.recordBatches[index]; - ArrowRecordBatch *rbatch - = &af_info.recordBatches[index].body.recordBatch; - - rb_state = makeRecordBatchState(&af_info.footer.schema, - block, rbatch); - rb_state->fdesc = fdesc; - memcpy(&rb_state->stat_buf, &stat_buf, sizeof(struct stat)); - rb_state->rb_index = index; - - if (arrow_bstats) - applyArrowStatsBinary(rb_state, arrow_bstats); - - if (checkArrowRecordBatchIsVisible(rb_state, mvcc_slot)) - results = lappend(results, rb_state); - rb_state_any = lappend(rb_state_any, rb_state); - } - releaseArrowStatsBinary(arrow_bstats); - /* try to build a metadata cache for further references */ - mcache = __arrowBuildMetadataCache(rb_state_any, key.hash); - if (mcache) - { - dlist_push_head(hash_slot, &mcache->chain); - SpinLockAcquire(&arrow_metadata_state->lru_lock); - dlist_push_head(&arrow_metadata_state->lru_list, - &mcache->lru_chain); - SpinLockRelease(&arrow_metadata_state->lru_lock); - } + count_nrows += rb_state->rb_nitems; + nsamples = (double)nrooms * 
((double)count_nrows / + (double)total_nrows) - nitems; + if (nitems + nsamples > nrooms) + nsamples = nrooms - nitems; + if (nsamples > nsamples_min) + nitems += RecordBatchAcquireSampleRows(relation, + rb_state, + rows + nitems, + nsamples); } - LWLockRelease(lock); - /* - * reclaim unreferenced metadata cache entries based on LRU, if shared- - * memory consumption exceeds the configured threshold. - */ - arrowReclaimMetadataCache(); + *p_totalrows = total_nrows; + *p_totaldeadrows = 0.0; - return results; + return nitems; } /* - * lookup_type_extension_info + * ArrowAnalyzeForeignTable */ -static void -lookup_type_extension_info(Oid type_oid, - const char **p_extname, - const char **p_extschema) +static bool +ArrowAnalyzeForeignTable(Relation frel, + AcquireSampleRowsFunc *p_sample_rows_func, + BlockNumber *p_totalpages) { - Oid ext_oid; - char *extname = NULL; - char *extschema = NULL; + ForeignTable *ft = GetForeignTable(RelationGetRelid(frel)); + List *filesList = arrowFdwExtractFilesList(ft->options, NULL); + ListCell *lc; + size_t totalpages = 0; - ext_oid = get_object_extension_oid(TypeRelationId, - type_oid, 0, true); - if (OidIsValid(ext_oid)) + foreach (lc, filesList) { - Relation rel; - SysScanDesc sscan; - ScanKeyData skey; - HeapTuple tup; + const char *fname = strVal(lfirst(lc)); + struct stat stat_buf; - rel = table_open(ExtensionRelationId, AccessShareLock); - ScanKeyInit(&skey, - Anum_pg_extension_oid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(ext_oid)); - sscan = systable_beginscan(rel, ExtensionOidIndexId, - true, NULL, 1, &skey); - tup = systable_getnext(sscan); - if (HeapTupleIsValid(tup)) + if (stat(fname, &stat_buf) != 0) { - Form_pg_extension __ext = (Form_pg_extension) GETSTRUCT(tup); - - extname = pstrdup(NameStr(__ext->extname)); - if (__ext->extrelocatable) - extschema = get_namespace_name(__ext->extnamespace); + elog(NOTICE, "failed on stat('%s') on behalf of '%s', skipped", + fname, get_rel_name(ft->relid)); + continue; } - systable_endscan(sscan); - table_close(rel, AccessShareLock); + totalpages += (stat_buf.st_size + BLCKSZ - 1) / BLCKSZ; } - *p_extname = extname; - *p_extschema = extschema; + if (totalpages > MaxBlockNumber) + totalpages = MaxBlockNumber; + + *p_sample_rows_func = ArrowAcquireSampleRows; + *p_totalpages = totalpages; + + return true; } /* - * setupArrowSQLbufferSchema + * ArrowImportForeignSchema */ -static void -__setupArrowSQLbufferField(SQLtable *table, - SQLfield *column, - const char *attname, - Oid atttypid, - int32 atttypmod, - ArrowField *afield) +static List * +ArrowImportForeignSchema(ImportForeignSchemaStmt *stmt, Oid serverOid) { - HeapTuple tup; - Form_pg_type __type; - const char *typname; - const char *typnamespace; - const char *timezone = show_timezone(); - const char *extname; - const char *extschema; - SQLstat *stat_list; - - /* walk down to the base type, if domain */ - for (;;) + ArrowSchema schema; + List *filesList; + ListCell *lc; + int j; + StringInfoData cmd; + + /* sanity checks */ + switch (stmt->list_type) { - tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(atttypid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for type: %u", atttypid); - __type = (Form_pg_type) GETSTRUCT(tup); - if (__type->typtype != TYPTYPE_DOMAIN) + case FDW_IMPORT_SCHEMA_ALL: + break; + case FDW_IMPORT_SCHEMA_LIMIT_TO: + elog(ERROR, "arrow_fdw does not support LIMIT TO clause"); + break; + case FDW_IMPORT_SCHEMA_EXCEPT: + elog(ERROR, "arrow_fdw does not support EXCEPT clause"); + break; + default: + 
elog(ERROR, "arrow_fdw: Bug? unknown list-type"); break; - atttypid = __type->typbasetype; - atttypmod = __type->typtypmod; - ReleaseSysCache(tup); } - typname = NameStr(__type->typname); - typnamespace = get_namespace_name(__type->typnamespace); - lookup_type_extension_info(atttypid, - &extname, - &extschema); - table->numFieldNodes++; - table->numBuffers += - assignArrowTypePgSQL(column, - attname, - atttypid, - atttypmod, - typname, - typnamespace, - __type->typlen, - __type->typbyval, - __type->typtype, - __type->typalign, - __type->typrelid, - __type->typelem, - timezone, - extname, - extschema, - afield); - /* assign existing min/max statistics, if any */ - if (afield) + filesList = arrowFdwExtractFilesList(stmt->options, NULL); + if (filesList == NIL) + ereport(ERROR, + (errmsg("No valid apache arrow files are specified"), + errhint("Use 'file' or 'dir' option to specify apache arrow files on behalf of the foreign table"))); + + /* read the schema */ + memset(&schema, 0, sizeof(ArrowSchema)); + foreach (lc, filesList) { - stat_list = __buildArrowFieldStatsList(afield, table->numRecordBatches); - if (stat_list) + ArrowFileInfo af_info; + const char *fname = strVal(lfirst(lc)); + + readArrowFile(fname, &af_info, false); + if (lc == list_head(filesList)) { - column->stat_list = stat_list; - column->stat_enabled = true; - table->has_statistics = true; + copyArrowNode(&schema.node, &af_info.footer.schema.node); } - } - - if (OidIsValid(__type->typelem) && __type->typlen == -1) - { - /* array type */ - char elem_name[NAMEDATALEN+10]; - ArrowField *__afield = NULL; - - snprintf(elem_name, sizeof(elem_name), "_%s[]", attname); - column->element = palloc0(sizeof(SQLfield)); - if (afield) + else { - if (afield->_num_children != 1) - elog(ERROR, "Arrow::Field (%s) is not compatible", afield->name); - __afield = &afield->children[0]; + /* compatibility checks */ + ArrowSchema *stemp = &af_info.footer.schema; + + if (schema.endianness != stemp->endianness || + schema._num_fields != stemp->_num_fields) + elog(ERROR, "file '%s' has incompatible schema definition", fname); + for (j=0; j < schema._num_fields; j++) + { + if (!arrowFieldTypeIsEqual(&schema.fields[j], + &stemp->fields[j])) + elog(ERROR, "file '%s' has incompatible schema definition", fname); + } } - __setupArrowSQLbufferField(table, - column->element, - elem_name, - __type->typelem, - -1, - __afield); } - else if (OidIsValid(__type->typrelid)) - { - /* composite type */ - TupleDesc tupdesc = lookup_rowtype_tupdesc(atttypid, atttypmod); - int j; - if (afield && afield->_num_children != tupdesc->natts) - elog(ERROR, "Arrow::Field (%s) is not compatible", afield->name); - - column->nfields = tupdesc->natts; - column->subfields = palloc0(sizeof(SQLfield) * tupdesc->natts); - for (j=0; j < tupdesc->natts; j++) + /* makes a command to define foreign table */ + initStringInfo(&cmd); + appendStringInfo(&cmd, "CREATE FOREIGN TABLE %s (\n", + quote_identifier(stmt->remote_schema)); + for (j=0; j < schema._num_fields; j++) + { + ArrowField *field = &schema.fields[j]; + Oid type_oid; + int32 type_mod; + char *schema; + HeapTuple htup; + Form_pg_type __type; + + __arrowFieldTypeToPGType(field, &type_oid, &type_mod, NULL); + if (!OidIsValid(type_oid)) + elog(ERROR, "unable to map Arrow type on any PG type"); + htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type_oid)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for type %u", type_oid); + __type = (Form_pg_type) GETSTRUCT(htup); + schema = 
get_namespace_name(__type->typnamespace);
+		if (!schema)
+			elog(ERROR, "cache lookup failed for schema %u", __type->typnamespace);
+		if (j > 0)
+			appendStringInfo(&cmd, ",\n");
+		if (type_mod < 0)
 		{
-			Form_pg_attribute sattr = tupleDescAttr(tupdesc, j);
-			ArrowField *__afield = NULL;
-
-			if (afield)
-				__afield = &afield->children[j];
-			__setupArrowSQLbufferField(table,
-									   &column->subfields[j],
-									   NameStr(sattr->attname),
-									   sattr->atttypid,
-									   sattr->atttypmod,
-									   __afield);
+			appendStringInfo(&cmd, " %s %s.%s",
+							 quote_identifier(field->name),
+							 quote_identifier(schema),
+							 NameStr(__type->typname));
 		}
-		ReleaseTupleDesc(tupdesc);
+		else
+		{
+			Assert(type_mod >= VARHDRSZ);
+			appendStringInfo(&cmd, " %s %s.%s(%d)",
+							 quote_identifier(field->name),
+							 quote_identifier(schema),
+							 NameStr(__type->typname),
+							 type_mod - VARHDRSZ);
+		}
+		ReleaseSysCache(htup);
 	}
-	else if (__type->typtype == 'e')
+	appendStringInfo(&cmd,
+					 "\n"
+					 ") SERVER %s\n"
+					 " OPTIONS (", stmt->server_name);
+	foreach (lc, stmt->options)
 	{
-		elog(ERROR, "Enum type is not supported right now");
+		DefElem *defel = lfirst(lc);
+
+		if (lc != list_head(stmt->options))
+			appendStringInfo(&cmd, ",\n ");
+		appendStringInfo(&cmd, "%s '%s'",
+						 defel->defname,
+						 strVal(defel->arg));
 	}
-	ReleaseSysCache(tup);
+	appendStringInfo(&cmd, ")");
+
+	return list_make1(cmd.data);
 }

+/*
+ * pgstrom_arrow_fdw_import_file
+ *
+ * NOTE: For historical reasons, PostgreSQL does not allow more columns
+ * than MaxHeapAttributeNumber (1600) to be defined, on foreign tables
+ * as well as on heap tables. This restriction comes from the NULL-bitmap
+ * length in HeapTupleHeaderData and the width of t_hoff.
+ * However, it is not a reasonable restriction for foreign tables, because
+ * they do not use the heap format internally.
+ */
 static void
-setupArrowSQLbufferSchema(SQLtable *table, TupleDesc tupdesc,
-						  ArrowFileInfo *af_info)
+__insertPgAttributeTuple(Relation pg_attr_rel,
+						 CatalogIndexState pg_attr_index,
+						 Oid ftable_oid,
+						 AttrNumber attnum,
+						 ArrowField *field)
 {
-	int		j;
+	Oid			type_oid;
+	int32		type_mod;
+	int16		type_len;
+	bool		type_byval;
+	char		type_align;
+	int32		type_ndims;
+	char		type_storage;
+	Datum		values[Natts_pg_attribute];
+	bool		isnull[Natts_pg_attribute];
+	HeapTuple	tup;
+	ObjectAddress myself, referenced;

-	Assert(!af_info || af_info->footer.schema._num_fields == tupdesc->natts);
-	table->nfields = tupdesc->natts;
-	for (j=0; j < tupdesc->natts; j++)
-	{
-		Form_pg_attribute attr = tupleDescAttr(tupdesc, j);
-		ArrowField *afield = NULL;
-
-		if (af_info)
-			afield = &af_info->footer.schema.fields[j];
-		__setupArrowSQLbufferField(table,
-								   &table->columns[j],
-								   NameStr(attr->attname),
-								   attr->atttypid,
-								   attr->atttypmod,
-								   afield);
-	}
-	table->segment_sz = (size_t)arrow_record_batch_size_kb << 10;
+	__arrowFieldTypeToPGType(field, &type_oid, &type_mod, NULL);
+	get_typlenbyvalalign(type_oid,
+						 &type_len,
+						 &type_byval,
+						 &type_align);
+	type_ndims = (type_is_array(type_oid) ? 1 : 0);
+	type_storage = get_typstorage(type_oid);
+
+	memset(values, 0, sizeof(values));
+	memset(isnull, 0, sizeof(isnull));
+
+	values[Anum_pg_attribute_attrelid - 1] = ObjectIdGetDatum(ftable_oid);
+	values[Anum_pg_attribute_attname - 1] = CStringGetDatum(field->name);
+	values[Anum_pg_attribute_atttypid - 1] = ObjectIdGetDatum(type_oid);
+	values[Anum_pg_attribute_attstattarget - 1] = Int32GetDatum(-1);
+	values[Anum_pg_attribute_attlen - 1] = Int16GetDatum(type_len);
+	values[Anum_pg_attribute_attnum - 1] = Int16GetDatum(attnum);
+	values[Anum_pg_attribute_attndims - 1] = Int32GetDatum(type_ndims);
+	values[Anum_pg_attribute_attcacheoff - 1] = Int32GetDatum(-1);
+	values[Anum_pg_attribute_atttypmod - 1] = Int32GetDatum(type_mod);
+	values[Anum_pg_attribute_attbyval - 1] = BoolGetDatum(type_byval);
+	values[Anum_pg_attribute_attstorage - 1] = CharGetDatum(type_storage);
+	values[Anum_pg_attribute_attalign - 1] = CharGetDatum(type_align);
+	values[Anum_pg_attribute_attnotnull - 1] = BoolGetDatum(!field->nullable);
+	values[Anum_pg_attribute_attislocal - 1] = BoolGetDatum(true);
+	isnull[Anum_pg_attribute_attacl - 1] = true;
+	isnull[Anum_pg_attribute_attoptions - 1] = true;
+	isnull[Anum_pg_attribute_attfdwoptions - 1] = true;
+	isnull[Anum_pg_attribute_attmissingval - 1] = true;
+
+	tup = heap_form_tuple(RelationGetDescr(pg_attr_rel), values, isnull);
+	CatalogTupleInsertWithInfo(pg_attr_rel, tup, pg_attr_index);
+
+	/* add dependency */
+	myself.classId = RelationRelationId;
+	myself.objectId = ftable_oid;
+	myself.objectSubId = attnum;
+	referenced.classId = TypeRelationId;
+	referenced.objectId = type_oid;
+	referenced.objectSubId = 0;
+	recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
+
+	heap_freetuple(tup);
 }

-static void
-setupArrowSQLbufferBatches(SQLtable *table, ArrowFileInfo *af_info)
+Datum
+pgstrom_arrow_fdw_import_file(PG_FUNCTION_ARGS)
 {
-	loff_t		pos = 0;
-	int			i, nitems;
+	CreateForeignTableStmt stmt;
+	ArrowSchema	schema;
+	List	   *tableElts = NIL;
+	char	   *ftable_name;
+	char	   *file_name;
+	char	   *namespace_name;
+	DefElem	   *defel;
+	int			j, nfields;
+	Oid			ftable_oid;
+	ObjectAddress myself;
+	ArrowFileInfo af_info;

-	/* restore DictionaryBatches already in the file */
-	nitems = af_info->footer._num_dictionaries;
-	table->numDictionaries = nitems;
-	if (nitems > 0)
-	{
-		table->dictionaries = palloc(sizeof(ArrowBlock) * nitems);
-		memcpy(table->dictionaries,
-			   af_info->footer.dictionaries,
-			   sizeof(ArrowBlock) * nitems);
-		for (i=0; i < nitems; i++)
-		{
-			ArrowBlock *block = &table->dictionaries[i];
+	/* read schema of the file */
+	if (PG_ARGISNULL(0))
+		elog(ERROR, "foreign table name is not supplied");
+	ftable_name = text_to_cstring(PG_GETARG_TEXT_PP(0));

-			pos = Max(pos, ARROWALIGN(block->offset +
-									  block->metaDataLength +
-									  block->bodyLength));
-		}
-	}
+	if (PG_ARGISNULL(1))
+		elog(ERROR, "arrow filename is not supplied");
+	file_name = text_to_cstring(PG_GETARG_TEXT_PP(1));
+	defel = makeDefElem("file", (Node *)makeString(file_name), -1);
+
+	if (PG_ARGISNULL(2))
+		namespace_name = NULL;
 	else
-		table->dictionaries = NULL;
+		namespace_name = text_to_cstring(PG_GETARG_TEXT_PP(2));
+
+	readArrowFile(file_name, &af_info, false);
+	copyArrowNode(&schema.node, &af_info.footer.schema.node);
+	if (schema._num_fields > SHRT_MAX)
+		Elog("Arrow file '%s' has too many fields: %d",
+			 file_name, schema._num_fields);
+
+	/* setup CreateForeignTableStmt */
+	memset(&stmt, 0, sizeof(CreateForeignTableStmt));
+	NodeSetTag(&stmt, T_CreateForeignTableStmt);
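+	/*
+	 * NOTE: the foreign table is built in two steps. DefineRelation()
+	 * creates it with the first Min(_num_fields, 100) columns only,
+	 * then the remaining columns are appended by direct injection of
+	 * pg_attribute tuples (__insertPgAttributeTuple above), bypassing
+	 * the MaxHeapAttributeNumber check on the regular DDL path.
+	 * Usage image (the table/file names below are just an example):
+	 *   SELECT pgstrom_arrow_fdw_import_file('f_mytable',
+	 *                                        '/path/to/mytable.arrow');
+	 */
+	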
stmt.base.relation = makeRangeVar(namespace_name, ftable_name, -1); - /* restore RecordBatches already in the file */ - nitems = af_info->footer._num_recordBatches; - table->numRecordBatches = nitems; - if (nitems > 0) + nfields = Min(schema._num_fields, 100); + for (j=0; j < nfields; j++) { - table->recordBatches = palloc(sizeof(ArrowBlock) * nitems); - memcpy(table->recordBatches, - af_info->footer.recordBatches, - sizeof(ArrowBlock) * nitems); - for (i=0; i < nitems; i++) - { - ArrowBlock *block = &table->recordBatches[i]; + ColumnDef *cdef; + Oid type_oid; + int32_t type_mod; - pos = Max(pos, ARROWALIGN(block->offset + - block->metaDataLength + - block->bodyLength)); - } + __arrowFieldTypeToPGType(&schema.fields[j], + &type_oid, + &type_mod, + NULL); + cdef = makeColumnDef(schema.fields[j].name, + type_oid, + type_mod, + InvalidOid); + tableElts = lappend(tableElts, cdef); } - else - table->recordBatches = NULL; - - if (lseek(table->fdesc, pos, SEEK_SET) < 0) - elog(ERROR, "failed on lseek('%s',%lu): %m", - table->filename, pos); - table->f_pos = pos; -} - -/* - * createArrowWriteRedoLog - */ -static loff_t -createArrowWriteRedoLog(File filp, bool is_newfile) -{ - arrowWriteRedoLog *redo; - int fdesc = FileGetRawDesc(filp); - const char *fname = FilePathName(filp); - TransactionId curr_xid = GetCurrentTransactionId(); - CommandId curr_cid = GetCurrentCommandId(true); - dlist_iter iter; - MetadataCacheKey key; - struct stat stat_buf; - size_t main_sz; + stmt.base.tableElts = tableElts; + stmt.base.oncommit = ONCOMMIT_NOOP; + stmt.servername = "arrow_fdw"; + stmt.options = list_make1(defel); - if (fstat(fdesc, &stat_buf) != 0) - elog(ERROR, "failed on fstat(2): %m"); - initMetadataCacheKey(&key, &stat_buf); + myself = DefineRelation(&stmt.base, + RELKIND_FOREIGN_TABLE, + InvalidOid, + NULL, + __FUNCTION__); + ftable_oid = myself.objectId; + CreateForeignTable(&stmt, ftable_oid); - dlist_foreach(iter, &arrow_write_redo_list) + if (nfields < schema._num_fields) { - redo = dlist_container(arrowWriteRedoLog, chain, iter.cur); + Relation c_rel = table_open(RelationRelationId, RowExclusiveLock); + Relation a_rel = table_open(AttributeRelationId, RowExclusiveLock); + CatalogIndexState c_index = CatalogOpenIndexes(c_rel); + CatalogIndexState a_index = CatalogOpenIndexes(a_rel); + HeapTuple tup; - if (redo->key.st_dev == key.st_dev && - redo->key.st_ino == key.st_ino && - redo->xid == curr_xid && - redo->cid <= curr_cid) - { - elog(ERROR, "Why? 
'%s' on behalf of arrow_fdw foreign-table is concurrently opened for update, please confirm the configuration", fname); - } - } + tup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(ftable_oid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for relation %u", ftable_oid); - if (is_newfile) - { - main_sz = MAXALIGN(offsetof(arrowWriteRedoLog, footer_backup)); - redo = MemoryContextAllocZero(CacheMemoryContext, - main_sz + strlen(fname) + 1); - memcpy(&redo->key, &key, sizeof(MetadataCacheKey)); - redo->xid = curr_xid; - redo->cid = curr_cid; - redo->pathname = (char *)redo + main_sz; - strcpy(redo->pathname, fname); - redo->is_truncate = false; - redo->footer_offset = 0; - redo->footer_length = 0; - } - else - { - ssize_t nbytes; - off_t offset; - char temp[100]; - - /* make backup image of the Footer section */ - nbytes = sizeof(int32) + 6; /* = strlen("ARROW1") */ - offset = stat_buf.st_size - nbytes; - if (__preadFile(fdesc, temp, nbytes, offset) != nbytes) - elog(ERROR, "failed on pread(2): %m"); - offset -= *((int32 *)temp); - - nbytes = stat_buf.st_size - offset; - if (nbytes <= 0) - elog(ERROR, "strange apache arrow format"); - main_sz = MAXALIGN(offsetof(arrowWriteRedoLog, - footer_backup[nbytes])); - redo = MemoryContextAllocZero(CacheMemoryContext, - main_sz + strlen(fname) + 1); - memcpy(&redo->key, &key, sizeof(MetadataCacheKey)); - redo->xid = curr_xid; - redo->cid = curr_cid; - redo->pathname = (char *)redo + main_sz; - strcpy(redo->pathname, fname); - redo->is_truncate = false; - PG_TRY(); - { - if (__preadFile(fdesc, redo->footer_backup, nbytes, offset) != nbytes) - elog(ERROR, "failed on pread(2): %m"); - if (lseek(fdesc, offset, SEEK_SET) < 0) - elog(ERROR, "failed on lseek(2): %m"); - redo->footer_offset = offset; - redo->footer_length = nbytes; - } - PG_CATCH(); + for (j=nfields; j < schema._num_fields; j++) { - pfree(redo); - PG_RE_THROW(); + __insertPgAttributeTuple(a_rel, + a_index, + ftable_oid, + j+1, + &schema.fields[j]); } - PG_END_TRY(); - } - elog(DEBUG2, "arrow: redo-log on '%s' (st_dev=%u/st_ino=%u) xid=%u cid=%u offset=%lu length=%zu", - redo->pathname, (uint32)redo->key.st_dev, (uint32)redo->key.st_ino, - (uint32)redo->xid, (uint32)redo->cid, - (uint64)redo->footer_offset, - (uint64)redo->footer_length); + /* update relnatts also */ + ((Form_pg_class) GETSTRUCT(tup))->relnatts = schema._num_fields; + CatalogTupleUpdate(c_rel, &tup->t_self, tup); - dlist_push_head(&arrow_write_redo_list, &redo->chain); + CatalogCloseIndexes(a_index); + CatalogCloseIndexes(c_index); + table_close(a_rel, RowExclusiveLock); + table_close(c_rel, RowExclusiveLock); - return redo->footer_offset; + CommandCounterIncrement(); + } + PG_RETURN_VOID(); } /* - * writeOutArrowRecordBatch + * handler of Arrow_Fdw */ -static void -writeOutArrowRecordBatch(arrowWriteState *aw_state, bool with_footer) +Datum +pgstrom_arrow_fdw_handler(PG_FUNCTION_ARGS) { - SQLtable *table = &aw_state->sql_table; - int index = aw_state->hash % ARROW_METADATA_HASH_NSLOTS; - arrowWriteMVCCLog *mvcc = NULL; - - if (table->nitems > 0) - { - mvcc = MemoryContextAllocZero(TopSharedMemoryContext, - sizeof(arrowWriteMVCCLog)); - memcpy(&mvcc->key, &aw_state->key, sizeof(MetadataCacheKey)); - mvcc->xid = GetCurrentTransactionId(); - mvcc->cid = GetCurrentCommandId(true); - } - - PG_TRY(); - { - LWLockAcquire(&arrow_metadata_state->lock_slots[index], - LW_EXCLUSIVE); - /* write out an empty arrow file */ - if (table->f_pos == 0) - { - arrowFileWrite(table, "ARROW1\0\0", 8); - writeArrowSchema(table); - 
} - if (table->nitems > 0) - { - mvcc->record_batch = writeArrowRecordBatch(table); - sql_table_clear(table); - dlist_push_tail(&arrow_metadata_state->mvcc_slots[index], - &mvcc->chain); - elog(DEBUG2, - "arrow-write: '%s' (st_dev=%u, st_ino=%u), xid=%u, cid=%u, record_batch=%u nitems=%lu", - FilePathName(aw_state->file), - (uint32)mvcc->key.st_dev, (uint32)mvcc->key.st_ino, - (uint32)mvcc->xid, (uint32)mvcc->cid, mvcc->record_batch, - table->nitems); - } - if (with_footer) - writeArrowFooter(table); - - /* - * Invalidation of the metadata cache, if any - * - * NOTE: metadata cache shall be invalidated on the next reference, - * if st_mtime of the file is newer than st_mtime of the mcache. - * Linux kernel offers nanosecond precision in st_Xtime, but it never - * guarantee the st_Xtime is recorded in nanosecond precision... - */ - arrowInvalidateMetadataCache(&aw_state->key, true); - - LWLockRelease(&arrow_metadata_state->lock_slots[index]); - } - PG_CATCH(); - { - if (mvcc) - pfree(mvcc); - PG_RE_THROW(); - } - PG_END_TRY(); + PG_RETURN_POINTER(&pgstrom_arrow_fdw_routine); } /* - * TRUNCATE support + * validator of Arrow_Fdw */ -static void -__arrowExecTruncateRelation(Relation frel) +Datum +pgstrom_arrow_fdw_validator(PG_FUNCTION_ARGS) { - TupleDesc tupdesc = RelationGetDescr(frel); - Oid frel_oid = RelationGetRelid(frel); - ForeignTable *ft = GetForeignTable(frel_oid); - arrowWriteRedoLog *redo; - ArrowFileInfo af_info; - struct stat stat_buf; - MetadataCacheKey key; - int index; - List *filesList; - SQLtable *table; - const char *path_name; - const char *dir_name; - const char *file_name; - size_t main_sz; - int fdesc = -1; - char backup_path[MAXPGPATH]; - bool writable; - - filesList = __arrowFdwExtractFilesList(ft->options, - NULL, - &writable); - if (!writable) - elog(ERROR, "arrow_fdw: foreign table \"%s\" is not writable", - RelationGetRelationName(frel)); - Assert(list_length(filesList) == 1); - path_name = strVal(linitial(filesList)); - readArrowFile(path_name, &af_info, false); - if (stat(path_name, &stat_buf) != 0) - elog(ERROR, "failed on stat('%s'): %m", path_name); - /* metadata cache invalidation */ - index = initMetadataCacheKey(&key, &stat_buf); - LWLockAcquire(&arrow_metadata_state->lock_slots[index], LW_EXCLUSIVE); - arrowInvalidateMetadataCache(&key, true); - LWLockRelease(&arrow_metadata_state->lock_slots[index]); - - /* build SQLtable to write out schema */ - table = palloc0(offsetof(SQLtable, columns[tupdesc->natts])); - setupArrowSQLbufferSchema(table, tupdesc, &af_info); - - /* create REDO log entry */ - main_sz = MAXALIGN(offsetof(arrowWriteRedoLog, footer_backup)); - redo = MemoryContextAllocZero(CacheMemoryContext, - main_sz + strlen(path_name) + 1); - redo->xid = GetCurrentTransactionId(); - redo->cid = GetCurrentCommandId(true); - redo->pathname = (char *)redo + main_sz; - strcpy(redo->pathname, path_name); - redo->is_truncate = true; - - PG_TRY(); + List *options = untransformRelOptions(PG_GETARG_DATUM(0)); + Oid catalog = PG_GETARG_OID(1); + + if (catalog == ForeignTableRelationId) { - /* - * move the current arrow file to the backup - */ - dir_name = dirname(pstrdup(path_name)); - file_name = basename(pstrdup(path_name)); - for (;;) - { - redo->suffix = random(); - snprintf(backup_path, sizeof(backup_path), - "%s/%s.%u.backup", - dir_name, file_name, redo->suffix); - if (stat(backup_path, &stat_buf) != 0) - { - if (errno == ENOENT) - break; - elog(ERROR, "failed on stat('%s'): %m", backup_path); - } - } - if (rename(path_name, backup_path) != 0) - 
elog(ERROR, "failed on rename('%s','%s'): %m", - path_name, backup_path); + List *filesList = arrowFdwExtractFilesList(options, NULL); + ListCell *lc; - /* - * create an empty arrow file - */ - PG_TRY(); - { - fdesc = open(path_name, O_RDWR | O_CREAT | O_EXCL, 0600); - if (fdesc < 0) - elog(ERROR, "failed on open('%s'): %m", path_name); - if (fstat(fdesc, &stat_buf) != 0) - elog(ERROR, "failed on fstat('%s'): %m", path_name); - initMetadataCacheKey(&redo->key, &stat_buf); - table->filename = path_name; - table->fdesc = fdesc; - arrowFileWrite(table, "ARROW1\0\0", 8); - writeArrowSchema(table); - writeArrowFooter(table); - } - PG_CATCH(); + foreach (lc, filesList) { - if (fdesc >= 0) - close(fdesc); - if (rename(backup_path, path_name) != 0) - elog(WARNING, "failed on rename('%s', '%s'): %m", - backup_path, path_name); - PG_RE_THROW(); + const char *fname = strVal(lfirst(lc)); + ArrowFileInfo af_info; + + readArrowFile(fname, &af_info, true); } - PG_END_TRY(); - close(fdesc); - } - PG_CATCH(); - { - pfree(redo); - PG_RE_THROW(); } - PG_END_TRY(); - /* save the REDO log entry */ - dlist_push_head(&arrow_write_redo_list, &redo->chain); -} - -#if PG_VERSION_NUM >= 140000 -/* - * TRUNCATE support - */ -static void -ArrowExecForeignTruncate(List *rels, DropBehavior behavior, bool restart_seqs) -{ - ListCell *lc; - - foreach (lc, rels) + else if (options != NIL) { - Relation frel = lfirst(lc); + const char *label; - __arrowExecTruncateRelation(frel); + switch (catalog) + { + case ForeignDataWrapperRelationId: + label = "FOREIGN DATA WRAPPER"; + break; + case ForeignServerRelationId: + label = "SERVER"; + break; + case UserMappingRelationId: + label = "USER MAPPING"; + break; + case AttributeRelationId: + label = "attribute of FOREIGN TABLE"; + break; + default: + label = "????"; + break; + } + elog(ERROR, "Arrow_Fdw does not support any options for %s", label); } + PG_RETURN_VOID(); } -#endif /* - * pgstrom_arrow_fdw_truncate + * pgstrom_arrow_fdw_precheck_schema */ Datum -pgstrom_arrow_fdw_truncate(PG_FUNCTION_ARGS) -{ -#if PG_VERSION_NUM < 140000 - Oid frel_oid = PG_GETARG_OID(0); - Relation frel; - FdwRoutine *routine; - - frel = table_open(frel_oid, AccessExclusiveLock); - if (frel->rd_rel->relkind != RELKIND_FOREIGN_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("\"%s\" is not arrow_fdw foreign table", - RelationGetRelationName(frel)))); - routine = GetFdwRoutineForRelation(frel, false); - if (memcmp(routine, &pgstrom_arrow_fdw_routine, sizeof(FdwRoutine)) != 0) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("\"%s\" is not arrow_fdw foreign table", - RelationGetRelationName(frel)))); - __arrowExecTruncateRelation(frel); - - table_close(frel, NoLock); -#else - elog(ERROR, "PostgreSQL v14 supports TRUNCATE ; use the standard statement instead of the legacy interface"); -#endif - PG_RETURN_VOID(); -} -PG_FUNCTION_INFO_V1(pgstrom_arrow_fdw_truncate); - -static void -__applyArrowTruncateRedoLog(arrowWriteRedoLog *redo, bool is_commit) +pgstrom_arrow_fdw_precheck_schema(PG_FUNCTION_ARGS) { - char backup[MAXPGPATH]; - - snprintf(backup, MAXPGPATH, "%s.%u.backup", - redo->pathname, redo->suffix); - if (is_commit) - { - elog(DEBUG2, "arrow-redo: unlink [%s]", backup); - if (unlink(backup) != 0) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove truncated file \"%s\": %m", - backup), - errhint("remove the \"%s\" manually", backup))); - } - else - { - elog(DEBUG2, "arrow-redo: rename [%s]->[%s]", backup, redo->pathname); - if 
(rename(backup, redo->pathname) != 0) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not restore backup file \"%s\": %m", - backup), - errhint("please restore \"%s\" to \"%s\" manually", - backup, redo->pathname))); - arrowInvalidateMetadataCache(&redo->key, true); - } -} + EventTriggerData *trigdata; + Relation frel = NULL; + ListCell *lc; + bool check_schema_compatibility = false; -static void -__applyArrowInsertRedoLog(arrowWriteRedoLog *redo, bool is_commit) -{ - int fdesc; - - if (is_commit) - return; + if (!CALLED_AS_EVENT_TRIGGER(fcinfo)) + elog(ERROR, "%s: must be called as EventTrigger", __FUNCTION__); + trigdata = (EventTriggerData *) fcinfo->context; + if (strcmp(trigdata->event, "ddl_command_end") != 0) + elog(ERROR, "%s: must be called on ddl_command_end event", __FUNCTION__); - /* special case, if it was an empty file */ - if (redo->footer_offset == 0 && - redo->footer_length == 0) + if (strcmp(GetCommandTagName(trigdata->tag), + "CREATE FOREIGN TABLE") == 0) { - if (unlink(redo->pathname) != 0) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("failed on truncate('%s'): %m", redo->pathname), - errdetail("could not apply REDO image, therefore, garbages are still remained"))); - return; - } + CreateStmt *stmt = (CreateStmt *)trigdata->parsetree; - fdesc = open(redo->pathname, O_RDWR); - if (fdesc < 0) - { - ereport(WARNING, - (errcode_for_file_access(), - errmsg("failed on open('%s'): %m", redo->pathname), - errdetail("could not apply REDO image, therefore, arrow file might be corrupted"))); - } - else if (lseek(fdesc, redo->footer_offset, SEEK_SET) < 0) - { - ereport(WARNING, - (errcode_for_file_access(), - errmsg("failed on lseek('%s'): %m", redo->pathname), - errdetail("could not apply REDO image, therefore, arrow file might be corrupted"))); - } - else if (__writeFile(fdesc, - redo->footer_backup, - redo->footer_length) != redo->footer_length) - { - ereport(WARNING, - (errcode_for_file_access(), - errmsg("failed on write('%s'): %m", redo->pathname), - errdetail("could not apply REDO image, therefore, arrow file might be corrupted"))); + frel = relation_openrv_extended(stmt->relation, NoLock, true); + if (frel && RelationIsArrowFdw(frel)) + check_schema_compatibility = true; } - else if (ftruncate(fdesc, (redo->footer_offset + - redo->footer_length)) != 0) + else if (strcmp(GetCommandTagName(trigdata->tag), + "ALTER FOREIGN TABLE") == 0 && + IsA(trigdata->parsetree, AlterTableStmt)) { - ereport(WARNING, - (errcode_for_file_access(), - errmsg("failed on ftruncate('%s'): %m", redo->pathname), - errdetail("could not apply REDO image, therefore, arrow file might be corrupted"))); - } - /* invalidation of the metadata-cache */ - arrowInvalidateMetadataCache(&redo->key, true); - - close(fdesc); - - elog(DEBUG2, "arrow_fdw: REDO log applied (xid=%u, cid=%u, file=[%s], offset=%zu, length=%zu)", redo->xid, redo->cid, redo->pathname, redo->footer_offset, redo->footer_length); -} - -static void -__cleanupArrowWriteMVCCLog(TransactionId curr_xid, dlist_head *mvcc_slot) -{ - dlist_mutable_iter iter; + AlterTableStmt *stmt = (AlterTableStmt *)trigdata->parsetree; - dlist_foreach_modify(iter, mvcc_slot) - { - arrowWriteMVCCLog *mvcc = dlist_container(arrowWriteMVCCLog, - chain, iter.cur); - if (mvcc->xid == curr_xid) + frel = relation_openrv_extended(stmt->relation, NoLock, true); + if (frel && RelationIsArrowFdw(frel)) { - dlist_delete(&mvcc->chain); - elog(DEBUG2, "arrow: release mvcc-log (st_dev=%u, st_ino=%u), xid=%u, cid=%u, record_batch=%u", - 
(uint32)mvcc->key.st_dev, (uint32)mvcc->key.st_ino, - (uint32)mvcc->xid, (uint32)mvcc->cid, mvcc->record_batch); - pfree(mvcc); + foreach (lc, stmt->cmds) + { + AlterTableCmd *cmd = lfirst(lc); + + if (cmd->subtype == AT_AddColumn || + cmd->subtype == AT_DropColumn || + cmd->subtype == AT_AlterColumnType) + { + check_schema_compatibility = true; + break; + } + } } } -} -/* - * __arrowFdwXactCallback - */ -static void -__arrowFdwXactCallback(TransactionId curr_xid, bool is_commit) -{ - arrowWriteRedoLog *redo; - dlist_mutable_iter iter; - CommandId curr_cid = InvalidCommandId; - uint32 index; - bool locked[ARROW_METADATA_HASH_NSLOTS]; - LWLock *locks[ARROW_METADATA_HASH_NSLOTS]; - uint32 lcount = 0; - - if (curr_xid == InvalidTransactionId || - dlist_is_empty(&arrow_write_redo_list)) - return; - - memset(locked, 0, sizeof(locked)); - dlist_foreach_modify(iter, &arrow_write_redo_list) + if (check_schema_compatibility) { - redo = dlist_container(arrowWriteRedoLog, chain, iter.cur); - if (redo->xid != curr_xid) - continue; - if (curr_cid != InvalidCommandId && - curr_cid < redo->cid) - elog(WARNING, "Bug? Order of REDO log is not be correct. ABORT transaction might generate wrong image restored."); + ForeignTable *ft = GetForeignTable(RelationGetRelid(frel)); + List *filesList = arrowFdwExtractFilesList(ft->options, NULL); - index = redo->key.hash % ARROW_METADATA_HASH_NSLOTS; - if (!locked[index]) + foreach (lc, filesList) { - LWLock *lock = &arrow_metadata_state->lock_slots[index]; - dlist_head *slot = &arrow_metadata_state->mvcc_slots[index]; + const char *fname = strVal(lfirst(lc)); - LWLockAcquire(lock, LW_EXCLUSIVE); - __cleanupArrowWriteMVCCLog(curr_xid, slot); - locked[index] = true; - locks[lcount++] = lock; + (void)BuildArrowFileState(frel, fname, NULL); } - if (redo->is_truncate) - __applyArrowTruncateRedoLog(redo, is_commit); - else - __applyArrowInsertRedoLog(redo, is_commit); - - dlist_delete(&redo->chain); - pfree(redo); } - - for (index=0; index < lcount; index++) - LWLockRelease(locks[index]); -} - -/* - * arrowFdwXactCallback - */ -static void -arrowFdwXactCallback(XactEvent event, void *arg) -{ - TransactionId curr_xid = GetCurrentTransactionIdIfAny(); - - if (event == XACT_EVENT_COMMIT) - __arrowFdwXactCallback(curr_xid, true); - else if (event == XACT_EVENT_ABORT) - __arrowFdwXactCallback(curr_xid, false); -} - -/* - * arrowFdwSubXactCallback - */ -static void -arrowFdwSubXactCallback(SubXactEvent event, SubTransactionId mySubid, - SubTransactionId parentSubid, void *arg) -{ - TransactionId curr_xid = GetCurrentTransactionIdIfAny(); - - if (event == SUBXACT_EVENT_COMMIT_SUB) - __arrowFdwXactCallback(curr_xid, true); - else if (event == SUBXACT_EVENT_ABORT_SUB) - __arrowFdwXactCallback(curr_xid, false); + if (frel) + relation_close(frel, NoLock); + PG_RETURN_NULL(); } /* @@ -6093,9 +4520,13 @@ arrowFdwSubXactCallback(SubXactEvent event, SubTransactionId mySubid, static void pgstrom_request_arrow_fdw(void) { + size_t sz; + if (shmem_request_next) shmem_request_next(); - RequestAddinShmemSpace(MAXALIGN(sizeof(arrowMetadataState))); + sz = TYPEALIGN(ARROW_METADATA_BLOCKSZ, + (size_t)arrow_metadata_cache_size_kb << 10); + RequestAddinShmemSpace(MAXALIGN(sizeof(arrowMetadataCacheHead)) + sz); } /* @@ -6105,26 +4536,41 @@ static void pgstrom_startup_arrow_fdw(void) { bool found; - int i; + size_t sz; + char *buffer; + int i, n; if (shmem_startup_next) (*shmem_startup_next)(); - arrow_metadata_state = - ShmemInitStruct("arrow_metadata_state", - 
MAXALIGN(sizeof(arrowMetadataState)), - &found); - if (!IsUnderPostmaster) + arrow_metadata_cache = ShmemInitStruct("arrowMetadataCache(head)", + MAXALIGN(sizeof(arrowMetadataCacheHead)), + &found); + Assert(!found); + + LWLockInitialize(&arrow_metadata_cache->mutex, LWLockNewTrancheId()); + SpinLockInit(&arrow_metadata_cache->lru_lock); + dlist_init(&arrow_metadata_cache->lru_list); + dlist_init(&arrow_metadata_cache->free_blocks); + dlist_init(&arrow_metadata_cache->free_mcaches); + dlist_init(&arrow_metadata_cache->free_fcaches); + for (i=0; i < ARROW_METADATA_HASH_NSLOTS; i++) + dlist_init(&arrow_metadata_cache->hash_slots[i]); + + /* slab allocator */ + sz = TYPEALIGN(ARROW_METADATA_BLOCKSZ, + (size_t)arrow_metadata_cache_size_kb << 10); + n = sz / ARROW_METADATA_BLOCKSZ; + buffer = ShmemInitStruct("arrowMetadataCache(body)", sz, &found); + Assert(!found); + for (i=0; i < n; i++) { - SpinLockInit(&arrow_metadata_state->lru_lock); - dlist_init(&arrow_metadata_state->lru_list); - pg_atomic_init_u64(&arrow_metadata_state->consumed, 0UL); - for (i=0; i < ARROW_METADATA_HASH_NSLOTS; i++) - { - LWLockInitialize(&arrow_metadata_state->lock_slots[i], -1); - dlist_init(&arrow_metadata_state->hash_slots[i]); - dlist_init(&arrow_metadata_state->mvcc_slots[i]); - } + arrowMetadataCacheBlock *mc_block = (arrowMetadataCacheBlock *)buffer; + + memset(mc_block, 0, offsetof(arrowMetadataCacheBlock, data)); + dlist_push_tail(&arrow_metadata_cache->free_blocks, &mc_block->chain); + + buffer += ARROW_METADATA_BLOCKSZ; } } @@ -6150,28 +4596,15 @@ pgstrom_init_arrow_fdw(void) r->ExplainForeignScan = ArrowExplainForeignScan; /* ANALYZE support */ r->AnalyzeForeignTable = ArrowAnalyzeForeignTable; - /* IMPORT FOREIGN SCHEMA support */ - r->ImportForeignSchema = ArrowImportForeignSchema; -#if PG_VERSION_NUM >= 140000 - r->ExecForeignTruncate = ArrowExecForeignTruncate; -#endif /* CPU Parallel support */ r->IsForeignScanParallelSafe = ArrowIsForeignScanParallelSafe; r->EstimateDSMForeignScan = ArrowEstimateDSMForeignScan; r->InitializeDSMForeignScan = ArrowInitializeDSMForeignScan; - r->ReInitializeDSMForeignScan = ArrowReInitializeDSMForeignScan; + //r->ReInitializeDSMForeignScan = ArrowReInitializeDSMForeignScan; r->InitializeWorkerForeignScan = ArrowInitializeWorkerForeignScan; r->ShutdownForeignScan = ArrowShutdownForeignScan; - /* INSERT/DELETE support */ - r->PlanForeignModify = ArrowPlanForeignModify; - r->BeginForeignModify = ArrowBeginForeignModify; - r->ExecForeignInsert = ArrowExecForeignInsert; - r->EndForeignModify = ArrowEndForeignModify; -#if PG_VERSION_NUM >= 110000 - r->BeginForeignInsert = ArrowBeginForeignInsert; - r->EndForeignInsert = ArrowEndForeignInsert; -#endif - r->ExplainForeignModify = ArrowExplainForeignModify; + /* IMPORT FOREIGN SCHEMA support */ + r->ImportForeignSchema = ArrowImportForeignSchema; /* * Turn on/off arrow_fdw @@ -6202,38 +4635,23 @@ pgstrom_init_arrow_fdw(void) "size of shared metadata cache for arrow files", NULL, &arrow_metadata_cache_size_kb, - 131072, /* 128MB */ - 32768, /* 32MB */ + 512 * 1024, /* 512MB */ + 32 * 1024, /* 32MB */ INT_MAX, PGC_POSTMASTER, GUC_NOT_IN_SAMPLE | GUC_UNIT_KB, NULL, NULL, NULL); - arrow_metadata_cache_size = (size_t)arrow_metadata_cache_size_kb << 10; - - /* - * Limit of RecordBatch size for writing - */ - DefineCustomIntVariable("arrow_fdw.record_batch_size", - "maximum size of record batch on writing", - NULL, - &arrow_record_batch_size_kb, - 256 * 1024, /* default: 256MB */ - 4 * 1024, /* min: 4MB */ - 2048 * 1024, /* max: 2GB 
*/ - PGC_USERSET, - GUC_NOT_IN_SAMPLE | GUC_UNIT_KB, - NULL, NULL, NULL); - /* shared memory size */ shmem_request_next = shmem_request_hook; shmem_request_hook = pgstrom_request_arrow_fdw; shmem_startup_next = shmem_startup_hook; shmem_startup_hook = pgstrom_startup_arrow_fdw; - - /* transaction callback */ - RegisterXactCallback(arrowFdwXactCallback, NULL); - RegisterSubXactCallback(arrowFdwSubXactCallback, NULL); - - /* misc init */ - dlist_init(&arrow_write_redo_list); } + + + + + + + + diff --git a/src/arrow_ipc.h b/src/arrow_ipc.h index 6abc07e7a..ea0d9f50d 100644 --- a/src/arrow_ipc.h +++ b/src/arrow_ipc.h @@ -3,8 +3,8 @@ * * Definitions for Apache Arrow IPC stuff. * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. diff --git a/src/arrow_nodes.c b/src/arrow_nodes.c index 449108012..f7cb6e820 100644 --- a/src/arrow_nodes.c +++ b/src/arrow_nodes.c @@ -4,8 +4,8 @@ * Routines to handle ArrowNode objects, intermediation of PostgreSQL types * and Apache Arrow types. * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. @@ -1947,14 +1947,16 @@ readArrowFooter(ArrowFooter *node, const char *pos) #define ARROW_FILE_TAIL_SIGNATURE "ARROW1" #define ARROW_FILE_TAIL_SIGNATURE_SZ (sizeof(ARROW_FILE_TAIL_SIGNATURE) - 1) -#ifdef __PGSTROM_MODULE__ -#include "pg_strom.h" -#define __mmap(a,b,c,d,e,f) __mmapFile((a),(b),(c),(d),(e),(f)) -#define __munmap(a,b) __munmapFile((a)) -#else -#define __mmap(a,b,c,d,e,f) mmap((a),(b),(c),(d),(e),(f)) -#define __munmap(a,b) munmap((a),(b)) -#endif /* __PGSTROM_MODULE__ */ +#ifndef __PGSTROM_MODULE__ +#define PG_TRY() \ + if (true) { \ + bool __dummy__ __attribute__((unused)) +#define PG_FINALLY() \ + } else { \ + bool __dummy__ __attribute__((unused)) +#define PG_END_TRY() \ + } +#endif void readArrowFileDesc(int fdesc, ArrowFileInfo *af_info) @@ -1975,85 +1977,93 @@ readArrowFileDesc(int fdesc, ArrowFileInfo *af_info) if (__PAGE_SIZE == 0) __PAGE_SIZE = sysconf(_SC_PAGESIZE); mmap_sz = ((file_sz + __PAGE_SIZE - 1) & ~(__PAGE_SIZE - 1)); - mmap_head = __mmap(NULL, mmap_sz, PROT_READ, MAP_SHARED, fdesc, 0); + mmap_head = mmap(NULL, mmap_sz, PROT_READ, MAP_SHARED, fdesc, 0); if (mmap_head == MAP_FAILED) Elog("failed on mmap: %m"); mmap_tail = mmap_head + file_sz - ARROW_FILE_TAIL_SIGNATURE_SZ; /* check signature */ - if (memcmp(mmap_head, - ARROW_FILE_HEAD_SIGNATURE, - ARROW_FILE_HEAD_SIGNATURE_SZ) != 0 || - memcmp(mmap_tail, - ARROW_FILE_TAIL_SIGNATURE, - ARROW_FILE_TAIL_SIGNATURE_SZ) != 0) + PG_TRY(); { - Elog("Signature mismatch on Apache Arrow file"); - } + if (memcmp(mmap_head, + ARROW_FILE_HEAD_SIGNATURE, + ARROW_FILE_HEAD_SIGNATURE_SZ) != 0 || + memcmp(mmap_tail, + ARROW_FILE_TAIL_SIGNATURE, + ARROW_FILE_TAIL_SIGNATURE_SZ) != 0) + { + Elog("Signature mismatch on Apache Arrow file"); + } - /* Read Footer chunk */ - pos = mmap_tail - sizeof(int32_t); - offset = *((int32_t *)pos); - pos -= offset; - offset = *((int32_t *)pos); - readArrowFooter(&af_info->footer, pos + offset); + /* Read Footer chunk */ + pos = mmap_tail - sizeof(int32_t); + 
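/*
+		 * Tail layout of an Apache Arrow file:
+		 *   <Footer flatbuffer> <int32 footer length> <"ARROW1">
+		 * 'pos' now points to the int32 footer length just before the
+		 * tail signature; winding back by this length reaches the head
+		 * of the Footer flatbuffer, whose first int32 is the offset of
+		 * its root table, decoded by readArrowFooter() below.
+		 */
+		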
offset = *((int32_t *)pos); + pos -= offset; + offset = *((int32_t *)pos); + readArrowFooter(&af_info->footer, pos + offset); - /* Read DictionaryBatch chunks */ - nitems = af_info->footer._num_dictionaries; - if (nitems > 0) - { - af_info->dictionaries = palloc0(nitems * sizeof(ArrowMessage)); - for (i=0; i < nitems; i++) + /* Read DictionaryBatch chunks */ + nitems = af_info->footer._num_dictionaries; + if (nitems > 0) { - ArrowBlock *b = &af_info->footer.dictionaries[i]; - ArrowMessage *m = &af_info->dictionaries[i]; - int32_t *ival = (int32_t *)(mmap_head + b->offset); - int32_t metaLength __attribute__((unused)); - int32_t *headOffset; - - if (*ival == 0xffffffff) - { - metaLength = ival[1]; - headOffset = ival + 2; - } - else + af_info->dictionaries = palloc0(nitems * sizeof(ArrowMessage)); + for (i=0; i < nitems; i++) { - /* Older format prior to Arrow v0.15 */ - metaLength = *ival; - headOffset = ival + 1; + ArrowBlock *b = &af_info->footer.dictionaries[i]; + ArrowMessage *m = &af_info->dictionaries[i]; + int32_t *ival = (int32_t *)(mmap_head + b->offset); + int32_t metaLength __attribute__((unused)); + int32_t *headOffset; + + if (*ival == 0xffffffff) + { + metaLength = ival[1]; + headOffset = ival + 2; + } + else + { + /* Older format prior to Arrow v0.15 */ + metaLength = *ival; + headOffset = ival + 1; + } + pos = (const char *)headOffset + *headOffset; + readArrowMessage(m, pos); } - pos = (const char *)headOffset + *headOffset; - readArrowMessage(m, pos); } - } - /* Read RecordBatch chunks */ - nitems = af_info->footer._num_recordBatches; - if (nitems > 0) - { - af_info->recordBatches = palloc0(nitems * sizeof(ArrowMessage)); - for (i=0; i < nitems; i++) + /* Read RecordBatch chunks */ + nitems = af_info->footer._num_recordBatches; + if (nitems > 0) { - ArrowBlock *b = &af_info->footer.recordBatches[i]; - ArrowMessage *m = &af_info->recordBatches[i]; - int32_t *ival = (int32_t *)(mmap_head + b->offset); - int32_t metaLength __attribute__((unused)); - int32_t *headOffset; - - if (*ival == 0xffffffff) + af_info->recordBatches = palloc0(nitems * sizeof(ArrowMessage)); + for (i=0; i < nitems; i++) { - metaLength = ival[1]; - headOffset = ival + 2; + ArrowBlock *b = &af_info->footer.recordBatches[i]; + ArrowMessage *m = &af_info->recordBatches[i]; + int32_t *ival = (int32_t *)(mmap_head + b->offset); + int32_t metaLength __attribute__((unused)); + int32_t *headOffset; + + if (*ival == 0xffffffff) + { + metaLength = ival[1]; + headOffset = ival + 2; + } + else + { + /* Older format prior to Arrow v0.15 */ + metaLength = *ival; + headOffset = ival + 1; + } + pos = (const char *)headOffset + *headOffset; + readArrowMessage(m, pos); } - else - { - /* Older format prior to Arrow v0.15 */ - metaLength = *ival; - headOffset = ival + 1; - } - pos = (const char *)headOffset + *headOffset; - readArrowMessage(m, pos); } + munmap(mmap_head, mmap_sz); + } + PG_FINALLY(); + { + munmap(mmap_head, mmap_sz); } - __munmap(mmap_head, mmap_sz); + PG_END_TRY(); } diff --git a/next/brin.c b/src/brin.c similarity index 100% rename from next/brin.c rename to src/brin.c diff --git a/src/codegen.c b/src/codegen.c index 914d04160..426e4f282 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -1,533 +1,253 @@ /* * codegen.c * - * Routines for CUDA code generator + * Routines for xPU code generator * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is 
free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. */ #include "pg_strom.h" -#include "cuda_numeric.h" -#include "cuda_postgis.h" - -static MemoryContext devinfo_memcxt; -static dlist_head devtype_info_slot[128]; -static dlist_head devfunc_info_slot[1024]; -static dlist_head devcast_info_slot[48]; -static dlist_head devindex_info_slot[48]; - -static cl_uint generic_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_int1_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_int2_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_int4_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_int8_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_float2_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_float4_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_float8_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_numeric_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_interval_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_bpchar_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_inet_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_jsonb_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_range_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_geometry_devtype_hashfunc(devtype_info *dtype, Datum datum); -static cl_uint pg_box2df_devtype_hashfunc(devtype_info *dtype, Datum datum); - -/* callback to handle special cases of device cast */ -static int devcast_text2numeric_callback(codegen_context *context, - StringInfo body, - devcast_info *dcast, - CoerceViaIO *node); -/* error report */ -#define __ELog(fmt, ...) 
\ - ereport(ERROR, \ - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg((fmt), ##__VA_ARGS__))) - -/* known extension name */ -#define PGSTROM "pg_strom" -#define POSTGIS3 "postgis" - -/* - * Catalog of data types supported by device code - * - * naming convension of types: - * pg__t - */ - -/* - * MEMO: PG10 does not have OID definitions below - */ -#ifndef INT8RANGEOID -#define INT8RANGEOID 3926 -#endif -#ifndef TSRANGEOID -#define TSRANGEOID 3908 -#endif -#ifndef TSTZRANGEOID -#define TSTZRANGEOID 3910 -#endif -#ifndef DATERANGEOID -#define DATERANGEOID 3912 -#endif +/* -------- static variables --------*/ +#define DEVTYPE_INFO_NSLOTS 128 +#define DEVFUNC_INFO_NSLOTS 1024 +static MemoryContext devinfo_memcxt = NULL; +static List *devtype_info_slot[DEVTYPE_INFO_NSLOTS]; +static List *devtype_code_slot[DEVTYPE_INFO_NSLOTS]; /* by TypeOpCode */ +static List *devfunc_info_slot[DEVFUNC_INFO_NSLOTS]; +static List *devfunc_code_slot[DEVFUNC_INFO_NSLOTS]; /* by FuncOpCode */ + +#define TYPE_OPCODE(NAME,OID,EXTENSION,FLAGS) \ + static uint32_t devtype_##NAME##_hash(bool isnull, Datum value); +#include "xpu_opcodes.h" + +#define TYPE_OPCODE(NAME,OID,EXTENSION,FLAGS) \ + { EXTENSION, #NAME, TypeOpCode__##NAME, \ + DEVKIND__ANY | (FLAGS), \ + devtype_##NAME##_hash, \ + sizeof(xpu_##NAME##_t), \ + __alignof__(xpu_##NAME##_t), InvalidOid}, static struct { const char *type_extension; const char *type_name; - Oid type_oid_fixed; /* can be InvalidOid if not build-in */ - const char *type_oid_label; - cl_uint type_flags; /* library to declare this type */ - cl_uint extra_sz; /* required size to store internal form */ - devtype_hashfunc_type hash_func; + TypeOpCode type_code; + uint32_t type_flags; + devtype_hashfunc_f type_hashfunc; + int type_sizeof; + int type_alignof; + Oid type_alias; } devtype_catalog[] = { - /* - * Primitive datatypes - */ - { NULL, "bool", BOOLOID, "BOOLOID", - 0, 0, generic_devtype_hashfunc - }, - { PGSTROM, "int1", INT1OID, "INT1OID", - 0, 0, pg_int1_devtype_hashfunc - }, - { NULL, "int2", INT2OID, "INT2OID", - 0, 0, pg_int2_devtype_hashfunc - }, - { NULL, "int4", INT4OID, "INT4OID", - 0, 0, pg_int4_devtype_hashfunc - }, - { NULL, "int8", INT8OID, "INT8OID", - 0, 0, pg_int8_devtype_hashfunc - }, - /* XXX - float2 is not a built-in data type */ - { PGSTROM, "float2", FLOAT2OID, "FLOAT2OID", - 0, 0, pg_float2_devtype_hashfunc - }, - { NULL, "float4", FLOAT4OID, "FLOAT4OID", - 0, 0, pg_float4_devtype_hashfunc - }, - { NULL, "float8", FLOAT8OID, "FLOAT8OID", - 0, 0, pg_float8_devtype_hashfunc - }, - /* - * Misc data types - */ - { NULL, "money", CASHOID, "CASHOID", - DEVKERNEL_NEEDS_MISCLIB, 0, - generic_devtype_hashfunc - }, - { NULL, "uuid", UUIDOID, "UUIDOID", - DEVKERNEL_NEEDS_MISCLIB, UUID_LEN, - generic_devtype_hashfunc - }, - { NULL, "macaddr", MACADDROID, "MACADDROID", - DEVKERNEL_NEEDS_MISCLIB, sizeof(macaddr), - generic_devtype_hashfunc - }, - { NULL, "inet", INETOID, "INETOID", - DEVKERNEL_NEEDS_MISCLIB, sizeof(inet), - pg_inet_devtype_hashfunc - }, - { NULL, "cidr", CIDROID, "CIDROID", - DEVKERNEL_NEEDS_MISCLIB, sizeof(inet), - pg_inet_devtype_hashfunc - }, - /* - * Date and time datatypes - */ - { NULL, "date", DATEOID, "DATEOID", - DEVKERNEL_NEEDS_TIMELIB, 0, - generic_devtype_hashfunc - }, - { NULL, "time", TIMEOID, "TIMEOID", - DEVKERNEL_NEEDS_TIMELIB, 0, - generic_devtype_hashfunc - }, - { NULL, "timetz", TIMETZOID, "TIMETZOID", - DEVKERNEL_NEEDS_TIMELIB, sizeof(TimeTzADT), - generic_devtype_hashfunc - }, - { NULL, "timestamp", TIMESTAMPOID, "TIMESTAMPOID", - 
DEVKERNEL_NEEDS_TIMELIB, 0, - generic_devtype_hashfunc - }, - { NULL, "timestamptz", TIMESTAMPTZOID, "TIMESTAMPTZOID", - DEVKERNEL_NEEDS_TIMELIB, 0, - generic_devtype_hashfunc - }, - { NULL, "interval", INTERVALOID, "INTERVALOID", - DEVKERNEL_NEEDS_TIMELIB, sizeof(Interval), - pg_interval_devtype_hashfunc - }, - /* - * variable length datatypes - */ - { NULL, "bpchar", BPCHAROID, "BPCHAROID", - DEVKERNEL_NEEDS_TEXTLIB, 0, - pg_bpchar_devtype_hashfunc - }, - { NULL, "varchar", VARCHAROID, "VARCHAROID", - DEVKERNEL_NEEDS_TEXTLIB, 0, - generic_devtype_hashfunc - }, - { NULL, "numeric", NUMERICOID, "NUMERICOID", - 0, sizeof(struct NumericData), - pg_numeric_devtype_hashfunc - }, - { NULL, "bytea", BYTEAOID, "BYTEAOID", - 0, sizeof(pg_varlena_t), - generic_devtype_hashfunc - }, - { NULL, "text", TEXTOID, "TEXTOID", - DEVKERNEL_NEEDS_TEXTLIB, sizeof(pg_varlena_t), - generic_devtype_hashfunc - }, - { NULL, "jsonb", JSONBOID, "JSONBOID", - DEVKERNEL_NEEDS_JSONLIB, - /* see comment at vlbuf_estimate_jsonb() */ - TOAST_TUPLE_THRESHOLD, - pg_jsonb_devtype_hashfunc - }, - /* - * range types - */ - { NULL, "int4range", INT4RANGEOID, "INT4RANGEOID", - DEVKERNEL_NEEDS_RANGETYPE, - sizeof(RangeType) + 2 * sizeof(cl_int) + 1, - pg_range_devtype_hashfunc - }, - { NULL, "int8range", INT8RANGEOID, "INT8RANGEOID", - DEVKERNEL_NEEDS_RANGETYPE, - sizeof(RangeType) + 2 * sizeof(cl_long) + 1, - pg_range_devtype_hashfunc - }, - { NULL, "tsrange", TSRANGEOID, "TSRANGEOID", - DEVKERNEL_NEEDS_TIMELIB | DEVKERNEL_NEEDS_RANGETYPE, - sizeof(RangeType) + 2 * sizeof(Timestamp) + 1, - pg_range_devtype_hashfunc - }, - { NULL, "tstzrange", TSTZRANGEOID, "TSTZRANGEOID", - DEVKERNEL_NEEDS_TIMELIB | DEVKERNEL_NEEDS_RANGETYPE, - sizeof(RangeType) + 2 * sizeof(TimestampTz) + 1, - pg_range_devtype_hashfunc - }, - { NULL, "daterange", DATERANGEOID, "DATERANGEOID", - DEVKERNEL_NEEDS_TIMELIB | DEVKERNEL_NEEDS_RANGETYPE, - sizeof(RangeType) + 2 * sizeof(DateADT) + 1, - pg_range_devtype_hashfunc - }, - /* - * PostGIS types - */ - { POSTGIS3, "geometry", InvalidOid, "GEOMETRYOID", - DEVKERNEL_NEEDS_POSTGIS, - sizeof(pg_geometry_t), - pg_geometry_devtype_hashfunc - }, - { POSTGIS3, "box2df", InvalidOid, "BOX2DFOID", - DEVKERNEL_NEEDS_POSTGIS, - sizeof(pg_box2df_t), - pg_box2df_devtype_hashfunc - } +#include "xpu_opcodes.h" + /* alias device data types */ + {NULL, "varchar", TypeOpCode__text, DEVKIND__ANY, + devtype_text_hash, sizeof(xpu_text_t), TEXTOID}, + {NULL, "cidr", TypeOpCode__inet, DEVKIND__ANY, + devtype_inet_hash, sizeof(xpu_inet_t), INETOID}, + {NULL, NULL, TypeOpCode__Invalid, 0, NULL, 0, InvalidOid} }; static const char * get_extension_name_by_object(Oid class_id, Oid object_id) { - Relation rel; - ScanKeyData skeys[2]; - SysScanDesc scan; - HeapTuple htup; - const char *ext_name = NULL; - - ScanKeyInit(&skeys[0], - Anum_pg_depend_classid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(class_id)); - ScanKeyInit(&skeys[1], - Anum_pg_depend_objid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(object_id)); - - rel = table_open(DependRelationId, AccessShareLock); - scan = systable_beginscan(rel, DependDependerIndexId, true, - NULL, 2, skeys); - while (HeapTupleIsValid(htup = systable_getnext(scan))) - { - Form_pg_depend dep = (Form_pg_depend) GETSTRUCT(htup); - const char *__ext_name; - - if (dep->refclassid == ExtensionRelationId && - dep->deptype == DEPENDENCY_EXTENSION) - { - __ext_name = get_extension_name(dep->refobjid); - if (__ext_name) - ext_name = quote_identifier(__ext_name); - break; - } - } - 
systable_endscan(scan); - table_close(rel, AccessShareLock); - - return ext_name; -} - -static void -append_string_devtype_identifier(StringInfo buf, Oid type_oid) -{ - HeapTuple htup; - Form_pg_type type_form; - char *nsp_name; - - htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type_oid)); - if (!HeapTupleIsValid(htup)) - elog(ERROR, "cache lookup failed for type %u", type_oid); - type_form = (Form_pg_type) GETSTRUCT(htup); - - nsp_name = get_namespace_name(type_form->typnamespace); - if (!nsp_name) - elog(ERROR, "cache lookup failed for namespace %u", type_form->typnamespace); - appendStringInfo(buf, "%s.%s", - quote_identifier(nsp_name), - quote_identifier(NameStr(type_form->typname))); - ReleaseSysCache(htup); -} - -/* - * build_extra_devtype_info - * - * it queries the extra device type support - */ -static devtype_info * -build_extra_devtype_info(TypeCacheEntry *tcache, const char *ext_name) -{ - StringInfoData ident; - devtype_info __dtype; - devtype_info *dtype = NULL; - int i; + Oid ext_oid = getExtensionOfObject(class_id, object_id); - /* setup arguments */ - initStringInfo(&ident); - append_string_devtype_identifier(&ident, tcache->type_id); - - memset(&__dtype, 0, sizeof(devtype_info)); - __dtype.type_extension = ext_name; - __dtype.type_oid = tcache->type_id; - __dtype.type_flags = 0; - __dtype.type_length = tcache->typlen; - __dtype.type_align = typealign_get_width(tcache->typalign); - __dtype.type_byval = tcache->typbyval; - __dtype.type_name = NULL; /* callback must set the device type name */ - __dtype.extra_sz = 0; - __dtype.hash_func = NULL; - __dtype.type_eqfunc = get_opcode(tcache->eq_opr); - __dtype.type_cmpfunc = tcache->cmp_proc; - - for (i=0; i < pgstrom_num_users_extra; i++) - { - pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; - - if (extra->lookup_extra_devtype && - extra->lookup_extra_devtype(ident.data, &__dtype)) - { - MemoryContext oldcxt; - - /* must be still base type */ - Assert(__dtype.type_element == NULL && - __dtype.comp_nfields == 0); - if (!__dtype.type_name) - { - elog(DEBUG2, "Extra module didn't set device type name for '%s'", - format_type_be(tcache->type_id)); - continue; - } - oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - dtype = pmemdup(&__dtype, offsetof(devtype_info, comp_subtypes[0])); - if (__dtype.type_extension) - dtype->type_extension = pstrdup(__dtype.type_extension); - dtype->type_name = pstrdup(__dtype.type_name); - dtype->type_flags |= extra->extra_flags; - MemoryContextSwitchTo(oldcxt); - break; - } - } - pfree(ident.data); - return dtype; + if (OidIsValid(ext_oid)) + return get_extension_name(ext_oid); + return NULL; } static devtype_info * build_basic_devtype_info(TypeCacheEntry *tcache, const char *ext_name) { + devtype_info *dtype = NULL; HeapTuple htup; - Form_pg_type type_form; - const char *type_name; - devtype_info *entry = NULL; + Form_pg_type __type; + char type_name[NAMEDATALEN+1]; + Oid type_namespace; int i; htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(tcache->type_id)); if (!HeapTupleIsValid(htup)) elog(ERROR, "cache lookup failed for type %u", tcache->type_id); - type_form = (Form_pg_type) GETSTRUCT(htup); - type_name = NameStr(type_form->typname); - - for (i=0; i < lengthof(devtype_catalog); i++) + __type = (Form_pg_type) GETSTRUCT(htup); + strcpy(type_name, NameStr(__type->typname)); + type_namespace = __type->typnamespace; + ReleaseSysCache(htup); + /* built-in types must be in pg_catalog */ + if (!ext_name && type_namespace != PG_CATALOG_NAMESPACE) + return NULL; + for (i=0; 
devtype_catalog[i].type_name != NULL; i++) { const char *__ext_name = devtype_catalog[i].type_extension; const char *__type_name = devtype_catalog[i].type_name; - if (ext_name) - { - if (!__ext_name || strcmp(ext_name, __ext_name) != 0) - continue; - } - else - { - if (__ext_name || type_form->typnamespace != PG_CATALOG_NAMESPACE) - continue; - } - - if (strcmp(type_name, __type_name) == 0) + if ((ext_name + ? (__ext_name && strcmp(ext_name, __ext_name) == 0) + : (__ext_name == NULL)) && + strcmp(type_name, __type_name) == 0) { - MemoryContext oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + MemoryContext oldcxt; + Oid __type_alias = devtype_catalog[i].type_alias; - entry = palloc0(offsetof(devtype_info, comp_subtypes[0])); + /* check feasibility of type alias */ + if (OidIsValid(__type_alias)) + { + char castmethod; + + htup = SearchSysCache2(CASTSOURCETARGET, + ObjectIdGetDatum(tcache->type_id), + ObjectIdGetDatum(__type_alias)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "binary type cast %s to %s is not defined", + format_type_be(tcache->type_id), + format_type_be(__type_alias)); + castmethod = ((Form_pg_cast)GETSTRUCT(htup))->castmethod; + if (castmethod != COERCION_METHOD_BINARY) + elog(ERROR, "type cast %s to %s is not binary compatible (%c)", + format_type_be(tcache->type_id), + format_type_be(__type_alias), castmethod); + ReleaseSysCache(htup); + /* use type name of the alias */ + htup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(__type_alias)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for type %u", __type_alias); + __type = (Form_pg_type) GETSTRUCT(htup); + strcpy(type_name, NameStr(__type->typname)); + ReleaseSysCache(htup); + } + oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + dtype = palloc0(offsetof(devtype_info, comp_subtypes[0])); if (ext_name) - entry->type_extension = pstrdup(ext_name); - entry->type_oid = tcache->type_id; - entry->type_flags = devtype_catalog[i].type_flags; - entry->type_length = tcache->typlen; - entry->type_align = typealign_get_width(tcache->typalign); - entry->type_byval = tcache->typbyval; - entry->type_name = devtype_catalog[i].type_name; /* const */ - entry->extra_sz = devtype_catalog[i].extra_sz; - entry->hash_func = devtype_catalog[i].hash_func; + dtype->type_extension = pstrdup(ext_name); + dtype->type_code = devtype_catalog[i].type_code; + dtype->type_oid = tcache->type_id; + dtype->type_flags = devtype_catalog[i].type_flags; + dtype->type_length = tcache->typlen; + dtype->type_align = typealign_get_width(tcache->typalign); + dtype->type_byval = tcache->typbyval; + dtype->type_name = pstrdup(type_name); + dtype->type_extension = (ext_name ? 
pstrdup(ext_name) : NULL); + dtype->type_sizeof = devtype_catalog[i].type_sizeof; + dtype->type_alignof = devtype_catalog[i].type_alignof; + dtype->type_hashfunc = devtype_catalog[i].type_hashfunc; /* type equality functions */ - entry->type_eqfunc = get_opcode(tcache->eq_opr); - entry->type_cmpfunc = tcache->cmp_proc; - + dtype->type_eqfunc = get_opcode(tcache->eq_opr); + dtype->type_cmpfunc = tcache->cmp_proc; MemoryContextSwitchTo(oldcxt); - break; + + return dtype; } } - if (!entry && pgstrom_num_users_extra > 0) - entry = build_extra_devtype_info(tcache, ext_name); - ReleaseSysCache(htup); - - return entry; + return NULL; /* not found */ } static devtype_info * -build_array_devtype_info(TypeCacheEntry *tcache, const char *ext_name) +build_composite_devtype_info(TypeCacheEntry *tcache, const char *ext_name) { - devtype_info *element; - devtype_info *entry; - Oid typelem; + TupleDesc tupdesc = lookup_rowtype_tupdesc(tcache->type_id, -1); + devtype_info **subtypes = alloca(sizeof(devtype_info *) * tupdesc->natts); + devtype_info *dtype; MemoryContext oldcxt; + uint32_t extra_flags = DEVKIND__ANY; + int j; - typelem = get_element_type(tcache->type_id); - Assert(OidIsValid(typelem) && tcache->typlen == -1); - element = pgstrom_devtype_lookup(typelem); - if (!element) - return NULL; + for (j=0; j < tupdesc->natts; j++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, j); + + dtype = pgstrom_devtype_lookup(attr->atttypid); + if (!dtype) + { + ReleaseTupleDesc(tupdesc); + return NULL; + } + extra_flags &= dtype->type_flags; + subtypes[j] = dtype; + } + ReleaseTupleDesc(tupdesc); oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - entry = palloc0(offsetof(devtype_info, comp_subtypes[0])); + dtype = palloc0(offsetof(devtype_info, + comp_subtypes[tupdesc->natts])); if (ext_name) - entry->type_extension = pstrdup(ext_name); - entry->type_oid = tcache->type_id; - entry->type_flags = element->type_flags; - entry->type_length = tcache->typlen; - entry->type_align = typealign_get_width(tcache->typalign); - entry->type_byval = tcache->typbyval; - entry->type_name = "array"; - entry->extra_sz = sizeof(pg_array_t); - entry->hash_func = generic_devtype_hashfunc; - entry->type_element = element; + dtype->type_extension = pstrdup(ext_name); + dtype->type_code = TypeOpCode__composite; + dtype->type_oid = tcache->type_id; + dtype->type_flags = extra_flags | DEVTYPE__USE_KVARS_SLOTBUF; + dtype->type_length = tcache->typlen; + dtype->type_align = typealign_get_width(tcache->typalign); + dtype->type_byval = tcache->typbyval; + dtype->type_name = "composite"; + dtype->type_sizeof = sizeof(xpu_composite_t); + dtype->type_alignof = __alignof__(xpu_composite_t); + dtype->type_hashfunc = NULL; //devtype_composite_hash; + dtype->type_eqfunc = get_opcode(tcache->eq_opr); + dtype->type_cmpfunc = tcache->cmp_proc; + dtype->comp_nfields = tupdesc->natts; + memcpy(dtype->comp_subtypes, subtypes, + sizeof(devtype_info *) * tupdesc->natts); MemoryContextSwitchTo(oldcxt); - return entry; + return dtype; } static devtype_info * -build_composite_devtype_info(TypeCacheEntry *tcache, const char *ext_name) +build_array_devtype_info(TypeCacheEntry *tcache, const char *ext_name) { - Oid type_relid = tcache->typrelid; - int j, nfields = get_relnatts(type_relid); - devtype_info **subtypes = alloca(sizeof(devtype_info *) * nfields); - devtype_info *entry; - cl_uint extra_flags = 0; - size_t extra_sz; + devtype_info *elem; + devtype_info *dtype; MemoryContext oldcxt; - extra_sz = (MAXALIGN(sizeof(Datum) * nfields) + - 
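
A note on the flag arithmetic in build_composite_devtype_info() above: extra_flags starts at DEVKIND__ANY (all device classes) and each member's type_flags is AND-ed in, so the composite is marked executable only on devices that support every one of its members. Here is a standalone miniature of that intersection; the two DEVKIND bits and the member list are invented for the example.

    #include <stdio.h>
    #include <stdint.h>

    /* hypothetical device-class bits, for illustration only */
    #define DEVKIND__GPU    0x0001U
    #define DEVKIND__DPU    0x0002U
    #define DEVKIND__ANY    (DEVKIND__GPU | DEVKIND__DPU)

    int main(void)
    {
        /* flags of three imaginary composite members */
        uint32_t member_flags[] = {
            DEVKIND__ANY,   /* e.g. int4: runs anywhere */
            DEVKIND__ANY,   /* e.g. text: runs anywhere */
            DEVKIND__GPU,   /* a GPU-only member type   */
        };
        uint32_t extra_flags = DEVKIND__ANY;

        for (int j = 0; j < 3; j++)
            extra_flags &= member_flags[j];   /* same AND-accumulation */

        printf("composite executable on: 0x%04x\n", extra_flags); /* GPU only */
        return 0;
    }

The array case that follows is the one-member version of the same rule: build_array_devtype_info() simply inherits the element's type_flags (plus DEVTYPE__USE_KVARS_SLOTBUF).
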
MAXALIGN(sizeof(bool) * nfields)); - for (j=0; j < nfields; j++) - { - HeapTuple tup; - Oid atttypid; - devtype_info *dtype; - - tup = SearchSysCache2(ATTNUM, - ObjectIdGetDatum(type_relid), - Int16GetDatum(j+1)); - if (!HeapTupleIsValid(tup)) - return NULL; - atttypid = ((Form_pg_attribute) GETSTRUCT(tup))->atttypid; - ReleaseSysCache(tup); - - dtype = pgstrom_devtype_lookup(atttypid); - if (!dtype) - return NULL; - subtypes[j] = dtype; - - extra_flags |= dtype->type_flags; - extra_sz += MAXALIGN(dtype->extra_sz); - } + elem = pgstrom_devtype_lookup(tcache->typelem); + if (!elem) + return NULL; oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - entry = palloc0(offsetof(devtype_info, comp_subtypes[nfields])); + dtype = palloc0(offsetof(devtype_info, comp_subtypes[0])); if (ext_name) - entry->type_extension = pstrdup(ext_name); - entry->type_oid = tcache->type_id; - entry->type_flags = extra_flags; - entry->type_length = tcache->typlen; - entry->type_align = typealign_get_width(tcache->typalign); - entry->type_byval = tcache->typbyval; - entry->type_name = "composite"; - entry->extra_sz = extra_sz; - entry->comp_nfields = nfields; - memcpy(entry->comp_subtypes, subtypes, - sizeof(devtype_info *) * nfields); + dtype->type_extension = pstrdup(ext_name); + dtype->type_code = TypeOpCode__array; + dtype->type_oid = tcache->type_id; + dtype->type_flags = elem->type_flags | DEVTYPE__USE_KVARS_SLOTBUF; + dtype->type_length = tcache->typlen; + dtype->type_align = typealign_get_width(tcache->typalign); + dtype->type_byval = tcache->typbyval; + dtype->type_name = "array"; + dtype->type_sizeof = sizeof(xpu_array_t); + dtype->type_alignof = __alignof__(xpu_array_t); + dtype->type_hashfunc = NULL; //devtype_array_hash; + /* type equality functions */ + dtype->type_eqfunc = get_opcode(tcache->eq_opr); + dtype->type_cmpfunc = tcache->cmp_proc; + MemoryContextSwitchTo(oldcxt); - return entry; + return dtype; } devtype_info * pgstrom_devtype_lookup(Oid type_oid) { - TypeCacheEntry *tcache; devtype_info *dtype; - uint32 hashvalue; - uint32 hindex; - size_t sz; - dlist_iter iter; + Datum hash; + uint32_t index; + ListCell *lc; const char *ext_name; + TypeCacheEntry *tcache; - /* lookup dtype that is already built */ - hashvalue = GetSysCacheHashValue(TYPEOID, ObjectIdGetDatum(type_oid), 0, 0, 0); - hindex = hashvalue % lengthof(devtype_info_slot); - dlist_foreach(iter, &devtype_info_slot[hindex]) + hash = hash_any((unsigned char *)&type_oid, sizeof(Oid)); + index = hash % DEVTYPE_INFO_NSLOTS; + foreach (lc, devtype_info_slot[index]) { - dtype = dlist_container(devtype_info, chain, iter.cur); + dtype = lfirst(lc); if (dtype->type_oid == type_oid) { - if (dtype->type_is_negative) - return NULL; - return dtype; + Assert(dtype->hash == hash); + goto found; } } /* try to build devtype_info entry */ @@ -535,4395 +255,2627 @@ pgstrom_devtype_lookup(Oid type_oid) tcache = lookup_type_cache(type_oid, TYPECACHE_EQ_OPR | TYPECACHE_CMP_PROC); - if (OidIsValid(tcache->typrelid)) + /* if domain, move to the base type */ + while (tcache->nextDomain) + tcache = tcache->nextDomain; + + if (OidIsValid(tcache->typelem) && tcache->typlen == -1) + { + /* array type */ + dtype = build_array_devtype_info(tcache, ext_name); + } + else if (tcache->typtype == TYPTYPE_COMPOSITE) { /* composite type */ + if (!OidIsValid(tcache->typrelid)) + elog(ERROR, "Bug? 
wrong composite definition at %s", + format_type_be(type_oid)); dtype = build_composite_devtype_info(tcache, ext_name); } - else if (OidIsValid(tcache->typelem) && tcache->typlen == -1) + else if (tcache->typtype == TYPTYPE_BASE || + tcache->typtype == TYPTYPE_RANGE) { - /* array type */ - dtype = build_array_devtype_info(tcache, ext_name); + /* base or range type */ + dtype = build_basic_devtype_info(tcache, ext_name); } else { - /* base or extra type */ - dtype = build_basic_devtype_info(tcache, ext_name); + /* not a supported type */ + dtype = NULL; } - - /* makes a negative entry, if not in the catalog */ + + /* make a negative entry, if not device executable */ if (!dtype) { - sz = offsetof(devtype_info, comp_subtypes[0]); - dtype = MemoryContextAllocZero(devinfo_memcxt, sz); - dtype->type_oid = type_oid; + dtype = MemoryContextAllocZero(devinfo_memcxt, + sizeof(devtype_info)); dtype->type_is_negative = true; } - dtype->hashvalue = hashvalue; - dlist_push_head(&devtype_info_slot[hindex], &dtype->chain); - + dtype->type_oid = type_oid; + dtype->hash = hash; + devtype_info_slot[index] = lappend_cxt(devinfo_memcxt, + devtype_info_slot[index], dtype); + if (!dtype->type_is_negative) + { + hash = hash_any((unsigned char *)&dtype->type_code, sizeof(TypeOpCode)); + index = hash % DEVTYPE_INFO_NSLOTS; + devtype_code_slot[index] = lappend_cxt(devinfo_memcxt, + devtype_code_slot[index], dtype); + } +found: if (dtype->type_is_negative) return NULL; return dtype; } -devtype_info * -pgstrom_devtype_lookup_and_track(Oid type_oid, codegen_context *context) -{ - devtype_info *dtype = pgstrom_devtype_lookup(type_oid); - - if (dtype) - context->extra_flags |= dtype->type_flags; - - return dtype; -} - +/* + * devtype_lookup_by_opcode + */ static devtype_info * -pgstrom_devtype_lookup_by_name(const char *type_ident) +devtype_lookup_by_opcode(TypeOpCode type_code) { - char *type_name = NULL; - char *ext_name = NULL; - const char *__ext_name; - Oid type_oid = InvalidOid; - Relation rel; - ScanKeyData skey; - SysScanDesc sscan; - HeapTuple htup; - - type_name = alloca(strlen(type_ident) + 1); - strcpy(type_name, type_ident); - ext_name = strchr(type_name, '@'); - if (ext_name) - *ext_name++ = '\0'; - - htup = SearchSysCache2(TYPENAMENSP, - CStringGetDatum(type_name), - ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); - if (HeapTupleIsValid(htup)) - { - type_oid = PgTypeTupleGetOid(htup); - __ext_name = get_extension_name_by_object(TypeRelationId, type_oid); - if (ext_name) - { - if (!__ext_name || strcmp(ext_name, __ext_name) != 0) - type_oid = InvalidOid; - } - else if (__ext_name != NULL) - type_oid = InvalidOid; - ReleaseSysCache(htup); - } + Datum hash; + uint32_t index; + ListCell *lc; - if (!OidIsValid(type_oid)) + hash = hash_any((unsigned char *)&type_code, sizeof(TypeOpCode)); + index = hash % DEVTYPE_INFO_NSLOTS; + foreach (lc, devtype_code_slot[index]) { - rel = table_open(TypeRelationId, AccessShareLock); - - ScanKeyInit(&skey, - Anum_pg_type_typname, - BTEqualStrategyNumber, F_NAMEEQ, - CStringGetDatum(type_name)); - sscan = systable_beginscan(rel, TypeNameNspIndexId, - true, NULL, 1, &skey); - do { - htup = systable_getnext(sscan); - if (!HeapTupleIsValid(htup)) - break; - type_oid = PgTypeTupleGetOid(htup); - __ext_name = get_extension_name_by_object(TypeRelationId, type_oid); - if (ext_name) - { - if (!__ext_name || strcmp(ext_name, __ext_name) != 0) - type_oid = InvalidOid; - } - else if (__ext_name != NULL) - type_oid = InvalidOid; - } while (!OidIsValid(type_oid)); + devtype_info *dtype = 
lfirst(lc); - systable_endscan(sscan); - table_close(rel, AccessShareLock); + if (dtype->type_code == type_code) + return dtype; } - - if (OidIsValid(type_oid)) - return pgstrom_devtype_lookup(type_oid); return NULL; } -/* code for extra device types */ -size_t -pgstrom_codegen_extra_devtypes(char *buf, size_t bufsz, uint32 extra_flags) +/* + * Built-in device type hash functions + */ +static uint32_t +devtype_bool_hash(bool isnull, Datum value) { - size_t off = 0; - int i; - - /* only extra device types */ - extra_flags &= DEVKERNEL_USERS_EXTRA_MASK; - - for (i=0; i < pgstrom_num_users_extra; i++) - { - pgstromUsersExtraDescriptor *ex_desc = &pgstrom_users_extra_desc[i]; - - if ((ex_desc->extra_flags & extra_flags) == ex_desc->extra_flags) - { - off += snprintf(buf + off, bufsz - off, - "#include \"%s.h\"\n", - ex_desc->extra_name); - } - } - /* array type support */ - off += snprintf( - buf + off, bufsz - off, - "\n" - "DEVICE_FUNCTION(cl_uint)\n" - "pg_extras_array_from_arrow(kern_context *kcxt,\n" - " char *dest,\n" - " kern_colmeta *smeta,\n" - " char *base,\n" - " cl_uint start,\n" - " cl_uint end)\n" - "{\n"); - if (pgstrom_num_users_extra > 0) - { - off += snprintf( - buf + off, bufsz - off, - " switch (smeta->atttypid)\n" - " {\n"); - for (i=0; i < lengthof(devtype_info_slot); i++) - { - dlist_iter iter; - devtype_info *dtype; - - dlist_foreach(iter, &devtype_info_slot[i]) - { - dtype = dlist_container(devtype_info, chain, iter.cur); - if ((dtype->type_flags & extra_flags) == 0) - continue; - off += snprintf( - buf + off, bufsz - off, - " case %u:\n" - " return pg_%s_array_from_arrow(kcxt, dest,\n" - " smeta, base,\n" - " start, end);\n", - dtype->type_oid, - dtype->type_name); - } - } - off += snprintf( - buf + off, bufsz - off, - " default:\n" - " break;\n" - " }\n"); - } - off += snprintf( - buf + off, bufsz - off, - " return 0;\n" - "}\n"); - - /* composite type support */ - off += snprintf( - buf + off, bufsz - off, - "\n" - "DEVICE_FUNCTION(cl_bool)\n" - "pg_extras_composite_from_arrow(kern_context *kcxt,\n" - " kern_colmeta *smeta,\n" - " char *base,\n" - " cl_uint rowidx,\n" - " cl_char *p_dclass,\n" - " Datum *p_datum)\n" - "{\n"); - - if (pgstrom_num_users_extra > 0) - { - off += snprintf( - buf + off, bufsz - off, - " switch (smeta->atttypid)\n" - " {\n"); - for (i=0; i < lengthof(devtype_info_slot); i++) - { - dlist_iter iter; - devtype_info *dtype; - - dlist_foreach(iter, &devtype_info_slot[i]) - { - dtype = dlist_container(devtype_info, chain, iter.cur); - if ((dtype->type_flags & extra_flags) == 0) - continue; + bool bval; - off += snprintf( - buf + off, bufsz - off, - " case %u: {\n" - " pg_%s_t temp;\n" - " pg_datum_fetch_arrow(kcxt, temp, smeta, base, rowidx);\n" - " pg_datum_store(kcxt, temp, p_dclass, p_datum);\n" - " return true;\n" - " }\n", - dtype->type_oid, - dtype->type_name); - } - } - off += snprintf( - buf + off, bufsz - off, - " default:\n" - " break;\n" - " }\n"); - } - off += snprintf( - buf + off, bufsz - off, - " return false;\n" - "}\n"); - return off; + if (isnull) + return 0; + bval = DatumGetBool(value) ? true : false; + return hash_any((unsigned char *)&bval, sizeof(bool)); } -/* - * Device type specific hash-functions - * - * Some device types have internal representation, like numeric, which shall - * be used to GpuHashJoin for join-key hashing. 
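
pgstrom_devtype_lookup() and devtype_lookup_by_opcode() above form a two-index cache over the same entries: fixed-size bucket arrays keyed by hash_any() of the type OID, plus a second index by TypeOpCode for positive entries only, with negative entries memoized so an unsupported type never triggers a second catalog walk. Below is a self-contained sketch of the negative-cache half; the toy hash and malloc'ed chains stand in for hash_any() and PostgreSQL Lists, and every name is invented.

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define NSLOTS 128

    typedef struct entry {
        uint32_t      key;          /* e.g. a type OID */
        bool          is_negative;  /* cached "not device-executable" */
        struct entry *next;         /* bucket chain, like the List cells */
    } entry;

    static entry *slot[NSLOTS];

    static uint32_t toy_hash(uint32_t key)   /* stand-in for hash_any() */
    {
        return key * 2654435761u;
    }

    /* NULL for unsupported keys; the verdict is cached either way */
    static entry *lookup(uint32_t key, bool (*build)(uint32_t))
    {
        uint32_t index = toy_hash(key) % NSLOTS;
        entry   *e;

        for (e = slot[index]; e != NULL; e = e->next)
            if (e->key == key)
                goto found;
        e = calloc(1, sizeof(entry));
        e->key = key;
        e->is_negative = !build(key);   /* negative entry if build fails */
        e->next = slot[index];
        slot[index] = e;
    found:
        return e->is_negative ? NULL : e;
    }

    static bool build_even_only(uint32_t key) { return (key % 2) == 0; }

    int main(void)
    {
        printf("%s\n", lookup(16, build_even_only) ? "hit" : "negative");
        printf("%s\n", lookup(23, build_even_only) ? "hit" : "negative");
        printf("%s\n", lookup(23, build_even_only) ? "hit" : "negative"); /* cached */
        return 0;
    }

Linking only positive entries into the by-opcode index mirrors the code above: a negative entry has no valid type_code to index under.
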
- */ -static cl_uint -generic_devtype_hashfunc(devtype_info *dtype, Datum datum) +static inline uint32_t +__devtype_simple_hash(bool isnull, Datum value, int sz) { - if (dtype->type_byval) - return hash_any((unsigned char *)&datum, dtype->type_length); - if (dtype->type_length > 0) - return hash_any((unsigned char *)DatumGetPointer(datum), - dtype->type_length); - Assert(dtype->type_length == -1); - return hash_any((cl_uchar *)VARDATA_ANY(datum), - VARSIZE_ANY_EXHDR(datum)); + if (isnull) + return 0; + return hash_any((unsigned char *)&value, sz); } -static cl_uint -pg_int1_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_int1_hash(bool isnull, Datum value) { - cl_int ival = DatumGetChar(datum); - - return hash_any((cl_uchar *)&ival, sizeof(cl_char)); + return __devtype_simple_hash(isnull, value, sizeof(int8_t)); } -static cl_uint -pg_int2_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_int2_hash(bool isnull, Datum value) { - cl_int ival = DatumGetInt16(datum); - - return hash_any((cl_uchar *)&ival, sizeof(cl_short)); + return __devtype_simple_hash(isnull, value, sizeof(int16_t)); } -static cl_uint -pg_int4_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_int4_hash(bool isnull, Datum value) { - cl_int ival = DatumGetInt32(datum); + return __devtype_simple_hash(isnull, value, sizeof(int32_t)); +} - return hash_any((cl_uchar *)&ival, sizeof(cl_int)); +static uint32_t +devtype_int8_hash(bool isnull, Datum value) +{ + return __devtype_simple_hash(isnull, value, sizeof(int64_t)); } -static cl_uint -pg_int8_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_float2_hash(bool isnull, Datum value) { - cl_long ival = DatumGetInt64(datum); - cl_uint lo = (ival & 0xffffffffL); - cl_uint hi = (ival >> 32); + return __devtype_simple_hash(isnull, value, sizeof(float2_t)); +} - lo ^= (ival >= 0 ? 
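
The fixed-width hashers in this block all collapse into __devtype_simple_hash(): NULL hashes to 0, and otherwise the first sz bytes of the pass-by-value Datum are fed to the hash. A runnable miniature of that contract follows, with FNV-1a standing in for hash_any() and Datum reduced to uintptr_t.

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef uintptr_t Datum;    /* pass-by-value container, as in PostgreSQL */

    /* FNV-1a, standing in for PostgreSQL's hash_any() */
    static uint32_t hash_bytes(const unsigned char *p, int sz)
    {
        uint32_t h = 2166136261u;

        for (int i = 0; i < sz; i++)
            h = (h ^ p[i]) * 16777619u;
        return h;
    }

    /* the contract: NULL -> 0, else hash the first sz bytes of the datum */
    static uint32_t simple_hash(bool isnull, Datum value, int sz)
    {
        if (isnull)
            return 0;
        return hash_bytes((const unsigned char *)&value, sz);
    }

    int main(void)
    {
        printf("%u\n", simple_hash(true,  (Datum)42, sizeof(int32_t)));  /* 0 */
        printf("%u\n", simple_hash(false, (Datum)42, sizeof(int32_t)));
        printf("%u\n", simple_hash(false, (Datum)42, sizeof(int64_t))); /* differs */
        return 0;
    }

Reading sizeof(int32_t) versus sizeof(int64_t) bytes of the same datum gives different hashes, which is why each SQL type passes its own width; the byte-level view also assumes CPU and device agree on byte order, which holds on the little-endian platforms targeted here.
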
hi : ~hi); +static uint32_t +devtype_float4_hash(bool isnull, Datum value) +{ + return __devtype_simple_hash(isnull, value, sizeof(float4_t)); +} - return hash_any((cl_uchar *)&lo, sizeof(cl_int)); +static uint32_t +devtype_float8_hash(bool isnull, Datum value) +{ + return __devtype_simple_hash(isnull, value, sizeof(float8_t)); } -extern Datum pgstrom_float2_to_float8(PG_FUNCTION_ARGS); -static cl_uint -pg_float2_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_numeric_hash(bool isnull, Datum value) { - Datum v = DirectFunctionCall1(pgstrom_float2_to_float8, datum); - cl_double fval = DatumGetFloat8(v); + uint32_t len; - if (fval == 0.0) + if (isnull) return 0; - return hash_any((cl_uchar *)&fval, sizeof(cl_double)); + len = VARSIZE_ANY_EXHDR(value); + if (len >= sizeof(uint16_t)) + { + NumericChoice *nc = (NumericChoice *)VARDATA_ANY(value); + NumericDigit *digits = NUMERIC_DIGITS(nc, nc->n_header); + int weight = NUMERIC_WEIGHT(nc, nc->n_header) + 1; + int i, ndigits = NUMERIC_NDIGITS(nc->n_header, len); + int128_t value = 0; + + for (i=0; i < ndigits; i++) + { + NumericDigit dig = digits[i]; + + value = value * PG_NBASE + dig; + if (value < 0) + elog(ERROR, "numeric value is out of range"); + } + if (NUMERIC_SIGN(nc->n_header) == NUMERIC_NEG) + value = -value; + weight = PG_DEC_DIGITS * (ndigits - weight); + /* see, set_normalized_numeric */ + if (value == 0) + weight = 0; + else + { + while (value % 10 == 0) + { + value /= 10; + weight--; + } + } + return (hash_any((unsigned char *)&weight, sizeof(int16_t)) ^ + hash_any((unsigned char *)&value, sizeof(int128_t))); + } + elog(ERROR, "corrupted numeric header"); } -static cl_uint -pg_float4_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_bytea_hash(bool isnull, Datum value) { - cl_double fval = DatumGetFloat4(datum); - - if (fval == 0.0) + if (isnull) return 0; - return hash_any((cl_uchar *)&fval, sizeof(cl_double)); + return hash_any((unsigned char *)VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value)); } -static cl_uint -pg_float8_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_text_hash(bool isnull, Datum value) { - cl_double fval = DatumGetFloat8(datum); - - if (fval == 0.0) + if (isnull) return 0; - return hash_any((cl_uchar *)&fval, sizeof(cl_double)); + return hash_any((unsigned char *)VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value)); } -static cl_uint -pg_numeric_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_bpchar_hash(bool isnull, Datum value) { - kern_context dummy; - pg_numeric_t temp; + if (!isnull) + { + char *s = VARDATA_ANY(value); + int sz = VARSIZE_ANY_EXHDR(value); - memset(&dummy, 0, sizeof(dummy)); - /* - * MEMO: If NUMERIC value is out of range, we may not be able to - * execute GpuJoin in the kernel space for all the outer chunks. - * Is it still valuable to run on GPU kernel? 
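
devtype_numeric_hash() above normalizes before hashing so that equal numeric values hash equally regardless of their stored scale: the base-10000 digits are folded into an int128 value, trailing decimal zeros are stripped while the weight is adjusted, and zero is forced to weight 0. Below is a tiny standalone model of that normalization step; it uses int64 and a simplified decimal = value * 10^weight convention, so the weight bookkeeping runs in the opposite direction from the code above.

    #include <stdio.h>
    #include <stdint.h>

    /* model: decimal = value * 10^weight; strip trailing zeros so one
     * mathematical value has exactly one (value, weight) representation */
    static void normalize(int64_t *value, int *weight)
    {
        if (*value == 0)
            *weight = 0;        /* every zero collapses to (0, 0) */
        else
            while (*value % 10 == 0)
            {
                *value /= 10;
                (*weight)++;
            }
    }

    int main(void)
    {
        int64_t v1 = 1234500;  int w1 = -4;   /* 123.4500 */
        int64_t v2 = 12345;    int w2 = -2;   /* 123.45   */

        normalize(&v1, &w1);
        normalize(&v2, &w2);
        printf("%lld e%d == %lld e%d\n",
               (long long)v1, w1, (long long)v2, w2);   /* 12345 e-2 twice */
        return 0;
    }

After normalization both 123.4500 and 123.45 are (12345, -2), so hashing the (weight, value) pair, as the code above does by XOR-ing the two component hashes, yields one hash per mathematical value.
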
- */ - temp = pg_numeric_from_varlena(&dummy, (struct varlena *) - DatumGetPointer(datum)); - if (dummy.errcode != ERRCODE_STROM_SUCCESS) - elog(ERROR, "failed on hash calculation of device numeric: %s", - DatumGetCString(DirectFunctionCall1(numeric_out, datum))); - - return hash_any((cl_uchar *)&temp.value, - offsetof(pg_numeric_t, weight) + sizeof(cl_short)); + sz = bpchartruelen(s, sz); + return hash_any((unsigned char *)s, sz); + } + return 0; } -static cl_uint -pg_interval_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_date_hash(bool isnull, Datum value) { - Interval *interval = DatumGetIntervalP(datum); - cl_long frac; - cl_long days; - - frac = interval->time % USECS_PER_DAY; - days = (interval->time / USECS_PER_DAY + - interval->month * 30L + - interval->day); - days ^= frac; - - return hash_any((cl_uchar *)&days, sizeof(cl_long)); + return __devtype_simple_hash(isnull, value, sizeof(DateADT)); } -static cl_uint -pg_bpchar_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_time_hash(bool isnull, Datum value) { - char *s = VARDATA_ANY(datum); - int i, len = VARSIZE_ANY_EXHDR(datum); - - Assert(dtype->type_oid == BPCHAROID); - /* - * whitespace is the tail end of CHAR(n) data shall be ignored - * when we calculate hash-value, to match same text exactly. - */ - for (i = len - 1; i >= 0 && s[i] == ' '; i--) - ; - return hash_any((unsigned char *)VARDATA_ANY(datum), i+1); + return __devtype_simple_hash(isnull, value, sizeof(TimeADT)); } -static cl_uint -pg_inet_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_timetz_hash(bool isnull, Datum value) { - inet_struct *is = (inet_struct *) VARDATA_ANY(datum); + if (!isnull) + { + TimeTzADT *tmtz = DatumGetTimeTzADTP(value); - Assert(dtype->type_oid == INETOID || - dtype->type_oid == CIDROID); - if (is->family == PGSQL_AF_INET) - return hash_any((cl_uchar *)is, offsetof(inet_struct, ipaddr[4])); - else if (is->family == PGSQL_AF_INET6) - return hash_any((cl_uchar *)is, offsetof(inet_struct, ipaddr[16])); + return (hash_any((unsigned char *)&tmtz->time, sizeof(TimeADT)) ^ + hash_any((unsigned char *)&tmtz->zone, sizeof(int32_t))); + } + return 0; +} - elog(ERROR, "unexpected address family: %d", is->family); - return ~0U; +static uint32_t +devtype_timestamp_hash(bool isnull, Datum value) +{ + return __devtype_simple_hash(isnull, value, sizeof(Timestamp)); } -static cl_uint -__jsonb_devtype_hashfunc(devtype_info *dtype, JsonbContainer *jc) +static uint32_t +devtype_timestamptz_hash(bool isnull, Datum value) { - cl_uint hash = 0; - cl_uint j, nitems = JsonContainerSize(jc); - char *base = NULL; - char *data; - cl_uint datalen; + return __devtype_simple_hash(isnull, value, sizeof(TimestampTz)); +} - if (!JsonContainerIsScalar(jc)) +static uint32_t +devtype_interval_hash(bool isnull, Datum value) +{ + if (!isnull) { - if (JsonContainerIsObject(jc)) - { - base = (char *)(jc->children + 2 * nitems); - hash ^= JB_FOBJECT; - } - else - { - base = (char *)(jc->children + nitems); - hash ^= JB_FARRAY; - } + Interval *iv = DatumGetIntervalP(value); + + return hash_any((unsigned char *)iv, sizeof(Interval)); } + return 0; +} + +static uint32_t +devtype_money_hash(bool isnull, Datum value) +{ + return __devtype_simple_hash(isnull, value, sizeof(int64_t)); +} - for (j=0; j < nitems; j++) +static uint32_t +devtype_uuid_hash(bool isnull, Datum value) +{ + if (!isnull) { - cl_uint index = j; - cl_uint temp; - JEntry entry; + pg_uuid_t *uuid = DatumGetUUIDP(value); - /* hash 
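
devtype_bpchar_hash() above trims CHAR(n) pad blanks with bpchartruelen() before hashing, so two values that bpchar equality treats as identical also land on the same hash. The same trim-then-hash idea as a standalone program, with djb2 standing in for hash_any():

    #include <stdio.h>
    #include <string.h>

    /* like bpchartruelen(): byte length with trailing blanks ignored */
    static int true_len(const char *s, int len)
    {
        while (len > 0 && s[len - 1] == ' ')
            len--;
        return len;
    }

    /* djb2, standing in for hash_any() */
    static unsigned long hash_bytes(const char *s, int len)
    {
        unsigned long h = 5381;

        for (int i = 0; i < len; i++)
            h = h * 33 + (unsigned char)s[i];
        return h;
    }

    int main(void)
    {
        const char *a = "ab";       /* what CHAR(5) semantics consider equal */
        const char *b = "ab   ";

        printf("%lu\n", hash_bytes(a, true_len(a, (int)strlen(a))));
        printf("%lu\n", hash_bytes(b, true_len(b, (int)strlen(b))));  /* same */
        return 0;
    }

Without the trim, 'ab' and 'ab   ' would compare equal yet fall into different hash buckets, breaking hash-based joins on bpchar keys.
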
value for key */ - if (JsonContainerIsObject(jc)) - { - entry = jc->children[index]; - if (!JBE_ISSTRING(entry)) - elog(ERROR, "jsonb key value is not STRING"); - data = base + getJsonbOffset(jc, index); - datalen = getJsonbLength(jc, index); - temp = hash_any((cl_uchar *)data, datalen); - hash = ((hash << 1) | (hash >> 31)) ^ temp; - - index += nitems; - } - /* hash value for element */ - entry = jc->children[index]; - if (JBE_ISNULL(entry)) - temp = 0x01; - else if (JBE_ISSTRING(entry)) - { - data = base + getJsonbOffset(jc, index); - datalen = getJsonbLength(jc, index); - temp = hash_any((cl_uchar *)data, datalen); - } - else if (JBE_ISNUMERIC(entry)) - { - data = base + INTALIGN(getJsonbOffset(jc, index)); - temp = pg_numeric_devtype_hashfunc(NULL, PointerGetDatum(data)); - } - else if (JBE_ISBOOL_TRUE(entry)) - temp = 0x02; - else if (JBE_ISBOOL_FALSE(entry)) - temp = 0x04; - else if (JBE_ISCONTAINER(entry)) - { - data = base + INTALIGN(getJsonbOffset(jc, index)); - temp = __jsonb_devtype_hashfunc(dtype, (JsonbContainer *)data); - } - else - elog(ERROR, "Unexpected jsonb entry (%08x)", entry); - hash = ((hash << 1) | (hash >> 31)) ^ temp; + return hash_any(uuid->data, UUID_LEN); } - return hash; + return 0; } -static cl_uint -pg_jsonb_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_macaddr_hash(bool isnull, Datum value) { - JsonbContainer *jc = (JsonbContainer *) VARDATA_ANY(datum); + if (!isnull) + { + macaddr *maddr = DatumGetMacaddrP(value); - return __jsonb_devtype_hashfunc(dtype, jc); + return hash_any((unsigned char *)maddr, sizeof(macaddr)); + } + return 0; } -static cl_uint -pg_range_devtype_hashfunc(devtype_info *dtype, Datum datum) +static uint32_t +devtype_inet_hash(bool isnull, Datum value) { - RangeType *r = DatumGetRangeTypeP(datum); - cl_uchar flags = *((char *)r + VARSIZE_ANY(r) - 1); - cl_uchar *pos = (cl_uchar *)(r + 1); - struct { - Datum l_val; - Datum u_val; - cl_uchar flags; - } temp; - int32 ival32; - - if (RANGE_HAS_LBOUND(flags)) + if (!isnull) { - switch (RangeTypeGetOid(r)) - { - case INT4RANGEOID: - case DATERANGEOID: - memcpy(&ival32, pos, sizeof(cl_int)); - temp.l_val = (cl_long)ival32; - pos += sizeof(cl_int); - break; - case INT8RANGEOID: - case TSRANGEOID: - case TSTZRANGEOID: - memcpy(&temp.l_val, pos, sizeof(cl_long)); - pos += sizeof(cl_long); - break; - default: - elog(ERROR, "unexpected range type: %s", - format_type_be(RangeTypeGetOid(r))); - } + inet *in = DatumGetInetP(value); + int sz; + + if (in->inet_data.family == PGSQL_AF_INET) + sz = offsetof(inet_struct, ipaddr[4]); + else if (in->inet_data.family == PGSQL_AF_INET6) + sz = offsetof(inet_struct, ipaddr[16]); + else + elog(ERROR, "corrupted inet data"); + return hash_any((unsigned char *)&in->inet_data, sz); } - if (RANGE_HAS_UBOUND(flags)) - { - switch (RangeTypeGetOid(r)) - { - case INT4RANGEOID: - case DATERANGEOID: - memcpy(&ival32, pos, sizeof(cl_int)); - temp.l_val = (cl_long)ival32; - pos += sizeof(cl_int); - break; - case INT8RANGEOID: - case TSRANGEOID: - case TSTZRANGEOID: - memcpy(&temp.l_val, pos, sizeof(cl_long)); - pos += sizeof(cl_long); - break; - default: - elog(ERROR, "unexpected range type: %s", - format_type_be(RangeTypeGetOid(r))); - } - } - temp.flags = flags; - - return hash_any((unsigned char *)&temp, - 2*sizeof(Datum)+sizeof(cl_uchar)); -} - -static cl_uint -pg_geometry_devtype_hashfunc(devtype_info *dtype, Datum datum) -{ - return 0; //TODO -} - -static cl_uint -pg_box2df_devtype_hashfunc(devtype_info *dtype, Datum datum) -{ - return 
0; //TODO + return 0; } /* - * varlena buffer estimation handler + * Built-in device functions/operators */ -static int -vlbuf_estimate_textcat(codegen_context *context, - devfunc_info *dfunc, - Expr **args, int *vl_width) -{ - int i, nargs = list_length(dfunc->func_args); - int maxlen = 0; - - for (i=0; i < nargs; i++) - { - if (vl_width[i] < 0) - __ELog("unable to estimate result size of textcat"); - maxlen += vl_width[i]; - } - /* it consumes varlena buffer on run-time */ - context->extra_bufsz += MAXALIGN(maxlen + VARHDRSZ); - - return maxlen; -} - -static int -vlbuf_estimate_substring(codegen_context *context, - devfunc_info *dfunc, - Expr **args, int *vl_width) -{ - if (list_length(dfunc->func_args) > 2 && - IsA(args[2], Const)) - { - Const *con = (Const *)args[2]; - - Assert(con->consttype == INT4OID); - if (con->constisnull) - return 0; - return Max(DatumGetInt32(con->constvalue), 0); - } - return vl_width[0]; -} - -static int -vlbuf_estimate_jsonb(codegen_context *context, - devfunc_info *dfunc, - Expr **args, int *vl_width) -{ - context->extra_bufsz += MAXALIGN(TOAST_TUPLE_THRESHOLD); - /* - * We usually have no information about jsonb object length preliminary, - * however, plain varlena must be less than the threshold of toasting. - * If user altered storage option of jsonb column to 'main', it may be - * increased to BLCKSZ, but unusual. - */ - return TOAST_TUPLE_THRESHOLD; -} +#define FUNC_OPCODE(SQLNAME,FN_ARGS,FN_FLAGS,DEVNAME,FUNC_COST,EXTENSION) \ + { #SQLNAME, #FN_ARGS, FN_FLAGS, FuncOpCode__##DEVNAME, FUNC_COST, EXTENSION }, +static struct { + const char *func_name; + const char *func_args; + uint32_t func_flags; + FuncOpCode func_code; + int func_cost; + const char *func_extension; +} devfunc_catalog[] = { +#include "xpu_opcodes.h" + {NULL,NULL,0,FuncOpCode__Invalid,0,NULL} +}; -static int -vlbuf_estimate__st_makepoint(codegen_context *context, - devfunc_info *dfunc, - Expr **args, int *vl_width) +static devfunc_info * +pgstrom_devfunc_build(Oid func_oid, int func_nargs, Oid *func_argtypes) { - int nargs = list_length(dfunc->func_args); - - context->extra_bufsz += MAXALIGN(sizeof(double) * 2 * nargs); - - return -1; -} + const char *fextension; + const char *fname; + Oid fnamespace; + Oid frettype; + StringInfoData buf; + devfunc_info *dfunc = NULL; + devtype_info *dtype_rettype; + devtype_info **dtype_argtypes; + MemoryContext oldcxt; + int i, j, sz; + + initStringInfo(&buf); + fname = get_func_name(func_oid); + if (!fname) + elog(ERROR, "cache lookup failed on procedure '%u'", func_oid); + fnamespace = get_func_namespace(func_oid); + frettype = get_func_rettype(func_oid); + dtype_rettype = pgstrom_devtype_lookup(frettype); + if (!dtype_rettype) + goto bailout; + dtype_argtypes = alloca(sizeof(devtype_info *) * func_nargs); + for (j=0; j < func_nargs; j++) + { + dtype_argtypes[j] = pgstrom_devtype_lookup(func_argtypes[j]); + if (!dtype_argtypes[j]) + goto bailout; + } + /* we expect built-in functions are in pg_catalog namespace */ + fextension = get_extension_name_by_object(ProcedureRelationId, func_oid); + if (!fextension && fnamespace != PG_CATALOG_NAMESPACE) + goto bailout; + + for (i=0; devfunc_catalog[i].func_name != NULL; i++) + { + const char *__extension = devfunc_catalog[i].func_extension; + const char *__name = devfunc_catalog[i].func_name; + char *tok, *saveptr; + + if (fextension != NULL + ? 
(__extension == NULL || strcmp(fextension, __extension) != 0) + : (__extension != NULL)) + continue; + if (strcmp(fname, __name) != 0) + continue; -static int -vlbuf_estimate__st_relate(codegen_context *context, - devfunc_info *dfunc, - Expr **args, int *vl_width) -{ - context->extra_bufsz += MAXALIGN(VARHDRSZ + 9); + resetStringInfo(&buf); + appendStringInfoString(&buf, devfunc_catalog[i].func_args); + for (tok = strtok_r(buf.data, "/", &saveptr), j=0; + tok != NULL && j < func_nargs; + tok = strtok_r(NULL, "/", &saveptr), j++) + { + devtype_info *dtype = dtype_argtypes[j]; - return VARHDRSZ + 9; -} + tok = __trim(tok); + sz = strlen(tok); + if (sz > 4 && + tok[0] == '_' && tok[1] == '_' && + tok[sz-1] == '_' && tok[sz-2] == '_') + { + /* __TYPE__ means variable length argument! */ + tok[sz-2] = '\0'; + if (strcmp(tok+2, dtype->type_name) != 0) + break; + /* must be the last argument set */ + tok = strtok_r(NULL, "/", &saveptr); + if (tok) + break; + /* check whether the following arguments are identical */ + while (j < func_nargs) + { + if (dtype->type_oid != func_argtypes[j]) + break; + j++; + } + } + else + { + if (strcmp(tok, dtype->type_name) != 0) + break; + } + } -static int -vlbuf_estimate__st_expand(codegen_context *context, - devfunc_info *dfunc, - Expr **args, int *vl_width) -{ - context->extra_bufsz += MAXALIGN(4 * sizeof(cl_float) + /* bounding-box */ - 2 * sizeof(cl_uint) + /* nitems + padding */ - 10 * sizeof(double)); /* polygon rawdata */ - return -1; /* not a normal varlena */ + /* Ok, found an entry */ + if (!tok && j == func_nargs) + { + oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + dfunc = palloc0(offsetof(devfunc_info, + func_argtypes[func_nargs])); + dfunc->func_code = devfunc_catalog[i].func_code; + if (fextension) + dfunc->func_extension = pstrdup(fextension); + dfunc->func_name = pstrdup(fname); + dfunc->func_oid = func_oid; + dfunc->func_rettype = dtype_rettype; + dfunc->func_flags = devfunc_catalog[i].func_flags; + dfunc->func_cost = devfunc_catalog[i].func_cost; + dfunc->func_nargs = func_nargs; + memcpy(dfunc->func_argtypes, dtype_argtypes, + sizeof(devtype_info *) * func_nargs); + MemoryContextSwitchTo(oldcxt); + break; + } + } +bailout: + pfree(buf.data); + return dfunc; } -/* - * Catalog of functions supported by device code - * - * naming convension of functions: - * pgfn_<func_name>(...) - * - * func_template is a set of characters based on the rules below: - * - * [<attributes>/]f:<extra> - * - * attributes: - * 'L' : this function is locale aware, thus, available only if simple - * collation configuration (none, and C-locale). - * 'C' : this function uses its special callback to estimate the result - * width of varlena-buffer. - * 'p' : this function needs cuda_primitive.h - * 's' : this function needs cuda_textlib.h - * 't' : this function needs cuda_timelib.h - * 'j' : this function needs cuda_jsonlib.h - * 'm' : this function needs cuda_misclib.h - * 'r' : this function needs cuda_rangetype.h - * 'g' : this function needs cuda_postgis.h - * - * class character: - * 'r' : right operator that takes an argument (deprecated) - * 'l' : left operator that takes an argument (deprecated) - * 'b' : both operator that takes two arguments (deprecated) - * 'f' : this function is implemented as device function.
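
pgstrom_devfunc_build() above matches a candidate function against catalog rows whose argument list is a '/'-separated string, where a token spelled __TYPE__ means that all remaining arguments must be of type TYPE. Here is a simplified standalone version of that matching rule; fixed buffer, no error handling, and the signatures in main() are illustrative rather than real xpu_opcodes.h entries.

    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>

    /* does '/'-separated signature sig match the argument type names?
     * a "__TYPE__" token matches one or more trailing args of that type */
    static bool sig_match(const char *sig, const char **args, int nargs)
    {
        char    buf[256], *tok, *save;
        int     j = 0;

        strncpy(buf, sig, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';
        for (tok = strtok_r(buf, "/", &save); tok; tok = strtok_r(NULL, "/", &save))
        {
            size_t  sz = strlen(tok);

            if (sz > 4 && !strncmp(tok, "__", 2) && !strcmp(tok + sz - 2, "__"))
            {
                tok[sz - 2] = '\0';         /* strip trailing "__" */
                for (; j < nargs; j++)      /* consume the variadic tail */
                    if (strcmp(tok + 2, args[j]) != 0)
                        return false;
                return j > 0;               /* at least one arg consumed */
            }
            if (j >= nargs || strcmp(tok, args[j++]) != 0)
                return false;
        }
        return j == nargs;
    }

    int main(void)
    {
        const char *a3[] = {"text", "text", "text"};
        const char *s3[] = {"text", "int4", "int4"};

        printf("%d\n", sig_match("__text__", a3, 3));        /* 1 */
        printf("%d\n", sig_match("text/int4/int4", s3, 3));  /* 1 */
        printf("%d\n", sig_match("text/int4", s3, 3));       /* 0 */
        return 0;
    }

Under this convention a single hypothetical row such as {"concat", "__text__", ...} covers concat(text, text), concat(text, text, text), and so on, instead of one catalog row per arity.
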
- * ==> extra is the function name being declared somewhere - */ -#define DEVFUNC_MAX_NARGS 4 - -typedef struct devfunc_catalog_t { - const char *func_extension; /* NULL, if built-in functions */ - const char *func_signature; - int func_devcost; /* relative cost to run on device */ - const char *func_template; /* a template string if simple function */ - devfunc_result_sz_type devfunc_result_sz; -} devfunc_catalog_t; - -static devfunc_catalog_t devfunc_common_catalog[] = { - /* Type cast functions */ - { NULL, "bool bool(int4)", 1, "f:to_bool" }, - - { PGSTROM, "int1 int1(int2)", 1, "f:to_int1" }, - { PGSTROM, "int1 int1(int4)", 1, "f:to_int1" }, - { PGSTROM, "int1 int1(int8)", 1, "f:to_int1" }, - { PGSTROM, "int1 int1(float2)", 1, "f:to_int1" }, - { PGSTROM, "int1 int1(float4)", 1, "f:to_int1" }, - { PGSTROM, "int1 int1(float8)", 1, "f:to_int1" }, - - { PGSTROM, "int2 int2(int1)", 1, "f:to_int2" }, - { NULL, "int2 int2(int4)", 1, "f:to_int2" }, - { NULL, "int2 int2(int8)", 1, "f:to_int2" }, - { PGSTROM, "int2 int2(float2)", 1, "f:to_int2" }, - { NULL, "int2 int2(float4)", 1, "f:to_int2" }, - { NULL, "int2 int2(float8)", 1, "f:to_int2" }, - - { NULL, "int4 int4(bool)", 1, "f:to_int4" }, - { PGSTROM, "int4 int4(int1)", 1, "f:to_int4" }, - { NULL, "int4 int4(int2)", 1, "f:to_int4" }, - { NULL, "int4 int4(int8)", 1, "f:to_int4" }, - { PGSTROM, "int4 int4(float2)", 1, "f:to_int4" }, - { NULL, "int4 int4(float4)", 1, "f:to_int4" }, - { NULL, "int4 int4(float8)", 1, "f:to_int4" }, - - { PGSTROM, "int8 int8(int1)", 1, "f:to_int8" }, - { NULL, "int8 int8(int2)", 1, "f:to_int8" }, - { NULL, "int8 int8(int4)", 1, "f:to_int8" }, - { PGSTROM, "int8 int8(float2)", 1, "f:to_int8" }, - { NULL, "int8 int8(float4)", 1, "f:to_int8" }, - { NULL, "int8 int8(float8)", 1, "f:to_int8" }, - - { PGSTROM, "float2 float2(int1)", 1, "f:to_float2" }, - { PGSTROM, "float2 float2(int2)", 1, "f:to_float2" }, - { PGSTROM, "float2 float2(int4)", 1, "f:to_float2" }, - { PGSTROM, "float2 float2(int8)", 1, "f:to_float2" }, - { PGSTROM, "float2 float2(float4)", 1, "f:to_float2" }, - { PGSTROM, "float2 float2(float8)", 1, "f:to_float2" }, - - { PGSTROM, "float4 float4(int1)", 1, "f:to_float4" }, - { NULL, "float4 float4(int2)", 1, "f:to_float4" }, - { NULL, "float4 float4(int4)", 1, "f:to_float4" }, - { NULL, "float4 float4(int8)", 1, "f:to_float4" }, - { PGSTROM, "float4 float4(float2)", 1, "f:to_float4" }, - { NULL, "float4 float4(float8)", 1, "f:to_float4" }, - - { PGSTROM, "float8 float8(int1)", 1, "f:to_float8" }, - { NULL, "float8 float8(int2)", 1, "f:to_float8" }, - { NULL, "float8 float8(int4)", 1, "f:to_float8" }, - { NULL, "float8 float8(int8)", 1, "f:to_float8" }, - { PGSTROM, "float8 float8(float2)", 1, "f:to_float8" }, - { NULL, "float8 float8(float4)", 1, "f:to_float8" }, - - /* '+' : add operators */ - { PGSTROM, "int1 int1pl(int1,int1)", 1, "p/f:int1pl" }, - { PGSTROM, "int2 int12pl(int1,int2)", 1, "p/f:int12pl" }, - { PGSTROM, "int4 int14pl(int1,int4)", 1, "p/f:int14pl" }, - { PGSTROM, "int8 int18pl(int1,int8)", 1, "p/f:int18pl" }, - { PGSTROM, "int2 int21pl(int2,int1)", 1, "p/f:int21pl" }, - { NULL, "int2 int2pl(int2,int2)", 1, "p/f:int2pl" }, - { NULL, "int4 int24pl(int2,int4)", 1, "p/f:int24pl" }, - { NULL, "int8 int28pl(int2,int8)", 1, "p/f:int28pl" }, - { PGSTROM, "int4 int41pl(int4,int1)", 1, "p/f:int41pl" }, - { NULL, "int4 int42pl(int4,int2)", 1, "p/f:int42pl" }, - { NULL, "int4 int4pl(int4,int4)", 1, "p/f:int4pl" }, - { NULL, "int8 int48pl(int4,int8)", 1, "p/f:int48pl" }, - { PGSTROM, "int8 
int81pl(int8,int1)", 1, "p/f:int81pl" }, - { NULL, "int8 int82pl(int8,int2)", 1, "p/f:int82pl" }, - { NULL, "int8 int84pl(int8,int4)", 1, "p/f:int84pl" }, - { NULL, "int8 int8pl(int8,int8)", 1, "p/f:int8pl" }, - { PGSTROM, "float4 float2_pl(float2,float2)", 1, "p/f:float2pl" }, - { PGSTROM, "float4 float24_pl(float2,float4)", 1, "p/f:float24pl" }, - { PGSTROM, "float8 float28_pl(float2,float8)", 1, "p/f:float28pl" }, - { PGSTROM, "float4 float42_pl(float4,float2)", 1, "p/f:float42pl" }, - { NULL, "float4 float4pl(float4,float4)", 1, "p/f:float4pl" }, - { NULL, "float8 float48pl(float4,float8)", 1, "p/f:float48pl" }, - { PGSTROM, "float8 float82_pl(float8,float2)", 1, "p/f:float82pl" }, - { NULL, "float8 float84pl(float8,float4)", 1, "p/f:float84pl" }, - { NULL, "float8 float8pl(float8,float8)", 1, "p/f:float8pl" }, - - /* '-' : subtract operators */ - { PGSTROM, "int1 int1mi(int1,int1)", 1, "p/f:int1mi" }, - { PGSTROM, "int2 int12mi(int1,int2)", 1, "p/f:int12mi" }, - { PGSTROM, "int4 int14mi(int1,int4)", 1, "p/f:int14mi" }, - { PGSTROM, "int8 int18mi(int1,int8)", 1, "p/f:int18mi" }, - { PGSTROM, "int2 int21mi(int2,int1)", 1, "p/f:int21mi" }, - { NULL, "int2 int2mi(int2,int2)", 1, "p/f:int2mi" }, - { NULL, "int4 int24mi(int2,int4)", 1, "p/f:int24mi" }, - { NULL, "int8 int28mi(int2,int8)", 1, "p/f:int28mi" }, - { PGSTROM, "int4 int41mi(int4,int1)", 1, "p/f:int41mi" }, - { NULL, "int4 int42mi(int4,int2)", 1, "p/f:int42mi" }, - { NULL, "int4 int4mi(int4,int4)", 1, "p/f:int4mi" }, - { NULL, "int8 int48mi(int4,int8)", 1, "p/f:int48mi" }, - { PGSTROM, "int8 int81mi(int8,int1)", 1, "p/f:int81mi" }, - { NULL, "int8 int82mi(int8,int2)", 1, "p/f:int82mi" }, - { NULL, "int8 int84mi(int8,int4)", 1, "p/f:int84mi" }, - { NULL, "int8 int8mi(int8,int8)", 1, "p/f:int8mi" }, - { PGSTROM, "float4 float2_mi(float2,float2)", 1, "p/f:float2mi" }, - { PGSTROM, "float4 float24_mi(float2,float4)", 1, "p/f:float24mi" }, - { PGSTROM, "float8 float28_mi(float2,float8)", 1, "p/f:float28mi" }, - { PGSTROM, "float4 float42_mi(float4,float2)", 1, "p/f:float42mi" }, - { NULL, "float4 float4mi(float4,float4)", 1, "p/f:float4mi" }, - { NULL, "float8 float48mi(float4,float8)", 1, "p/f:float48mi" }, - { PGSTROM, "float8 float82_mi(float8,float2)", 1, "p/f:float82mi" }, - { NULL, "float8 float84mi(float8,float4)", 1, "p/f:float84mi" }, - { NULL, "float8 float8mi(float8,float8)", 1, "p/f:float8mi" }, - - /* '*' : mutiply operators */ - { PGSTROM, "int1 int1mul(int1,int1)", 2, "p/f:int1mul" }, - { PGSTROM, "int2 int12mul(int1,int2)", 2, "p/f:int12mul" }, - { PGSTROM, "int4 int14mul(int1,int4)", 2, "p/f:int14mul" }, - { PGSTROM, "int8 int18mul(int1,int8)", 2, "p/f:int18mul" }, - { PGSTROM, "int2 int21mul(int2,int1)", 2, "p/f:int21mul" }, - { NULL, "int2 int2mul(int2,int2)", 2, "p/f:int2mul" }, - { NULL, "int4 int24mul(int2,int4)", 2, "p/f:int24mul" }, - { NULL, "int8 int28mul(int2,int8)", 2, "p/f:int28mul" }, - { PGSTROM, "int4 int41mul(int4,int1)", 2, "p/f:int41mul" }, - { NULL, "int4 int42mul(int4,int2)", 2, "p/f:int42mul" }, - { NULL, "int4 int4mul(int4,int4)", 2, "p/f:int4mul" }, - { NULL, "int8 int48mul(int4,int8)", 2, "p/f:int48mul" }, - { PGSTROM, "int8 int81mul(int8,int1)", 2, "p/f:int81mul" }, - { NULL, "int8 int82mul(int8,int2)", 2, "p/f:int82mul" }, - { NULL, "int8 int84mul(int8,int4)", 2, "p/f:int84mul" }, - { NULL, "int8 int8mul(int8,int8)", 2, "p/f:int8mul" }, - { PGSTROM, "float4 float2_mul(float2,float2)", 2, "p/f:float2mul" }, - { PGSTROM, "float4 float24_mul(float2,float4)", 2, "p/f:float24mul" }, - { PGSTROM, 
"float8 float28_mul(float2,float8)", 2, "p/f:float28mul" }, - { PGSTROM, "float4 float42_mul(float4,float2)", 2, "p/f:float42mul" }, - { NULL, "float4 float4mul(float4,float4)", 2, "p/f:float4mul" }, - { NULL, "float8 float48mul(float4,float8)", 2, "p/f:float48mul" }, - { PGSTROM, "float8 float82_mul(float8,float2)", 2, "p/f:float82mul" }, - { NULL, "float8 float84mul(float8,float4)", 2, "p/f:float84mul" }, - { NULL, "float8 float8mul(float8,float8)", 2, "p/f:float8mul" }, - - /* '/' : divide operators */ - { PGSTROM, "int1 int1div(int1,int1)", 2, "p/f:int1div" }, - { PGSTROM, "int2 int12div(int1,int2)", 2, "p/f:int12div" }, - { PGSTROM, "int4 int14div(int1,int4)", 2, "p/f:int14div" }, - { PGSTROM, "int8 int18div(int1,int8)", 2, "p/f:int18div" }, - { PGSTROM, "int2 int21div(int2,int1)", 2, "p/f:int21div" }, - { NULL, "int2 int2div(int2,int2)", 2, "p/f:int2div" }, - { NULL, "int4 int24div(int2,int4)", 2, "p/f:int24div" }, - { NULL, "int8 int28div(int2,int8)", 2, "p/f:int28div" }, - { PGSTROM, "int4 int41div(int4,int1)", 2, "p/f:int41div" }, - { NULL, "int4 int42div(int4,int2)", 2, "p/f:int42div" }, - { NULL, "int4 int4div(int4,int4)", 2, "p/f:int4div" }, - { NULL, "int8 int48div(int4,int8)", 2, "p/f:int48div" }, - { PGSTROM, "int2 int81div(int8,int1)", 2, "p/f:int81div" }, - { NULL, "int8 int82div(int8,int2)", 2, "p/f:int82div" }, - { NULL, "int8 int84div(int8,int4)", 2, "p/f:int84div" }, - { NULL, "int8 int8div(int8,int8)", 2, "p/f:int8div" }, - { PGSTROM, "float4 float2_div(float2,float2)", 2, "p/f:float2div" }, - { PGSTROM, "float4 float24_div(float2,float4)", 2, "p/f:float24div" }, - { PGSTROM, "float8 float28_div(float2,float8)", 2, "p/f:float28div" }, - { PGSTROM, "float4 float42_div(float4,float2)", 2, "p/f:float42div" }, - { NULL, "float4 float4div(float4,float4)", 2, "p/f:float4div" }, - { NULL, "float8 float48div(float4,float8)", 2, "p/f:float48div" }, - { PGSTROM, "float8 float82_div(float8,float2)", 2, "p/f:float82div" }, - { NULL, "float8 float84div(float8,float4)", 2, "p/f:float84div" }, - { NULL, "float8 float8div(float8,float8)", 2, "p/f:float8div" }, - - /* '%' : reminder operators */ - { PGSTROM, "int1 int1mod(int1,int1)", 2, "p/f:int1mod" }, - { NULL, "int2 int2mod(int2,int2)", 2, "p/f:int2mod" }, - { NULL, "int4 int4mod(int4,int4)", 2, "p/f:int4mod" }, - { NULL, "int8 int8mod(int8,int8)", 2, "p/f:int8mod" }, - - /* '+' : unary plus operators */ - { PGSTROM, "int1 int1up(int1)", 1, "p/f:int1up" }, - { NULL, "int2 int2up(int2)", 1, "p/f:int2up" }, - { NULL, "int4 int4up(int4)", 1, "p/f:int4up" }, - { NULL, "int8 int8up(int8)", 1, "p/f:int8up" }, - { PGSTROM, "float2 float2_up(float2)",1, "p/f:float2up" }, - { NULL, "float4 float4up(float4)", 1, "p/f:float4up" }, - { NULL, "float8 float8up(float8)", 1, "p/f:float8up" }, - - /* '-' : unary minus operators */ - { PGSTROM, "int1 int1um(int1)", 1, "p/f:int1um" }, - { NULL, "int2 int2um(int2)", 1, "p/f:int2um" }, - { NULL, "int4 int4um(int4)", 1, "p/f:int4um" }, - { NULL, "int8 int8um(int8)", 1, "p/f:int8um" }, - { PGSTROM, "float2 float2_um(float2)",1, "p/f:float2um" }, - { NULL, "float4 float4um(float4)", 1, "p/f:float4um" }, - { NULL, "float8 float8um(float8)", 1, "p/f:float8um" }, - - /* '@' : absolute value operators */ - { PGSTROM, "int1 int1abs(int1)", 1, "p/f:int1abs" }, - { NULL, "int2 int2abs(int2)", 1, "p/f:int2abs" }, - { NULL, "int4 int4abs(int4)", 1, "p/f:int4abs" }, - { NULL, "int8 int8abs(int8)", 1, "p/f:int8abs" }, - { PGSTROM, "float2 float2abs(float2)", 1, "p/f:float2abs" }, - { NULL, "float4 
float4abs(float4)", 1, "p/f:float4abs" }, - { NULL, "float8 float8abs(float8)", 1, "p/f:float8abs" }, - - /* '=' : equal operators */ - { NULL, "bool booleq(bool,bool)", 1, "f:booleq" }, - { PGSTROM, "bool int1eq(int1,int1)", 1, "f:int1eq" }, - { PGSTROM, "bool int12eq(int1,int2)", 1, "f:int12eq" }, - { PGSTROM, "bool int14eq(int1,int4)", 1, "f:int14eq" }, - { PGSTROM, "bool int18eq(int1,int8)", 1, "f:int18eq" }, - { PGSTROM, "bool int21eq(int2,int1)", 1, "f:int21eq" }, - { NULL, "bool int2eq(int2,int2)", 1, "f:int2eq" }, - { NULL, "bool int24eq(int2,int4)", 1, "f:int24eq" }, - { NULL, "bool int28eq(int2,int8)", 1, "f:int28eq" }, - { PGSTROM, "bool int41eq(int4,int1)", 1, "f:int41eq" }, - { NULL, "bool int42eq(int4,int2)", 1, "f:int42eq" }, - { NULL, "bool int4eq(int4,int4)", 1, "f:int4eq" }, - { NULL, "bool int48eq(int4,int8)", 1, "f:int48eq" }, - { PGSTROM, "bool int81eq(int8,int1)", 1, "f:int81eq" }, - { NULL, "bool int82eq(int8,int2)", 1, "f:int82eq" }, - { NULL, "bool int84eq(int8,int4)", 1, "f:int84eq" }, - { NULL, "bool int8eq(int8,int8)", 1, "f:int8eq" }, - { PGSTROM, "bool float2_eq(float2,float2)", 1, "f:float2eq" }, - { PGSTROM, "bool float24_eq(float2,float4)", 1, "f:float24eq" }, - { PGSTROM, "bool float28_eq(float2,float8)", 1, "f:float28eq" }, - { PGSTROM, "bool float42_eq(float4,float2)", 1, "f:float42eq" }, - { NULL, "bool float4eq(float4,float4)", 1, "f:float4eq" }, - { NULL, "bool float48eq(float4,float8)", 1, "f:float48eq" }, - { PGSTROM, "bool float82_eq(float8,float2)", 1, "f:float82eq" }, - { NULL, "bool float84eq(float8,float4)", 1, "f:float84eq" }, - { NULL, "bool float8eq(float8,float8)", 1, "f:float8eq" }, - - /* '<>' : not equal operators */ - { PGSTROM, "bool int1ne(int1,int1)", 1, "f:int1ne" }, - { PGSTROM, "bool int12ne(int1,int2)", 1, "f:int12ne" }, - { PGSTROM, "bool int14ne(int1,int4)", 1, "f:int14ne" }, - { PGSTROM, "bool int18ne(int1,int8)", 1, "f:int18ne" }, - { PGSTROM, "bool int21ne(int2,int1)", 1, "f:int21ne" }, - { NULL, "bool int2ne(int2,int2)", 1, "f:int2ne" }, - { NULL, "bool int24ne(int2,int4)", 1, "f:int24ne" }, - { NULL, "bool int28ne(int2,int8)", 1, "f:int28ne" }, - { PGSTROM, "bool int41ne(int4,int1)", 1, "f:int41ne" }, - { NULL, "bool int42ne(int4,int2)", 1, "f:int42ne" }, - { NULL, "bool int4ne(int4,int4)", 1, "f:int4ne" }, - { NULL, "bool int48ne(int4,int8)", 1, "f:int48ne" }, - { PGSTROM, "bool int81ne(int8,int1)", 1, "f:int81ne" }, - { NULL, "bool int82ne(int8,int2)", 1, "f:int82ne" }, - { NULL, "bool int84ne(int8,int4)", 1, "f:int84ne" }, - { NULL, "bool int8ne(int8,int8)", 1, "f:int8ne" }, - { PGSTROM, "bool float2_ne(float2,float2)", 1, "f:float2ne" }, - { PGSTROM, "bool float24_ne(float2,float4)", 1, "f:float24ne" }, - { PGSTROM, "bool float28_ne(float2,float8)", 1, "f:float28ne" }, - { PGSTROM, "bool float42_ne(float4,float2)", 1, "f:float42ne" }, - { NULL, "bool float4ne(float4,float4)", 1, "f:float4ne" }, - { NULL, "bool float48ne(float4,float8)", 1, "f:float48ne" }, - { PGSTROM, "bool float82_ne(float8,float2)", 1, "f:float82ne" }, - { NULL, "bool float84ne(float8,float4)", 1, "f:float84ne" }, - { NULL, "bool float8ne(float8,float8)", 1, "f:float8ne" }, - - /* '>' : greater than operators */ - { PGSTROM, "bool int1gt(int1,int1)", 1, "f:int1gt" }, - { PGSTROM, "bool int12gt(int1,int2)", 1, "f:int12gt" }, - { PGSTROM, "bool int14gt(int1,int4)", 1, "f:int14gt" }, - { PGSTROM, "bool int18gt(int1,int8)", 1, "f:int18gt" }, - { PGSTROM, "bool int21gt(int2,int1)", 1, "f:int21gt" }, - { NULL, "bool int2gt(int2,int2)", 1, "f:int2gt" }, - 
{ NULL, "bool int24gt(int2,int4)", 1, "f:int24gt" }, - { NULL, "bool int28gt(int2,int8)", 1, "f:int28gt" }, - { PGSTROM, "bool int41gt(int4,int1)", 1, "f:int41gt" }, - { NULL, "bool int42gt(int4,int2)", 1, "f:int42gt" }, - { NULL, "bool int4gt(int4,int4)", 1, "f:int4gt" }, - { NULL, "bool int48gt(int4,int8)", 1, "f:int48gt" }, - { PGSTROM, "bool int81gt(int8,int1)", 1, "f:int81gt" }, - { NULL, "bool int82gt(int8,int2)", 1, "f:int82gt" }, - { NULL, "bool int84gt(int8,int4)", 1, "f:int84gt" }, - { NULL, "bool int8gt(int8,int8)", 1, "f:int8gt" }, - { PGSTROM, "bool float2_gt(float2,float2)", 1, "f:float2gt" }, - { PGSTROM, "bool float24_gt(float2,float4)", 1, "f:float24gt" }, - { PGSTROM, "bool float28_gt(float2,float8)", 1, "f:float28gt" }, - { PGSTROM, "bool float42_gt(float4,float2)", 1, "f:float42gt" }, - { NULL, "bool float4gt(float4,float4)", 1, "f:float4gt" }, - { NULL, "bool float48gt(float4,float8)", 1, "f:float48gt" }, - { PGSTROM, "bool float82_gt(float8,float2)", 1, "f:float82gt" }, - { NULL, "bool float84gt(float8,float4)", 1, "f:float84gt" }, - { NULL, "bool float8gt(float8,float8)", 1, "f:float8gt" }, - - /* '<' : less than operators */ - { PGSTROM, "bool int1lt(int1,int1)", 1, "f:int1lt" }, - { PGSTROM, "bool int12lt(int1,int2)", 1, "f:int12lt" }, - { PGSTROM, "bool int14lt(int1,int4)", 1, "f:int14lt" }, - { PGSTROM, "bool int18lt(int1,int8)", 1, "f:int18lt" }, - { PGSTROM, "bool int21lt(int2,int1)", 1, "f:int21lt" }, - { NULL, "bool int2lt(int2,int2)", 1, "f:int2lt" }, - { NULL, "bool int24lt(int2,int4)", 1, "f:int24lt" }, - { NULL, "bool int28lt(int2,int8)", 1, "f:int28lt" }, - { PGSTROM, "bool int41lt(int4,int1)", 1, "f:int41lt" }, - { NULL, "bool int42lt(int4,int2)", 1, "f:int42lt" }, - { NULL, "bool int4lt(int4,int4)", 1, "f:int4lt" }, - { NULL, "bool int48lt(int4,int8)", 1, "f:int48lt" }, - { PGSTROM, "bool int81lt(int8,int1)", 1, "f:int81lt" }, - { NULL, "bool int82lt(int8,int2)", 1, "f:int82lt" }, - { NULL, "bool int84lt(int8,int4)", 1, "f:int84lt" }, - { NULL, "bool int8lt(int8,int8)", 1, "f:int8lt" }, - { PGSTROM, "bool float2_lt(float2,float2)", 1, "f:float2lt" }, - { PGSTROM, "bool float24_lt(float2,float4)", 1, "f:float24lt" }, - { PGSTROM, "bool float28_lt(float2,float8)", 1, "f:float28lt" }, - { PGSTROM, "bool float42_lt(float4,float2)", 1, "f:float42lt" }, - { NULL, "bool float4lt(float4,float4)", 1, "f:float4lt" }, - { NULL, "bool float48lt(float4,float8)", 1, "f:float48lt" }, - { PGSTROM, "bool float82_lt(float8,float2)", 1, "f:float82lt" }, - { NULL, "bool float84lt(float8,float4)", 1, "f:float84lt" }, - { NULL, "bool float8lt(float8,float8)", 1, "f:float8lt" }, - - /* '>=' : relational greater-than or equal-to */ - { PGSTROM, "bool int1ge(int1,int1)", 1, "f:int1ge" }, - { PGSTROM, "bool int12ge(int1,int2)", 1, "f:int12ge" }, - { PGSTROM, "bool int14ge(int1,int4)", 1, "f:int14ge" }, - { PGSTROM, "bool int18ge(int1,int8)", 1, "f:int18ge" }, - { PGSTROM, "bool int21ge(int2,int1)", 1, "f:int21ge" }, - { NULL, "bool int2ge(int2,int2)", 1, "f:int2ge" }, - { NULL, "bool int24ge(int2,int4)", 1, "f:int24ge" }, - { NULL, "bool int28ge(int2,int8)", 1, "f:int28ge" }, - { PGSTROM, "bool int41ge(int4,int1)", 1, "f:int41ge" }, - { NULL, "bool int42ge(int4,int2)", 1, "f:int42ge" }, - { NULL, "bool int4ge(int4,int4)", 1, "f:int4ge" }, - { NULL, "bool int48ge(int4,int8)", 1, "f:int48ge" }, - { PGSTROM, "bool int81ge(int8,int1)", 1, "f:int81ge" }, - { NULL, "bool int82ge(int8,int2)", 1, "f:int82ge" }, - { NULL, "bool int84ge(int8,int4)", 1, "f:int84ge" }, - { NULL, "bool 
int8ge(int8,int8)", 1, "f:int8ge" }, - { PGSTROM, "bool float2_ge(float2,float2)", 1, "f:float2ge" }, - { PGSTROM, "bool float24_ge(float2,float4)", 1, "f:float24ge" }, - { PGSTROM, "bool float28_ge(float2,float8)", 1, "f:float28ge" }, - { PGSTROM, "bool float42_ge(float4,float2)", 1, "f:float42ge" }, - { NULL, "bool float4ge(float4,float4)", 1, "f:float4ge" }, - { NULL, "bool float48ge(float4,float8)", 1, "f:float48ge" }, - { PGSTROM, "bool float82_ge(float8,float2)", 1, "f:float82ge" }, - { NULL, "bool float84ge(float8,float4)", 1, "f:float84ge" }, - { NULL, "bool float8ge(float8,float8)", 1, "f:float8ge" }, - - /* '<=' : relational greater-than or equal-to */ - { PGSTROM, "bool int1le(int1,int1)", 1, "f:int1le" }, - { PGSTROM, "bool int12le(int1,int2)", 1, "f:int12le" }, - { PGSTROM, "bool int14le(int1,int4)", 1, "f:int14le" }, - { PGSTROM, "bool int18le(int1,int8)", 1, "f:int18le" }, - { PGSTROM, "bool int21le(int2,int1)", 1, "f:int21le" }, - { NULL, "bool int2le(int2,int2)", 1, "f:int2le" }, - { NULL, "bool int24le(int2,int4)", 1, "f:int24le" }, - { NULL, "bool int28le(int2,int8)", 1, "f:int28le" }, - { PGSTROM, "bool int41le(int4,int1)", 1, "f:int41le" }, - { NULL, "bool int42le(int4,int2)", 1, "f:int42le" }, - { NULL, "bool int4le(int4,int4)", 1, "f:int4le" }, - { NULL, "bool int48le(int4,int8)", 1, "f:int48le" }, - { PGSTROM, "bool int81le(int8,int1)", 1, "f:int81le" }, - { NULL, "bool int82le(int8,int2)", 1, "f:int82le" }, - { NULL, "bool int84le(int8,int4)", 1, "f:int84le" }, - { NULL, "bool int8le(int8,int8)", 1, "f:int8le" }, - { PGSTROM, "bool float2_le(float2,float2)", 1, "f:float2le" }, - { PGSTROM, "bool float24_le(float2,float4)", 1, "f:float24le" }, - { PGSTROM, "bool float28_le(float2,float8)", 1, "f:float28le" }, - { PGSTROM, "bool float42_le(float4,float2)", 2, "f:float42le" }, - { NULL, "bool float4le(float4,float4)", 1, "f:float4le" }, - { NULL, "bool float48le(float4,float8)", 1, "f:float48le" }, - { PGSTROM, "bool float82_le(float8,float2)", 1, "f:float82le" }, - { NULL, "bool float84le(float8,float4)", 1, "f:float84le" }, - { NULL, "bool float8le(float8,float8)", 1, "f:float8le" }, - - /* '&' : bitwise and */ - { PGSTROM, "int1 int1and(int1,int1)", 1, "p/f:int1and" }, - { NULL, "int2 int2and(int2,int2)", 1, "p/f:int2and" }, - { NULL, "int4 int4and(int4,int4)", 1, "p/f:int4and" }, - { NULL, "int8 int8and(int8,int8)", 1, "p/f:int8and" }, - - /* '|' : bitwise or */ - { PGSTROM, "int1 int1or(int1,int1)", 1, "p/f:int1or" }, - { NULL, "int2 int2or(int2,int2)", 1, "p/f:int2or" }, - { NULL, "int4 int4or(int4,int4)", 1, "p/f:int4or" }, - { NULL, "int8 int8or(int8,int8)", 1, "p/f:int8or" }, - - /* '#' : bitwise xor */ - { PGSTROM, "int1 int1xor(int1,int1)", 1, "p/f:int1xor" }, - { NULL, "int2 int2xor(int2,int2)", 1, "p/f:int2xor" }, - { NULL, "int4 int4xor(int4,int4)", 1, "p/f:int4xor" }, - { NULL, "int8 int8xor(int8,int8)", 1, "p/f:int8xor" }, - - /* '~' : bitwise not operators */ - { PGSTROM, "int1 int1not(int1)", 1, "p/f:int1not" }, - { NULL, "int2 int2not(int2)", 1, "p/f:int2not" }, - { NULL, "int4 int4not(int4)", 1, "p/f:int4not" }, - { NULL, "int8 int8not(int8)", 1, "p/f:int8not" }, - - /* '>>' : right shift */ - { PGSTROM, "int1 int1shr(int1,int4)", 1, "p/f:int1shr" }, - { NULL, "int2 int2shr(int2,int4)", 1, "p/f:int2shr" }, - { NULL, "int4 int4shr(int4,int4)", 1, "p/f:int4shr" }, - { NULL, "int8 int8shr(int8,int4)", 1, "p/f:int8shr" }, - - /* '<<' : left shift */ - { PGSTROM, "int1 int1shl(int1,int4)", 1, "p/f:int1shl" }, - { NULL, "int2 int2shl(int2,int4)", 1, 
"p/f:int2shl" }, - { NULL, "int4 int4shl(int4,int4)", 1, "p/f:int4shl" }, - { NULL, "int8 int8shl(int8,int4)", 1, "p/f:int8shl" }, - - /* comparison functions */ - { NULL, "int4 btboolcmp(bool,bool)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 btint1cmp(int1,int1)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 btint12cmp(int1,int2)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 btint14cmp(int1,int4)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 btint18cmp(int1,int8)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 btint21cmp(int2,int1)", 1, "p/f:type_compare" }, - { NULL, "int4 btint2cmp(int2,int2)", 1, "p/f:type_compare" }, - { NULL, "int4 btint24cmp(int2,int4)", 1, "p/f:type_compare" }, - { NULL, "int4 btint28cmp(int2,int8)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 btint41cmp(int4,int1)", 1, "p/f:type_compare" }, - { NULL, "int4 btint42cmp(int4,int2)", 1, "p/f:type_compare" }, - { NULL, "int4 btint4cmp(int4,int4)", 1, "p/f:type_compare" }, - { NULL, "int4 btint48cmp(int4,int8)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 btint81cmp(int8,int1)", 1, "p/f:type_compare" }, - { NULL, "int4 btint82cmp(int8,int2)", 1, "p/f:type_compare" }, - { NULL, "int4 btint84cmp(int8,int4)", 1, "p/f:type_compare" }, - { NULL, "int4 btint8cmp(int8,int8)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 float2_cmp(float2,float2)", 1, "f:type_compare" }, - { PGSTROM, "int4 float24_cmp(float2,float4)", 1, "f:type_compare" }, - { PGSTROM, "int4 float28_cmp(float2,float8)", 1, "f:type_compare" }, - { PGSTROM, "int4 float42_cmp(float4,float2)", 1, "f:type_compare" }, - { NULL, "int4 btfloat4cmp(float4,float4)", 1, "p/f:type_compare" }, - { NULL, "int4 btfloat48cmp(float4,float8)", 1, "p/f:type_compare" }, - { NULL, "int4 btfloat84cmp(float8,float4)", 1, "p/f:type_compare" }, - { NULL, "int4 btfloat8cmp(float8,float8)", 1, "p/f:type_compare" }, - { PGSTROM, "int4 float82_cmp(float8,float2)", 1, "f:type_compare" }, - - /* currency cast */ - { NULL, "money money(numeric)", 1, "m/f:numeric_cash" }, - { NULL, "money money(int4)", 1, "m/f:int4_cash" }, - { NULL, "money money(int8)", 1, "m/f:int8_cash" }, - /* currency operators */ - { NULL, "money cash_pl(money,money)", 1, "m/f:cash_pl" }, - { NULL, "money cash_mi(money,money)", 1, "m/f:cash_mi" }, - { NULL, "float8 cash_div_cash(money,money)", 2, "m/f:cash_div_cash" }, - { PGSTROM, "money cash_mul_int1(money,int1)", 2, "m/f:cash_mul_int1" }, - { NULL, "money cash_mul_int2(money,int2)", 2, "m/f:cash_mul_int2" }, - { NULL, "money cash_mul_int4(money,int4)", 2, "m/f:cash_mul_int4" }, - { PGSTROM, "money cash_mul_flt2(money,float2)", 2, "m/f:cash_mul_flt2" }, - { NULL, "money cash_mul_flt4(money,float4)", 2, "m/f:cash_mul_flt4" }, - { NULL, "money cash_mul_flt8(money,float8)", 2, "m/f:cash_mul_flt8" }, - { PGSTROM, "money cash_div_int1(money,int1)", 2, "m/f:cash_div_int1" }, - { NULL, "money cash_div_int2(money,int2)", 2, "m/f:cash_div_int2" }, - { NULL, "money cash_div_int4(money,int4)", 2, "m/f:cash_div_int4" }, - { PGSTROM, "money cash_div_flt2(money,float2)", 2, "m/f:cash_div_flt2" }, - { NULL, "money cash_div_flt4(money,float4)", 2, "m/f:cash_div_flt4" }, - { NULL, "money cash_div_flt8(money,float8)", 2, "m/f:cash_div_flt8" }, - { PGSTROM, "money int1_mul_cash(int1,money)", 2, "m/f:int1_mul_cash" }, - { NULL, "money int2_mul_cash(int2,money)", 2, "m/f:int2_mul_cash" }, - { NULL, "money int4_mul_cash(int4,money)", 2, "m/f:int4_mul_cash" }, - { PGSTROM, "money flt2_mul_cash(float2,money)", 2, "m/f:flt2_mul_cash" }, - { NULL, "money flt4_mul_cash(float4,money)", 2, 
"m/f:flt4_mul_cash" }, - { NULL, "money flt8_mul_cash(float8,money)", 2, "m/f:flt8_mul_cash" }, - /* currency comparison */ - { NULL, "int4 cash_cmp(money,money)", 1, "m/f:type_compare" }, - { NULL, "bool cash_eq(money,money)", 1, "m/f:cash_eq" }, - { NULL, "bool cash_ne(money,money)", 1, "m/f:cash_ne" }, - { NULL, "bool cash_lt(money,money)", 1, "m/f:cash_lt" }, - { NULL, "bool cash_le(money,money)", 1, "m/f:cash_le" }, - { NULL, "bool cash_gt(money,money)", 1, "m/f:cash_gt" }, - { NULL, "bool cash_ge(money,money)", 1, "m/f:cash_ge" }, - /* uuid comparison */ - { NULL, "int4 uuid_cmp(uuid,uuid)", 5, "m/f:type_compare" }, - { NULL, "bool uuid_eq(uuid,uuid)", 5, "m/f:uuid_eq" }, - { NULL, "bool uuid_ne(uuid,uuid)", 5, "m/f:uuid_ne" }, - { NULL, "bool uuid_lt(uuid,uuid)", 5, "m/f:uuid_lt" }, - { NULL, "bool uuid_le(uuid,uuid)", 5, "m/f:uuid_le" }, - { NULL, "bool uuid_gt(uuid,uuid)", 5, "m/f:uuid_gt" }, - { NULL, "bool uuid_ge(uuid,uuid)", 5, "m/f:uuid_ge" }, - /* macaddr comparison */ - { NULL, "int4 macaddr_cmp(macaddr,macaddr)", 5, "m/f:type_compare" }, - { NULL, "bool macaddr_eq(macaddr,macaddr)", 5, "m/f:macaddr_eq" }, - { NULL, "bool macaddr_ne(macaddr,macaddr)", 5, "m/f:macaddr_ne" }, - { NULL, "bool macaddr_lt(macaddr,macaddr)", 5, "m/f:macaddr_lt" }, - { NULL, "bool macaddr_le(macaddr,macaddr)", 5, "m/f:macaddr_le" }, - { NULL, "bool macaddr_gt(macaddr,macaddr)", 5, "m/f:macaddr_gt" }, - { NULL, "bool macaddr_ge(macaddr,macaddr)", 5, "m/f:macaddr_ge" }, - /* inet comparison */ - { NULL, "int4 network_cmp(inet,inet)", 8, "m/f:type_compare" }, - { NULL, "bool network_eq(inet,inet)", 8, "m/f:network_eq" }, - { NULL, "bool network_ne(inet,inet)", 8, "m/f:network_ne" }, - { NULL, "bool network_lt(inet,inet)", 8, "m/f:network_lt" }, - { NULL, "bool network_le(inet,inet)", 8, "m/f:network_le" }, - { NULL, "bool network_gt(inet,inet)", 8, "m/f:network_gt" }, - { NULL, "bool network_ge(inet,inet)", 8, "m/f:network_ge" }, - { NULL, "inet network_larger(inet,inet)", 8, "m/f:network_larger" }, - { NULL, "inet network_smaller(inet,inet)",8, "m/f:network_smaller" }, - { NULL, "bool network_sub(inet,inet)", 8, "m/f:network_sub" }, - { NULL, "bool network_subeq(inet,inet)", 8, "m/f:network_subeq" }, - { NULL, "bool network_sup(inet,inet)", 8, "m/f:network_sup" }, - { NULL, "bool network_supeq(inet,inet)", 8, "m/f:network_supeq" }, - { NULL, "bool network_overlap(inet,inet)",8, "m/f:network_overlap" }, - - /* - * Mathmatical functions - */ - { PGSTROM, "int1 abs(int1)", 1, "p/f:int1abs" }, - { NULL, "int2 abs(int2)", 1, "p/f:int2abs" }, - { NULL, "int4 abs(int4)", 1, "p/f:int4abs" }, - { NULL, "int8 abs(int8)", 1, "p/f:int8abs" }, - { PGSTROM, "float2 abs(float2)", 1, "p/f:float2abs" }, - { NULL, "float4 abs(float4)", 1, "p/f:float4abs" }, - { NULL, "float8 abs(float8)", 1, "p/f:float8abs" }, - { NULL, "float8 cbrt(float8)", 1, "m/f:cbrt" }, - { NULL, "float8 dcbrt(float8)", 1, "m/f:cbrt" }, - { NULL, "float8 ceil(float8)", 1, "m/f:ceil" }, - { NULL, "float8 ceiling(float8)", 1, "m/f:ceil" }, - { NULL, "float8 exp(float8)", 5, "m/f:exp" }, - { NULL, "float8 dexp(float8)", 5, "m/f:exp" }, - { NULL, "float8 floor(float8)", 1, "m/f:floor" }, - { NULL, "float8 ln(float8)", 5, "m/f:ln" }, - { NULL, "float8 dlog1(float8)", 5, "m/f:ln" }, - { NULL, "float8 log(float8)", 5, "m/f:log10" }, - { NULL, "float8 dlog10(float8)", 5, "m/f:log10" }, - { NULL, "float8 pi()", 0, "m/f:dpi" }, - { NULL, "float8 power(float8,float8)", 5, "m/f:dpow" }, - { NULL, "float8 pow(float8,float8)", 5, "m/f:dpow" }, - { NULL, 
"float8 dpow(float8,float8)", 5, "m/f:dpow" }, - { NULL, "float8 round(float8)", 5, "m/f:round" }, - { NULL, "float8 dround(float8)", 5, "m/f:round" }, - { NULL, "float8 sign(float8)", 1, "m/f:sign" }, - { NULL, "float8 sqrt(float8)", 5, "m/f:dsqrt" }, - { NULL, "float8 dsqrt(float8)", 5, "m/f:dsqrt" }, - { NULL, "float8 trunc(float8)", 1, "m/f:trunc" }, - { NULL, "float8 dtrunc(float8)", 1, "m/f:trunc" }, - - /* - * Trigonometric function - */ - { NULL, "float8 degrees(float8)", 5, "m/f:degrees" }, - { NULL, "float8 radians(float8)", 5, "m/f:radians" }, - { NULL, "float8 acos(float8)", 5, "m/f:acos" }, - { NULL, "float8 asin(float8)", 5, "m/f:asin" }, - { NULL, "float8 atan(float8)", 5, "m/f:atan" }, - { NULL, "float8 atan2(float8,float8)", 5, "m/f:atan2" }, - { NULL, "float8 cos(float8)", 5, "m/f:cos" }, - { NULL, "float8 cot(float8)", 5, "m/f:cot" }, - { NULL, "float8 sin(float8)", 5, "m/f:sin" }, - { NULL, "float8 tan(float8)", 5, "m/f:tan" }, - - /* - * Numeric functions - * ------------------------- */ - /* Numeric type cast functions */ - { PGSTROM, "int1 int1(numeric)", 8, "f:numeric_int1" }, - { NULL, "int2 int2(numeric)", 8, "f:numeric_int2" }, - { NULL, "int4 int4(numeric)", 8, "f:numeric_int4" }, - { NULL, "int8 int8(numeric)", 8, "f:numeric_int8" }, - { PGSTROM, "float2 float2(numeric)", 8, "f:numeric_float2" }, - { NULL, "float4 float4(numeric)", 8, "f:numeric_float4" }, - { NULL, "float8 float8(numeric)", 8, "f:numeric_float8" }, - { PGSTROM, "numeric numeric(int1)", 5, "f:int1_numeric" }, - { NULL, "numeric numeric(int2)", 5, "f:int2_numeric" }, - { NULL, "numeric numeric(int4)", 5, "f:int4_numeric" }, - { NULL, "numeric numeric(int8)", 5, "f:int8_numeric" }, - { PGSTROM, "numeric numeric(float2)", 5, "f:float2_numeric" }, - { NULL, "numeric numeric(float4)", 5, "f:float4_numeric" }, - { NULL, "numeric numeric(float8)", 5, "f:float8_numeric" }, - /* Numeric operators */ - { NULL, "numeric numeric_add(numeric,numeric)", 10, "f:numeric_add" }, - { NULL, "numeric numeric_sub(numeric,numeric)", 10, "f:numeric_sub" }, - { NULL, "numeric numeric_mul(numeric,numeric)", 10, "f:numeric_mul" }, - { NULL, "numeric numeric_uplus(numeric)", 10, "f:numeric_uplus" }, - { NULL, "numeric numeric_uminus(numeric)", 10, "f:numeric_uminus" }, - { NULL, "numeric numeric_abs(numeric)", 10, "f:numeric_abs" }, - { NULL, "numeric abs(numeric)", 10, "f:numeric_abs" }, - /* Numeric comparison */ - { NULL, "bool numeric_eq(numeric,numeric)", 8, "f:numeric_eq" }, - { NULL, "bool numeric_ne(numeric,numeric)", 8, "f:numeric_ne" }, - { NULL, "bool numeric_lt(numeric,numeric)", 8, "f:numeric_lt" }, - { NULL, "bool numeric_le(numeric,numeric)", 8, "f:numeric_le" }, - { NULL, "bool numeric_gt(numeric,numeric)", 8, "f:numeric_gt" }, - { NULL, "bool numeric_ge(numeric,numeric)", 8, "f:numeric_ge" }, - { NULL, "int4 numeric_cmp(numeric,numeric)", 8, "f:type_compare" }, - - /* - * Date and time functions - * ------------------------------- */ - /* Type cast functions */ - { NULL, "date date(timestamp)", 1, "t/f:timestamp_date" }, - { NULL, "date date(timestamptz)", 1, "t/f:timestamptz_date" }, - { NULL, "time time(timetz)", 1, "t/f:timetz_time" }, - { NULL, "time time(timestamp)", 1, "t/f:timestamp_time" }, - { NULL, "time time(timestamptz)", 1, "t/f:timestamptz_time" }, - { NULL, "timetz timetz(time)", 1, "t/f:time_timetz" }, - { NULL, "timetz timetz(timestamptz)", 1, "t/f:timestamptz_timetz" }, -#ifdef NOT_USED - { NULL, "timetz timetz(timetz,int4)", 1, "t/f:timetz_scale" }, -#endif - { NULL, "timestamp 
timestamp(date)", - 1, "t/f:date_timestamp" }, - { NULL, "timestamp timestamp(timestamptz)", - 1, "t/f:timestamptz_timestamp" }, - { NULL, "timestamptz timestamptz(date)", - 1, "t/f:date_timestamptz" }, - { NULL, "timestamptz timestamptz(timestamp)", - 1, "t/f:timestamp_timestamptz" }, - /* timedata operators */ - { NULL, "date date_pli(date,int4)", 1, "t/f:date_pli" }, - { NULL, "date date_mii(date,int4)", 1, "t/f:date_mii" }, - { NULL, "int4 date_mi(date,date)", 1, "t/f:date_mi" }, - { NULL, "timestamp datetime_pl(date,time)", 2, "t/f:datetime_pl" }, - { NULL, "date integer_pl_date(int4,date)", 2, "t/f:integer_pl_date" }, - { NULL, "timestamp timedate_pl(time,date)", 2, "t/f:timedate_pl" }, - /* time - time => interval */ - { NULL, "interval time_mi_time(time,time)", - 2, "t/f:time_mi_time" }, - /* timestamp - timestamp => interval */ - { NULL, "interval timestamp_mi(timestamp,timestamp)", - 4, "t/f:timestamp_mi" }, - /* timetz +/- interval => timetz */ - { NULL, "timetz timetz_pl_interval(timetz,interval)", - 4, "t/f:timetz_pl_interval" }, - { NULL, "timetz timetz_mi_interval(timetz,interval)", - 4, "t/f:timetz_mi_interval" }, - /* timestamptz +/- interval => timestamptz */ - { NULL, "timestamptz timestamptz_pl_interval(timestamptz,interval)", - 4, "t/f:timestamptz_pl_interval" }, - { NULL, "timestamptz timestamptz_mi_interval(timestamptz,interval)", - 4, "t/f:timestamptz_mi_interval" }, - /* interval operators */ - { NULL, "interval interval_um(interval)", 4, "t/f:interval_um" }, - { NULL, "interval interval_pl(interval,interval)", 4, "t/f:interval_pl" }, - { NULL, "interval interval_mi(interval,interval)", 4, "t/f:interval_mi" }, - /* date + timetz => timestamptz */ - { NULL, "timestamptz datetimetz_pl(date,timetz)", - 4, "t/f:datetimetz_timestamptz" }, - { NULL, "timestamptz timestamptz(date,timetz)", - 4, "t/f:datetimetz_timestamptz" }, - /* comparison between date */ - { NULL, "bool date_eq(date,date)", 2, "t/f:date_eq" }, - { NULL, "bool date_ne(date,date)", 2, "t/f:date_ne" }, - { NULL, "bool date_lt(date,date)", 2, "t/f:date_lt" }, - { NULL, "bool date_le(date,date)", 2, "t/f:date_le" }, - { NULL, "bool date_gt(date,date)", 2, "t/f:date_gt" }, - { NULL, "bool date_ge(date,date)", 2, "t/f:date_ge" }, - { NULL, "int4 date_cmp(date,date)", 2, "t/f:type_compare" }, - /* comparison of date and timestamp */ - { NULL, "bool date_eq_timestamp(date,timestamp)", - 2, "t/f:date_eq_timestamp" }, - { NULL, "bool date_ne_timestamp(date,timestamp)", - 2, "t/f:date_ne_timestamp" }, - { NULL, "bool date_lt_timestamp(date,timestamp)", - 2, "t/f:date_lt_timestamp" }, - { NULL, "bool date_le_timestamp(date,timestamp)", - 2, "t/f:date_le_timestamp" }, - { NULL, "bool date_gt_timestamp(date,timestamp)", - 2, "t/f:date_gt_timestamp" }, - { NULL, "bool date_ge_timestamp(date,timestamp)", - 2, "t/f:date_ge_timestamp" }, - { NULL, "int4 date_cmp_timestamp(date,timestamp)", - 2, "t/f:date_cmp_timestamp" }, - /* comparison between time */ - { NULL, "bool time_eq(time,time)", 2, "t/f:time_eq" }, - { NULL, "bool time_ne(time,time)", 2, "t/f:time_ne" }, - { NULL, "bool time_lt(time,time)", 2, "t/f:time_lt" }, - { NULL, "bool time_le(time,time)", 2, "t/f:time_le" }, - { NULL, "bool time_gt(time,time)", 2, "t/f:time_gt" }, - { NULL, "bool time_ge(time,time)", 2, "t/f:time_ge" }, - { NULL, "int4 time_cmp(time,time)",2, "t/f:type_compare" }, - /* comparison between timetz */ - { NULL, "bool timetz_eq(timetz,timetz)", 1, "t/f:timetz_eq" }, - { NULL, "bool timetz_ne(timetz,timetz)", 1, "t/f:timetz_ne" }, - { NULL, 
"bool timetz_lt(timetz,timetz)", 1, "t/f:timetz_lt" }, - { NULL, "bool timetz_le(timetz,timetz)", 1, "t/f:timetz_le" }, - { NULL, "bool timetz_ge(timetz,timetz)", 1, "t/f:timetz_ge" }, - { NULL, "bool timetz_gt(timetz,timetz)", 1, "t/f:timetz_gt" }, - { NULL, "int4 timetz_cmp(timetz,timetz)",1, "t/f:timetz_cmp" }, - /* comparison between timestamp */ - { NULL, "bool timestamp_eq(timestamp,timestamp)", 1, "t/f:timestamp_eq" }, - { NULL, "bool timestamp_ne(timestamp,timestamp)", 1, "t/f:timestamp_ne" }, - { NULL, "bool timestamp_lt(timestamp,timestamp)", 1, "t/f:timestamp_lt" }, - { NULL, "bool timestamp_le(timestamp,timestamp)", 1, "t/f:timestamp_le" }, - { NULL, "bool timestamp_gt(timestamp,timestamp)", 1, "t/f:timestamp_gt" }, - { NULL, "bool timestamp_ge(timestamp,timestamp)", 1, "t/f:timestamp_ge" }, - { NULL, "int4 timestamp_cmp(timestamp,timestamp)",1, "t/f:timestamp_cmp"}, - /* comparison of timestamp and date */ - { NULL, "bool timestamp_eq_date(timestamp,date)", - 3, "t/f:timestamp_eq_date" }, - { NULL, "bool timestamp_ne_date(timestamp,date)", - 3, "t/f:timestamp_ne_date" }, - { NULL, "bool timestamp_lt_date(timestamp,date)", - 3, "t/f:timestamp_lt_date" }, - { NULL, "bool timestamp_le_date(timestamp,date)", - 3, "t/f:timestamp_le_date" }, - { NULL, "bool timestamp_gt_date(timestamp,date)", - 3, "t/f:timestamp_gt_date" }, - { NULL, "bool timestamp_ge_date(timestamp,date)", - 3, "t/f:timestamp_ge_date" }, - { NULL, "int4 timestamp_cmp_date(timestamp,date)", - 3, "t/f:timestamp_cmp_date"}, - /* comparison between timestamptz */ - { NULL, "bool timestamptz_eq(timestamptz,timestamptz)", - 1, "t/f:timestamptz_eq" }, - { NULL, "bool timestamptz_ne(timestamptz,timestamptz)", - 1, "t/f:timestamptz_ne" }, - { NULL, "bool timestamptz_lt(timestamptz,timestamptz)", - 1, "t/f:timestamptz_lt" }, - { NULL, "bool timestamptz_le(timestamptz,timestamptz)", - 1, "t/f:timestamptz_le" }, - { NULL, "bool timestamptz_gt(timestamptz,timestamptz)", - 1, "t/f:timestamptz_gt" }, - { NULL, "bool timestamptz_ge(timestamptz,timestamptz)", - 1, "t/f:timestamptz_ge" }, - { NULL, "int4 timestamptz_cmp(timestamptz,timestamptz)", - 1, "t/f:type_compare" }, - /* comparison between date and timestamptz */ - { NULL, "bool date_lt_timestamptz(date,timestamptz)", - 3, "t/f:date_lt_timestamptz" }, - { NULL, "bool date_le_timestamptz(date,timestamptz)", - 3, "t/f:date_le_timestamptz" }, - { NULL, "bool date_eq_timestamptz(date,timestamptz)", - 3, "t/f:date_eq_timestamptz" }, - { NULL, "bool date_ge_timestamptz(date,timestamptz)", - 3, "t/f:date_ge_timestamptz" }, - { NULL, "bool date_gt_timestamptz(date,timestamptz)", - 3, "t/f:date_gt_timestamptz" }, - { NULL, "bool date_ne_timestamptz(date,timestamptz)", - 3, "t/f:date_ne_timestamptz" }, - /* comparison between timestamptz and date */ - { NULL, "bool timestamptz_lt_date(timestamptz,date)", - 3, "t/f:timestamptz_lt_date" }, - { NULL, "bool timestamptz_le_date(timestamptz,date)", - 3, "t/f:timestamptz_le_date" }, - { NULL, "bool timestamptz_eq_date(timestamptz,date)", - 3, "t/f:timestamptz_eq_date" }, - { NULL, "bool timestamptz_ge_date(timestamptz,date)", - 3, "t/f:timestamptz_ge_date" }, - { NULL, "bool timestamptz_gt_date(timestamptz,date)", - 3, "t/f:timestamptz_gt_date" }, - { NULL, "bool timestamptz_ne_date(timestamptz,date)", - 3, "t/f:timestamptz_ne_date" }, - /* comparison between timestamp and timestamptz */ - { NULL, "bool timestamp_lt_timestamptz(timestamp,timestamptz)", - 2, "t/f:timestamp_lt_timestamptz" }, - { NULL, "bool 
timestamp_le_timestamptz(timestamp,timestamptz)", - 2, "t/f:timestamp_le_timestamptz" }, - { NULL, "bool timestamp_eq_timestamptz(timestamp,timestamptz)", - 2, "t/f:timestamp_eq_timestamptz" }, - { NULL, "bool timestamp_ge_timestamptz(timestamp,timestamptz)", - 2, "t/f:timestamp_ge_timestamptz" }, - { NULL, "bool timestamp_gt_timestamptz(timestamp,timestamptz)", - 2, "t/f:timestamp_gt_timestamptz" }, - { NULL, "bool timestamp_ne_timestamptz(timestamp,timestamptz)", - 2, "t/f:timestamp_ne_timestamptz" }, - /* comparison between timestamptz and timestamp */ - { NULL, "bool timestamptz_lt_timestamp(timestamptz,timestamp)", - 2, "t/f:timestamptz_lt_timestamp" }, - { NULL, "bool timestamptz_le_timestamp(timestamptz,timestamp)", - 2, "t/f:timestamptz_le_timestamp" }, - { NULL, "bool timestamptz_eq_timestamp(timestamptz,timestamp)", - 2, "t/f:timestamptz_eq_timestamp" }, - { NULL, "bool timestamptz_ge_timestamp(timestamptz,timestamp)", - 2, "t/f:timestamptz_ge_timestamp" }, - { NULL, "bool timestamptz_gt_timestamp(timestamptz,timestamp)", - 2, "t/f:timestamptz_gt_timestamp" }, - { NULL, "bool timestamptz_ne_timestamp(timestamptz,timestamp)", - 2, "t/f:timestamptz_ne_timestamp" }, - /* comparison between intervals */ - { NULL, "bool interval_eq(interval,interval)", 2, "t/f:interval_eq" }, - { NULL, "bool interval_ne(interval,interval)", 2, "t/f:interval_ne" }, - { NULL, "bool interval_lt(interval,interval)", 2, "t/f:interval_lt" }, - { NULL, "bool interval_le(interval,interval)", 2, "t/f:interval_le" }, - { NULL, "bool interval_ge(interval,interval)", 2, "t/f:interval_ge" }, - { NULL, "bool interval_gt(interval,interval)", 2, "t/f:interval_gt" }, - { NULL, "int4 interval_cmp(interval,interval)",2, "t/f:interval_cmp"}, - /* overlaps() */ - { NULL, "bool overlaps(time,time,time,time)", - 20, "t/f:overlaps_time" }, - { NULL, "bool overlaps(timetz,timetz,timetz,timetz)", - 20, "t/f:overlaps_timetz" }, - { NULL, "bool overlaps(timestamp,timestamp,timestamp,timestamp)", - 20, "t/f:overlaps_timestamp" }, - { NULL, "bool overlaps(timestamptz,timestamptz,timestamptz,timestamptz)", - 20, "t/f:overlaps_timestamptz" }, - /* extract() - PG14 changed to return numeric, not float8 */ - { NULL, "float8 date_part(text,timestamp)", - 100, "t/f:date_part_timestamp"}, - { NULL, "float8 date_part(text,timestamptz)", - 100, "t/f:date_part_timestamptz"}, - { NULL, "float8 date_part(text,interval)", - 100, "t/f:date_part_interval"}, - { NULL, "float8 date_part(text,timetz)", - 100, "t/f:date_part_timetz"}, - { NULL, "float8 date_part(text,time)", - 100, "t/f:date_part_time"}, - - { NULL, "numeric extract(text,timestamp)", - 100, "t/f:extract_timestamp"}, - { NULL, "numeric extract(text,timestamptz)", - 100, "t/f:extract_timestamptz"}, - { NULL, "numeric extract(text,time)", - 100, "t/f:extract_time"}, - { NULL, "numeric extract(text,timetz)", - 100, "t/f:extract_timetz"}, - { NULL, "numeric extract(text,interval)", - 100, "t/f:extract_interval"}, - - /* other time and date functions */ - { NULL, "timestamptz now()", 1, "t/f:now" }, - - /* macaddr functions */ - { NULL, "macaddr trunc(macaddr)", 8, "m/f:macaddr_trunc" }, - { NULL, "macaddr macaddr_not(macaddr)", 8, "m/f:macaddr_not" }, - { NULL, "macaddr macaddr_and(macaddr,macaddr)", 8, "m/f:macaddr_and" }, - { NULL, "macaddr macaddr_or(macaddr,macaddr)", 8, "m/f:macaddr_or" }, - - /* inet/cidr functions */ - { NULL, "inet set_masklen(inet,int4)", 8, "m/f:inet_set_masklen" }, - { NULL, "cidr set_masklen(cidr,int4)", 8, "m/f:cidr_set_masklen" }, - { NULL, "int4 
family(inet)", 8, "m/f:inet_family" }, - { NULL, "cidr network(inet)", 8, "m/f:network_network" }, - { NULL, "inet netmask(inet)", 8, "m/f:inet_netmask" }, - { NULL, "int4 masklen(inet)", 8, "m/f:inet_masklen" }, - { NULL, "inet broadcast(inet)", 8, "m/f:inet_broadcast" }, - { NULL, "iner hostmask(inet)", 8, "m/f:inet_hostmask" }, - { NULL, "cidr cidr(iner)", 8, "m/f:inet_to_cidr" }, - { NULL, "inet inetnot(inet)", 8, "m/f:inet_not" }, - { NULL, "inet inetand(inet,inet)", 8, "m/f:inet_and" }, - { NULL, "inet inetor(inet,inet)", 8, "m/f:inet_or" }, - { NULL, "inet inetpl(inet,int8)", 8, "m/f:inetpl_int8" }, - { NULL, "inet inetmi_int8(inet,int8)", 8, "m/f:inetmi_int8" }, - { NULL, "int8 inetmi(inet,inet)", 8, "m/f:inetmi" }, - { NULL, "bool inet_same_family(inet,inet)", 8, "m/f:inet_same_family" }, -// { NULL, "inet inet_merge(inet,inet)", 8, "m/f:inet_merge" }, - - /* - * Text functions - */ - { NULL, "bool bpchareq(bpchar,bpchar)", 200, "s/f:bpchareq" }, - { NULL, "bool bpcharne(bpchar,bpchar)", 200, "s/f:bpcharne" }, - { NULL, "bool bpcharlt(bpchar,bpchar)", 200, "sL/f:bpcharlt" }, - { NULL, "bool bpcharle(bpchar,bpchar)", 200, "sL/f:bpcharle" }, - { NULL, "bool bpchargt(bpchar,bpchar)", 200, "sL/f:bpchargt" }, - { NULL, "bool bpcharge(bpchar,bpchar)", 200, "sL/f:bpcharge" }, - { NULL, "int4 bpcharcmp(bpchar,bpchar)",200, "sL/f:type_compare"}, - { NULL, "int4 length(bpchar)", 2, "sL/f:bpcharlen"}, - { NULL, "bool texteq(text,text)", 200, "s/f:texteq" }, - { NULL, "bool textne(text,text)", 200, "s/f:textne" }, - { NULL, "bool text_lt(text,text)", 200, "sL/f:text_lt" }, - { NULL, "bool text_le(text,text)", 200, "sL/f:text_le" }, - { NULL, "bool text_gt(text,text)", 200, "sL/f:text_gt" }, - { NULL, "bool text_ge(text,text)", 200, "sL/f:text_ge" }, - { NULL, "int4 bttextcmp(text,text)", 200, "sL/f:type_compare" }, - /* LIKE operators */ - { NULL, "bool like(text,text)", 9999, "s/f:textlike" }, - { NULL, "bool textlike(text,text)", 9999, "s/f:textlike" }, - { NULL, "bool bpcharlike(bpchar,text)", 9999, "s/f:bpcharlike" }, - { NULL, "bool notlike(text,text)", 9999, "s/f:textnlike" }, - { NULL, "bool textnlike(text,text)", 9999, "s/f:textnlike" }, - { NULL, "bool bpcharnlike(bpchar,text)", 9999, "s/f:bpcharnlike" }, - /* ILIKE operators */ - { NULL, "bool texticlike(text,text)", 9999, "Ls/f:texticlike" }, - { NULL, "bool bpchariclike(text,text)", 9999, "Ls/f:bpchariclike" }, - { NULL, "bool texticnlike(text,text)", 9999, "Ls/f:texticnlike" }, - { NULL, "bool bpcharicnlike(bpchar,text)",9999, "Ls/f:bpcharicnlike" }, - /* string operations */ - { NULL, "int4 length(text)", 2, "s/f:textlen" }, - { NULL, "text textcat(text,text)", - 999, "Cs/f:textcat", - vlbuf_estimate_textcat - }, - { NULL, "text concat(text,text)", - 999, "Cs/f:text_concat2", - vlbuf_estimate_textcat - }, - { NULL, "text concat(text,text,text)", - 999, "Cs/f:text_concat3", - vlbuf_estimate_textcat - }, - { NULL, "text concat(text,text,text,text)", - 999, "Cs/f:text_concat4", - vlbuf_estimate_textcat - }, - { NULL, "text substr(text,int4,int4)", - 10, "Cs/f:text_substring", - vlbuf_estimate_substring - }, - { NULL, "text substring(text,int4,int4)", - 10, "Cs/f:text_substring", - vlbuf_estimate_substring - }, - { NULL, "text substr(text,int4)", - 10, "Cs/f:text_substring_nolen", - vlbuf_estimate_substring - }, - { NULL, "text substring(text,int4)", - 10, "Cs/f:text_substring_nolen", - vlbuf_estimate_substring - }, - /* jsonb operators */ - { NULL, "jsonb jsonb_object_field(jsonb,text)", - 1000, "jC/f:jsonb_object_field", - 
vlbuf_estimate_jsonb - }, - { NULL, "text jsonb_object_field_text(jsonb,text)", - 1000, "jC/f:jsonb_object_field_text", - vlbuf_estimate_jsonb - }, - { NULL, "jsonb jsonb_array_element(jsonb,int4)", - 1000, "jC/f:jsonb_array_element", - vlbuf_estimate_jsonb - }, - { NULL, "text jsonb_array_element_text(jsonb,int4)", - 1000, "jC/f:jsonb_array_element_text", - vlbuf_estimate_jsonb - }, - { NULL, "bool jsonb_exists(jsonb,text)", - 100, "j/f:jsonb_exists" - }, - /* - * int4range operators - */ - { NULL, "int4 lower(int4range)", 2, "r/f:int4range_lower" }, - { NULL, "int4 upper(int4range)", 2, "r/f:int4range_upper" }, - { NULL, "bool isempty(int4range)", 1, "r/f:int4range_isempty" }, - { NULL, "bool lower_inc(int4range)", 1, "r/f:int4range_lower_inc" }, - { NULL, "bool upper_inc(int4range)", 1, "r/f:int4range_upper_inc" }, - { NULL, "bool lower_inf(int4range)", 1, "r/f:int4range_lower_inf" }, - { NULL, "bool upper_inf(int4range)", 1, "r/f:int4range_upper_inf" }, - { NULL, "bool range_eq(int4range,int4range)", 2, "r/f:int4range_eq" }, - { NULL, "bool range_ne(int4range,int4range)", 2, "r/f:int4range_ne" }, - { NULL, "bool range_lt(int4range,int4range)", 2, "r/f:int4range_lt" }, - { NULL, "bool range_le(int4range,int4range)", 2, "r/f:int4range_le" }, - { NULL, "bool range_gt(int4range,int4range)", 2, "r/f:int4range_gt" }, - { NULL, "bool range_ge(int4range,int4range)", 2, "r/f:int4range_ge" }, - { NULL, "int4 range_cmp(int4range,int4range)",2, "r/f:int4range_cmp"}, - { NULL, "bool range_overlaps(int4range,int4range)", - 4, "r/f:int4range_overlaps" }, - { NULL, "bool range_contains_elem(int4range,int4)", - 4, "r/f:int4range_contains_elem" }, - { NULL, "bool range_contains(int4range,int4range)", - 4, "r/f:int4range_contains" }, - { NULL, "bool elem_contained_by_range(int4,int4range)", - 4, "r/f:elem_contained_by_int4range" }, - { NULL, "bool range_contained_by(int4range,int4range)", - 4, "r/f:int4range_contained_by" }, - { NULL, "bool range_adjacent(int4range,int4range)", - 4, "r/f:int4range_adjacent" }, - { NULL, "bool range_before(int4range,int4range)", - 4, "r/f:int4range_before" }, - { NULL, "bool range_after(int4range,int4range)", - 4, "r/f:int4range_after" }, - { NULL, "bool range_overleft(int4range,int4range)", - 4, "r/f:int4range_overleft" }, - { NULL, "bool range_overright(int4range,int4range)", - 4, "r/f:int4range_overright" }, - { NULL, "int4range range_union(int4range,int4range)", - 4, "r/f:int4range_union" }, - { NULL, "int4range range_merge(int4range,int4range)", - 4, "r/f:int4range_merge" }, - { NULL, "int4range range_intersect(int4range,int4range)", - 4, "r/f:int4range_intersect" }, - { NULL, "int4range range_minus(int4range,int4range)", - 4, "r/f:int4range_minus" }, - /* - * int8range operators - */ - { NULL, "int8 lower(int8range)", 2, "r/f:int8range_lower" }, - { NULL, "int8 upper(int8range)", 2, "r/f:int8range_upper" }, - { NULL, "bool isempty(int8range)", 1, "r/f:int8range_isempty" }, - { NULL, "bool lower_inc(int8range)", 1, "r/f:int8range_lower_inc" }, - { NULL, "bool upper_inc(int8range)", 1, "r/f:int8range_upper_inc" }, - { NULL, "bool lower_inf(int8range)", 1, "r/f:int8range_lower_inf" }, - { NULL, "bool upper_inf(int8range)", 1, "r/f:int8range_upper_inf" }, - { NULL, "bool range_eq(int8range,int8range)", 2, "r/f:int8range_eq" }, - { NULL, "bool range_ne(int8range,int8range)", 2, "r/f:int8range_ne" }, - { NULL, "bool range_lt(int8range,int8range)", 2, "r/f:int8range_lt" }, - { NULL, "bool range_le(int8range,int8range)", 2, "r/f:int8range_le" }, - { NULL, "bool 
range_gt(int8range,int8range)", 2, "r/f:int8range_gt" }, - { NULL, "bool range_ge(int8range,int8range)", 2, "r/f:int8range_ge" }, - { NULL, "int4 range_cmp(int8range,int8range)",2, "r/f:int8range_cmp"}, - { NULL, "bool range_overlaps(int8range,int8range)", - 4, "r/f:int8range_overlaps" }, - { NULL, "bool range_contains_elem(int8range,int8)", - 4, "r/f:int8range_contains_elem" }, - { NULL, "bool range_contains(int8range,int8range)", - 4, "r/f:int8range_contains" }, - { NULL, "bool elem_contained_by_range(int8,int8range)", - 4, "r/f:elem_contained_by_int8range" }, - { NULL, "bool range_contained_by(int8range,int8range)", - 4, "r/f:int8range_contained_by" }, - { NULL, "bool range_adjacent(int8range,int8range)", - 4, "r/f:int8range_adjacent" }, - { NULL, "bool range_before(int8range,int8range)", - 4, "r/f:int8range_before" }, - { NULL, "bool range_after(int8range,int8range)", - 4, "r/f:int8range_after" }, - { NULL, "bool range_overleft(int8range,int8range)", - 4, "r/f:int8range_overleft" }, - { NULL, "bool range_overright(int8range,int8range)", - 4, "r/f:int8range_overright" }, - { NULL, "int8range range_union(int8range,int8range)", - 4, "r/f:int8range_union" }, - { NULL, "int8range range_merge(int8range,int8range)", - 4, "r/f:int8range_merge" }, - { NULL, "int8range range_intersect(int8range,int8range)", - 4, "r/f:int8range_intersect" }, - { NULL, "int8range range_minus(int8range,int8range)", - 4, "r/f:int8range_minus" }, - /* - * tsrange operators - */ - { NULL, "timestamp lower(tsrange)", 2, "r/f:tsrange_lower" }, - { NULL, "timestamp upper(tsrange)", 2, "r/f:tsrange_upper" }, - { NULL, "bool isempty(tsrange)", 1, "r/f:tsrange_isempty" }, - { NULL, "bool lower_inc(tsrange)", 1, "r/f:tsrange_lower_inc" }, - { NULL, "bool upper_inc(tsrange)", 1, "r/f:tsrange_upper_inc" }, - { NULL, "bool lower_inf(tsrange)", 1, "r/f:tsrange_lower_inf" }, - { NULL, "bool upper_inf(tsrange)", 1, "r/f:tsrange_upper_inf" }, - { NULL, "bool range_eq(tsrange,tsrange)", 2, "r/f:tsrange_eq" }, - { NULL, "bool range_ne(tsrange,tsrange)", 2, "r/f:tsrange_ne" }, - { NULL, "bool range_lt(tsrange,tsrange)", 2, "r/f:tsrange_lt" }, - { NULL, "bool range_le(tsrange,tsrange)", 2, "r/f:tsrange_le" }, - { NULL, "bool range_gt(tsrange,tsrange)", 2, "r/f:tsrange_gt" }, - { NULL, "bool range_ge(tsrange,tsrange)", 2, "r/f:tsrange_ge" }, - { NULL, "int4 range_cmp(tsrange,tsrange)",2, "r/f:tsrange_cmp"}, - { NULL, "bool range_overlaps(tsrange,tsrange)", - 4, "r/f:tsrange_overlaps" }, - { NULL, "bool range_contains_elem(tsrange,timestamp)", - 4, "r/f:tsrange_contains_elem" }, - { NULL, "bool range_contains(tsrange,tsrange)", - 4, "r/f:tsrange_contains" }, - { NULL, "bool elem_contained_by_range(timestamp,tsrange)", - 4, "r/f:elem_contained_by_tsrange" }, - { NULL, "bool range_contained_by(tsrange,tsrange)", - 4, "r/f:tsrange_contained_by" }, - { NULL, "bool range_adjacent(tsrange,tsrange)", - 4, "r/f:tsrange_adjacent" }, - { NULL, "bool range_before(tsrange,tsrange)", - 4, "r/f:tsrange_before" }, - { NULL, "bool range_after(tsrange,tsrange)", - 4, "r/f:tsrange_after" }, - { NULL, "bool range_overleft(tsrange,tsrange)", - 4, "r/f:tsrange_overleft" }, - { NULL, "bool range_overright(tsrange,tsrange)", - 4, "r/f:tsrange_overright" }, - { NULL, "tsrange range_union(tsrange,tsrange)", - 4, "r/f:tsrange_union" }, - { NULL, "tsrange range_merge(tsrange,tsrange)", - 4, "r/f:tsrange_merge" }, - { NULL, "tsrange range_intersect(tsrange,tsrange)", - 4, "r/f:tsrange_intersect" }, - { NULL, "tsrange range_minus(tsrange,tsrange)", - 4, 
"r/f:tsrange_minus" }, - /* - * tstzrange operators - */ - { NULL, "timestamptz lower(tstzrange)", 2, "r/f:tstzrange_lower" }, - { NULL, "timestamptz upper(tstzrange)", 2, "r/f:tstzrange_upper" }, - { NULL, "bool isempty(tstzrange)", 1, "r/f:tstzrange_isempty" }, - { NULL, "bool lower_inc(tstzrange)", 1, "r/f:tstzrange_lower_inc" }, - { NULL, "bool upper_inc(tstzrange)", 1, "r/f:tstzrange_upper_inc" }, - { NULL, "bool lower_inf(tstzrange)", 1, "r/f:tstzrange_lower_inf" }, - { NULL, "bool upper_inf(tstzrange)", 1, "r/f:tstzrange_upper_inf" }, - { NULL, "bool range_eq(tstzrange,tstzrange)", 2, "r/f:tstzrange_eq" }, - { NULL, "bool range_ne(tstzrange,tstzrange)", 2, "r/f:tstzrange_ne" }, - { NULL, "bool range_lt(tstzrange,tstzrange)", 2, "r/f:tstzrange_lt" }, - { NULL, "bool range_le(tstzrange,tstzrange)", 2, "r/f:tstzrange_le" }, - { NULL, "bool range_gt(tstzrange,tstzrange)", 2, "r/f:tstzrange_gt" }, - { NULL, "bool range_ge(tstzrange,tstzrange)", 2, "r/f:tstzrange_ge" }, - { NULL, "int4 range_cmp(tstzrange,tstzrange)",2, "r/f:tstzrange_cmp"}, - { NULL, "bool range_overlaps(tstzrange,tstzrange)", - 4, "r/f:tstzrange_overlaps" }, - { NULL, "bool range_contains_elem(tstzrange,timestamptz)", - 4, "r/f:tstzrange_contains_elem" }, - { NULL, "bool range_contains(tstzrange,tstzrange)", - 4, "r/f:tstzrange_contains" }, - { NULL, "bool elem_contained_by_range(timestamptz,tstzrange)", - 4, "r/f:elem_contained_by_tstzrange" }, - { NULL, "bool range_contained_by(tstzrange,tstzrange)", - 4, "r/f:tstzrange_contained_by" }, - { NULL, "bool range_adjacent(tstzrange,tstzrange)", - 4, "r/f:tstzrange_adjacent" }, - { NULL, "bool range_before(tstzrange,tstzrange)", - 4, "r/f:tstzrange_before" }, - { NULL, "bool range_after(tstzrange,tstzrange)", - 4, "r/f:tstzrange_after" }, - { NULL, "bool range_overleft(tstzrange,tstzrange)", - 4, "r/f:tstzrange_overleft" }, - { NULL, "bool range_overright(tstzrange,tstzrange)", - 4, "r/f:tstzrange_overright" }, - { NULL, "tstzrange range_union(tstzrange,tstzrange)", - 4, "r/f:tstzrange_union" }, - { NULL, "tstzrange range_merge(tstzrange,tstzrange)", - 4, "r/f:tstzrange_merge" }, - { NULL, "tstzrange range_intersect(tstzrange,tstzrange)", - 4, "r/f:tstzrange_intersect" }, - { NULL, "tstzrange range_minus(tstzrange,tstzrange)", - 4, "r/f:tstzrange_minus" }, - /* - * daterange operators - */ - { NULL, "date lower(daterange)", 2, "r/f:daterange_lower" }, - { NULL, "date upper(daterange)", 2, "r/f:daterange_upper" }, - { NULL, "bool isempty(daterange)", 1, "r/f:daterange_isempty" }, - { NULL, "bool lower_inc(daterange)", 1, "r/f:daterange_lower_inc" }, - { NULL, "bool upper_inc(daterange)", 1, "r/f:daterange_upper_inc" }, - { NULL, "bool lower_inf(daterange)", 1, "r/f:daterange_lower_inf" }, - { NULL, "bool upper_inf(daterange)", 1, "r/f:daterange_upper_inf" }, - { NULL, "bool range_eq(daterange,daterange)", 2, "r/f:daterange_eq" }, - { NULL, "bool range_ne(daterange,daterange)", 2, "r/f:daterange_ne" }, - { NULL, "bool range_lt(daterange,daterange)", 2, "r/f:daterange_lt" }, - { NULL, "bool range_le(daterange,daterange)", 2, "r/f:daterange_le" }, - { NULL, "bool range_gt(daterange,daterange)", 2, "r/f:daterange_gt" }, - { NULL, "bool range_ge(daterange,daterange)", 2, "r/f:daterange_ge" }, - { NULL, "int4 range_cmp(daterange,daterange)",2, "r/f:daterange_cmp"}, - { NULL, "bool range_overlaps(daterange,daterange)", - 4, "r/f:daterange_overlaps" }, - { NULL, "bool range_contains_elem(daterange,date)", - 4, "r/f:daterange_contains_elem" }, - { NULL, "bool 
range_contains(daterange,daterange)", - 4, "r/f:daterange_contains" }, - { NULL, "bool elem_contained_by_range(date,daterange)", - 4, "r/f:elem_contained_by_daterange" }, - { NULL, "bool range_contained_by(daterange,daterange)", - 4, "r/f:daterange_contained_by" }, - { NULL, "bool range_adjacent(daterange,daterange)", - 4, "r/f:daterange_adjacent" }, - { NULL, "bool range_before(daterange,daterange)", - 4, "r/f:daterange_before" }, - { NULL, "bool range_after(daterange,daterange)", - 4, "r/f:daterange_after" }, - { NULL, "bool range_overleft(daterange,daterange)", - 4, "r/f:daterange_overleft" }, - { NULL, "bool range_overright(daterange,daterange)", - 4, "r/f:daterange_overright" }, - { NULL, "daterange range_union(daterange,daterange)", - 4, "r/f:daterange_union" }, - { NULL, "daterange range_merge(daterange,daterange)", - 4, "r/f:daterange_merge" }, - { NULL, "daterange range_intersect(daterange,daterange)", - 4, "r/f:daterange_intersect" }, - { NULL, "daterange range_minus(daterange,daterange)", - 4, "r/f:daterange_minus" }, - - /* - * PostGIS functions - */ - { POSTGIS3, "geometry st_setsrid(geometry,int4)", - 1, "g/f:st_setsrid" }, - { POSTGIS3, "geometry st_point(float8,float8)", - 10, "gC/f:st_makepoint2", - vlbuf_estimate__st_makepoint }, - { POSTGIS3, "geometry st_makepoint(float8,float8)", - 10, "gC/f:st_makepoint2", - vlbuf_estimate__st_makepoint }, - { POSTGIS3, "geometry st_makepoint(float8,float8,float8)", - 10, "gC/f:st_makepoint3", - vlbuf_estimate__st_makepoint }, - { POSTGIS3, "geometry st_makepoint(float8,float8,float8,float8)", - 10, "gC/f:st_makepoint4", - vlbuf_estimate__st_makepoint }, - { POSTGIS3, "float8 st_distance(geometry,geometry)", - 50, "g/f:st_distance" }, - { POSTGIS3, "bool st_dwithin(geometry,geometry,float8)", - 50, "g/f:st_dwithin" }, - { POSTGIS3, "int4 st_linecrossingdirection(geometry,geometry)", - 50, "g/f:st_linecrossingdirection" }, - { POSTGIS3, "text st_relate(geometry,geometry)", - 999, "g/f:st_relate", - vlbuf_estimate__st_relate }, - { POSTGIS3, "bool st_contains(geometry,geometry)", - 999, "g/f:st_contains" }, - { POSTGIS3, "bool st_crosses(geometry,geometry)", - 999, "g/f:st_crosses" }, - { POSTGIS3, "bool geometry_overlaps(geometry,geometry)", - 10, "g/f:geometry_overlaps" }, - { POSTGIS3, "bool overlaps_2d(box2df,geometry)", - 10, "g/f:box2df_geometry_overlaps" }, - { POSTGIS3, "bool geometry_contains(geometry,geometry)", - 10, "g/f:geometry_contains" }, - { POSTGIS3, "bool contains_2d(box2df,geometry)", - 10, "g/f:box2df_geometry_contains" }, - { POSTGIS3, "bool geometry_within(geometry,geometry)", - 10, "g/f:geometry_within" }, - { POSTGIS3, "bool is_contained_2d(box2df,geometry)", - 10, "g/f:box2df_geometry_within" }, - { POSTGIS3, "geometry st_expand(geometry,float8)", - 20, "gC/f:st_expand", - vlbuf_estimate__st_expand }, - /* - * GpuPreAgg COUNT(distinct KEY) support - */ - { PGSTROM, "int8 hll_hash(int1)", 1, "f:hll_hash_int1" }, - { PGSTROM, "int8 hll_hash(int2)", 1, "f:hll_hash_int2" }, - { PGSTROM, "int8 hll_hash(int4)", 1, "f:hll_hash_int4" }, - { PGSTROM, "int8 hll_hash(int8)", 1, "f:hll_hash_int8" }, - { PGSTROM, "int8 hll_hash(numeric)", 1, "f:hll_hash_numeric" }, - { PGSTROM, "int8 hll_hash(date)", 1, "t/f:hll_hash_date" }, - { PGSTROM, "int8 hll_hash(time)", 1, "t/f:hll_hash_time" }, - { PGSTROM, "int8 hll_hash(timetz)", 1, "t/f:hll_hash_timetz" }, - { PGSTROM, "int8 hll_hash(timestamp)", 1, "t/f:hll_hash_timestamp" }, - { PGSTROM, "int8 hll_hash(timestamptz)", 1, "t/f:hll_hash_timestamptz" }, - { PGSTROM, "int8 
hll_hash(bpchar)", 1, "s/f:hll_hash_bpchar" }, - { PGSTROM, "int8 hll_hash(text)", 1, "s/f:hll_hash_text" }, - { PGSTROM, "int8 hll_hash(uuid)", 1, "m/f:hll_hash_uuid"} -}; - -/* default of dfunc->dfunc_varlena_sz if not specified */ -static int -devfunc_generic_result_sz(codegen_context *context, - devfunc_info *dfunc, - Expr **args, int *vl_width) -{ - devtype_info *rtype = dfunc->func_rettype; - - if (rtype->type_length > 0) - return rtype->type_length; - else if (rtype->type_length == -1) - return type_maximum_size(rtype->type_oid, -1); - elog(ERROR, "unexpected type length: %d", rtype->type_length); -} +typedef struct { + Oid func_oid; + int func_nargs; + Oid func_argtypes[1]; +} devfunc_cache_signature; static devfunc_info * -__construct_devfunc_info(const char *func_extension, - HeapTuple protup, - devtype_info *dfunc_rettype, - int dfunc_nargs, - devtype_info **dfunc_argtypes, - Oid dfunc_collid, - int func_devcost, - const char *func_template, - devfunc_result_sz_type devfunc_result_sz) +__pgstrom_devfunc_lookup(Oid func_oid, + int func_nargs, + Oid *func_argtypes, + Oid func_collid) { - Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(protup); - MemoryContext oldcxt; + devfunc_cache_signature *signature; + devtype_info *dtype = NULL; devfunc_info *dfunc = NULL; - List *dfunc_args = NIL; - const char *pos; - const char *end; - int32 flags = 0; - int j; - bool has_collation = false; - bool has_callbacks = false; - - /* fetch attribute */ - end = strchr(func_template, '/'); - if (end) - { - for (pos = func_template; pos < end; pos++) + ListCell *lc; + uint32_t hash; + int i, j, sz; + + sz = offsetof(devfunc_cache_signature, func_argtypes[func_nargs]); + signature = alloca(sz); + memset(signature, 0, sz); + signature->func_oid = func_oid; + signature->func_nargs = func_nargs; + for (i=0; i < func_nargs; i++) + signature->func_argtypes[i] = func_argtypes[i]; + hash = hash_any((unsigned char *)signature, sz); + + i = hash % DEVFUNC_INFO_NSLOTS; + foreach (lc, devfunc_info_slot[i]) + { + dfunc = lfirst(lc); + if (dfunc->hash == hash && + dfunc->func_oid == func_oid && + dfunc->func_nargs == func_nargs) { - switch (*pos) + for (j=0; j < func_nargs; j++) { - case 'L': - has_collation = true; - break; - case 'C': - has_callbacks = true; - break; - case 'p': - flags |= DEVKERNEL_NEEDS_PRIMITIVE; - break; - case 's': - flags |= DEVKERNEL_NEEDS_TEXTLIB; - break; - case 't': - flags |= DEVKERNEL_NEEDS_TIMELIB; - break; - case 'j': - flags |= DEVKERNEL_NEEDS_JSONLIB; - break; - case 'm': - flags |= DEVKERNEL_NEEDS_MISCLIB; - break; - case 'r': - flags |= DEVKERNEL_NEEDS_RANGETYPE; - break; - case 'g': - flags |= DEVKERNEL_NEEDS_POSTGIS; - break; - default: - elog(NOTICE, - "Bug? unknown devfunc property: %c", - *pos); + dtype = dfunc->func_argtypes[j]; + if (dtype->type_oid != func_argtypes[j]) break; } + if (j == func_nargs) + goto found; } - func_template = end + 1; } - if (strncmp(func_template, "f:", 2) != 0) + /* not found, build a new entry */ + dfunc = pgstrom_devfunc_build(func_oid, func_nargs, func_argtypes); + if (!dfunc) { - elog(NOTICE, "Bug? 
unknown device function template: '%s'", - func_template); - return NULL; + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + dfunc = palloc0(offsetof(devfunc_info, func_argtypes[func_nargs])); + dfunc->func_oid = func_oid; + dfunc->func_nargs = func_nargs; + dfunc->func_is_negative = true; + for (i=0; i < func_nargs; i++) + { + dtype = pgstrom_devtype_lookup(func_argtypes[i]); + if (!dtype) + { + dtype = palloc0(sizeof(devtype_info)); + dtype->type_oid = func_argtypes[i]; + dtype->type_is_negative = true; + } + dfunc->func_argtypes[i] = dtype; + } + MemoryContextSwitchTo(oldcxt); } - oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - for (j=0; j < dfunc_nargs; j++) - dfunc_args = lappend(dfunc_args, dfunc_argtypes[j]); - - dfunc = palloc0(sizeof(devfunc_info)); - if (func_extension) - dfunc->func_extension = pstrdup(func_extension); - dfunc->func_oid = PgProcTupleGetOid(protup); - if (has_collation) + dfunc->hash = hash; + devfunc_info_slot[i] = lappend_cxt(devinfo_memcxt, + devfunc_info_slot[i], dfunc); + if (!dfunc->func_is_negative) { - if (OidIsValid(dfunc_collid) && !lc_collate_is_c(dfunc_collid)) - dfunc->func_is_negative = true; - dfunc->func_collid = dfunc_collid; + hash = hash_any((unsigned char *)&dfunc->func_code, sizeof(FuncOpCode)); + i = hash % DEVFUNC_INFO_NSLOTS; + devfunc_code_slot[i] = lappend_cxt(devinfo_memcxt, + devfunc_code_slot[i], dfunc); } - dfunc->func_is_strict = proc->proisstrict; - dfunc->func_flags = flags; - dfunc->func_args = dfunc_args; - dfunc->func_rettype = dfunc_rettype; - dfunc->func_sqlname = pstrdup(NameStr(proc->proname)); - dfunc->func_devname = func_template + 2; /* const cstring */ - dfunc->func_devcost = func_devcost; - dfunc->devfunc_result_sz = (has_callbacks - ? devfunc_result_sz - : devfunc_generic_result_sz); - /* other fields shall be assigned on the caller side */ - MemoryContextSwitchTo(oldcxt); - +found: + if (dfunc->func_is_negative) + return NULL; + if (OidIsValid(func_collid) && !lc_collate_is_c(func_collid) && + (dfunc->func_flags & DEVFUNC__LOCALE_AWARE) != 0) + return NULL; return dfunc; } -static devfunc_info * -pgstrom_devfunc_construct_fuzzy(const char *func_extension, - HeapTuple protup, - devtype_info *dfunc_rettype, - int dfunc_nargs, - devtype_info **dfunc_argtypes, - Oid dfunc_collid, - int fuzzy_index_head, - int fuzzy_index_tail) +devfunc_info * +pgstrom_devfunc_lookup(Oid func_oid, + List *func_args, + Oid func_collid) { - Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(protup); - char buffer[512]; - int i, j; - - Assert(fuzzy_index_head >= 0 && - fuzzy_index_head <= fuzzy_index_tail && - fuzzy_index_tail < lengthof(devfunc_common_catalog)); - for (i = fuzzy_index_head; i <= fuzzy_index_tail; i++) - { - devfunc_catalog_t *procat = devfunc_common_catalog + i; - devtype_info *dtype; - char *tok; - char *pos; + int i, nargs = list_length(func_args); + Oid *argtypes; + ListCell *lc; - if (func_extension) - { - if (!procat->func_extension || - strcmp(procat->func_extension, func_extension) != 0) - continue; - } - else - { - if (procat->func_extension) - continue; - } - strncpy(buffer, procat->func_signature, sizeof(buffer)); - pos = strchr(buffer, ' '); - if (!pos) - continue; - *pos++ = '\0'; + i = 0; + argtypes = alloca(sizeof(Oid) * nargs); + foreach (lc, func_args) + { + Node *node = lfirst(lc); - /* check the function name */ - tok = pos; - pos = strchr(pos, '('); - if (!pos) - continue; - *pos++ = '\0'; - if (strcmp(tok, NameStr(proc->proname)) != 0) - continue; + argtypes[i++] = exprType(node); 
+ } + return __pgstrom_devfunc_lookup(func_oid, nargs, argtypes, func_collid); +} - /* check the argument types */ - for (j=0; j < dfunc_nargs; j++) - { - tok = pos; - pos = strchr(pos, (j < dfunc_nargs - 1 ? ',' : ')')); - if (!pos) - break; /* not match */ - *pos++ = '\0'; +static devfunc_info * +devfunc_lookup_by_opcode(FuncOpCode func_code) +{ + Datum hash; + uint32_t index; + ListCell *lc; - dtype = pgstrom_devtype_lookup_by_name(tok); - if (!dtype) - break; /* not match */ - if (dtype->type_oid != dfunc_argtypes[j]->type_oid && - !pgstrom_devtype_can_relabel(dfunc_argtypes[j]->type_oid, - dtype->type_oid)) - break; /* not match */ - } - if (j < dfunc_nargs) - continue; - /* check the result type */ - dtype = pgstrom_devtype_lookup_by_name(buffer); - if (!dtype) - continue; - if (dtype->type_oid != dfunc_rettype->type_oid && - !pgstrom_devtype_can_relabel(dtype->type_oid, - dfunc_rettype->type_oid)) - continue; + hash = hash_any((unsigned char *)&func_code, sizeof(FuncOpCode)); + index = hash % DEVFUNC_INFO_NSLOTS; + foreach (lc, devfunc_code_slot[index]) + { + devfunc_info *dfunc = lfirst(lc); - /* Ok, found the fuzzy entry */ - return __construct_devfunc_info(func_extension, - protup, - dfunc_rettype, - dfunc_nargs, - dfunc_argtypes, - dfunc_collid, - procat->func_devcost, - procat->func_template, - procat->devfunc_result_sz); + if (dfunc->func_code == func_code) + return dfunc; } - /* not found */ return NULL; } -static devfunc_info * -build_extra_devfunc_info(const char *func_extension, - HeapTuple protup, - devtype_info *dfunc_rettype, - int dfunc_nargs, - devtype_info **dfunc_argtypes, - Oid dfunc_collid) +/* + * lookup special purpose devfuncs + */ +devfunc_info * +devtype_lookup_equal_func(devtype_info *dtype, Oid coll_id) { - Form_pg_proc proc_form = (Form_pg_proc) GETSTRUCT(protup); - StringInfoData ident; - devfunc_info __dfunc; - devfunc_info *dfunc = NULL; - List *dfunc_args = NIL; - const char *nsp_name; - int i; - - /* setup devfunc identifier */ - initStringInfo(&ident); - append_string_devtype_identifier(&ident, dfunc_rettype->type_oid); - nsp_name = get_namespace_name(proc_form->pronamespace); - appendStringInfo(&ident, " %s.%s(", - quote_identifier(nsp_name), - quote_identifier(NameStr(proc_form->proname))); - for (i=0; i < dfunc_nargs; i++) + if (OidIsValid(dtype->type_eqfunc)) { - devtype_info *dtype = dfunc_argtypes[i]; + Oid argtypes[2]; - if (i > 0) - appendStringInfoChar(&ident, ','); - append_string_devtype_identifier(&ident, dtype->type_oid); - dfunc_args = lappend(dfunc_args, dtype); + argtypes[0] = dtype->type_oid; + argtypes[1] = dtype->type_oid; + return __pgstrom_devfunc_lookup(dtype->type_eqfunc, 2, argtypes, coll_id); } - appendStringInfoChar(&ident, ')'); - - memset(&__dfunc, 0, sizeof(devfunc_info)); - __dfunc.func_extension = func_extension; - __dfunc.func_oid = PgProcTupleGetOid(protup); - __dfunc.hashvalue = GetSysCacheHashValue(PROCOID, __dfunc.func_oid, 0, 0, 0); - __dfunc.func_collid = dfunc_collid; - __dfunc.func_is_strict = proc_form->proisstrict; - __dfunc.func_args = dfunc_args; - __dfunc.func_rettype = dfunc_rettype; - __dfunc.func_sqlname = NameStr(proc_form->proname); - __dfunc.func_devname = NULL; /* callback must set */ - __dfunc.func_devcost = 0; /* callback must set */ - __dfunc.devfunc_result_sz = NULL; /* callback must set, if any */ - - for (i=0; i < pgstrom_num_users_extra; i++) - { - pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; + return NULL; +} - if (extra->lookup_extra_devfunc && - 
extra->lookup_extra_devfunc(ident.data, &__dfunc)) - { - MemoryContext oldcxt; +devfunc_info * +devtype_lookup_compare_func(devtype_info *dtype, Oid coll_id) +{ + if (OidIsValid(dtype->type_cmpfunc)) + { + Oid argtypes[2]; - /* must be */ - if (!__dfunc.func_devname) - { - elog(DEBUG2, "Extra module didn't set device function name for %s", - format_procedure(__dfunc.func_oid)); - continue; - } - oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - dfunc = palloc0(sizeof(devfunc_info)); - dfunc->func_extension = pstrdup(__dfunc.func_extension); - dfunc->func_oid = __dfunc.func_oid; - dfunc->func_collid = __dfunc.func_collid; - dfunc->func_is_negative = __dfunc.func_is_negative; - dfunc->func_is_strict = __dfunc.func_is_strict; - dfunc->func_flags = __dfunc.func_flags; - dfunc->func_args = list_copy(__dfunc.func_args); - dfunc->func_rettype = __dfunc.func_rettype; - dfunc->func_sqlname = pstrdup(__dfunc.func_sqlname); - dfunc->func_devname = pstrdup(__dfunc.func_devname); - if (__dfunc.devfunc_result_sz) - dfunc->devfunc_result_sz = __dfunc.devfunc_result_sz; - else - dfunc->devfunc_result_sz = devfunc_generic_result_sz; - MemoryContextSwitchTo(oldcxt); - break; - } + argtypes[0] = dtype->type_oid; + argtypes[1] = dtype->type_oid; + return __pgstrom_devfunc_lookup(dtype->type_cmpfunc, 2, argtypes, coll_id); } - pfree(ident.data); - return dfunc; + return NULL; } -static devfunc_info * -pgstrom_devfunc_construct(HeapTuple protup, - Oid func_rettype, - oidvector *func_argtypes, - Oid func_collid) +/* ---------------------------------------------------------------- + * + * xPU pseudo code generator + * + * ---------------------------------------------------------------- + */ +#define __Elog(fmt,...) \ + do { \ + ereport(context->elevel, \ + (errcode(ERRCODE_INTERNAL_ERROR), \ + errmsg("(%s:%d) " fmt, __FUNCTION__, __LINE__, \ + ##__VA_ARGS__), \ + errdetail("problematic expression: %s", \ + nodeToString(context->top_expr)))); \ + return -1; \ + } while(0) + +static int codegen_expression_walker(codegen_context *context, + StringInfo buf, Expr *expr); + +void +codegen_context_init(codegen_context *context, uint32_t task_kind) +{ + memset(context, 0, sizeof(codegen_context)); + context->elevel = ERROR; + context->required_flags = (task_kind & DEVKIND__ANY); +} + +static void +__appendKernExpMagicAndLength(StringInfo buf, int head_pos) +{ + static uint64_t __zero = 0; + const kern_expression *kexp; + int padding = (INTALIGN(buf->len) - buf->len); + uint32_t magic; + + if (padding > 0) + appendBinaryStringInfo(buf, (char *)&__zero, padding); + kexp = (const kern_expression *)(buf->data + head_pos); + magic = (KERN_EXPRESSION_MAGIC + ^ ((uint32_t)kexp->exptype << 6) + ^ ((uint32_t)kexp->opcode << 14)); + appendBinaryStringInfo(buf, (char *)&magic, sizeof(uint32_t)); + ((kern_expression *)(buf->data + head_pos))->len = buf->len - head_pos; +} + +static int +codegen_const_expression(codegen_context *context, + StringInfo buf, Const *con) { - Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(protup); - const char *func_extension; - StringInfoData sig; devtype_info *dtype; - devtype_info *dfunc_rettype; - devtype_info **dfunc_argtypes; - devfunc_info *dfunc = NULL; - int fuzzy_index_head = -1; - int fuzzy_index_tail = -1; - int i; + char typtype; - /* extension name */ - func_extension = get_extension_name_by_object(ProcedureRelationId, - PgProcTupleGetOid(protup)); - /* make a signature string */ - initStringInfo(&sig); - dfunc_rettype = pgstrom_devtype_lookup(func_rettype); - if (!dfunc_rettype) - goto 
not_found; - appendStringInfo(&sig, "%s %s(", - dfunc_rettype->type_name, - NameStr(proc->proname)); - - dfunc_argtypes = alloca(sizeof(devtype_info *) * func_argtypes->dim1); - for (i=0; i < func_argtypes->dim1; i++) - { - dtype = pgstrom_devtype_lookup(func_argtypes->values[i]); - if (!dtype) - goto not_found; - if (i > 0) - appendStringInfoChar(&sig, ','); - appendStringInfo(&sig, "%s", dtype->type_name); - dfunc_argtypes[i] = dtype; - } - appendStringInfoChar(&sig, ')'); + typtype = get_typtype(con->consttype); + if (typtype != TYPTYPE_BASE && + typtype != TYPTYPE_ENUM && + typtype != TYPTYPE_RANGE && + typtype != TYPTYPE_DOMAIN) + __Elog("unable to use type %s in Const expression (class: %c)", + format_type_be(con->consttype), typtype); - for (i=0; i < lengthof(devfunc_common_catalog); i++) + dtype = pgstrom_devtype_lookup(con->consttype); + if (!dtype) + __Elog("type %s is not device supported", + format_type_be(con->consttype)); + if (buf) { - devfunc_catalog_t *procat = &devfunc_common_catalog[i]; + kern_expression *kexp; + int pos, sz = 0; - if (func_extension) + sz = offsetof(kern_expression, u.c.const_value); + if (!con->constisnull) { - if (!procat->func_extension || - strcmp(procat->func_extension, func_extension) != 0) - continue; - } - else if (procat->func_extension) - continue; - - if (strcmp(procat->func_signature, sig.data) == 0) - { - dfunc = __construct_devfunc_info(func_extension, - protup, - dfunc_rettype, - func_argtypes->dim1, - dfunc_argtypes, - func_collid, - procat->func_devcost, - procat->func_template, - procat->devfunc_result_sz); - break; + if (con->constbyval) + sz += con->constlen; + else if (con->constlen == -1) + sz += VARSIZE_ANY(con->constvalue); + else + elog(ERROR, "unsupported type length: %d", con->constlen); } - else + kexp = alloca(sz); + memset(kexp, 0, sz); + kexp->exptype = dtype->type_code; + kexp->expflags = context->kexp_flags; + kexp->opcode = FuncOpCode__ConstExpr; + kexp->u.c.const_type = con->consttype; + kexp->u.c.const_isnull = con->constisnull; + if (!con->constisnull) { - /* - * In case when function name is identical, but argument list - * does not match exactly. 
- */ - const char *sname = strchr(procat->func_signature, ' '); - const char *pname = NameStr(proc->proname); - - if (sname) - { - sname++; - while (*sname != '\0' && - *pname != '\0' && - *sname == *pname) - { - sname++; - pname++; - } - if (*sname == '(' && *pname == '\0') - { - if (fuzzy_index_head < 0) - fuzzy_index_head = i; - fuzzy_index_tail = i; - } - } + if (con->constbyval) + memcpy(kexp->u.c.const_value, + &con->constvalue, + con->constlen); + else + memcpy(kexp->u.c.const_value, + DatumGetPointer(con->constvalue), + VARSIZE_ANY(con->constvalue)); } + pos = __appendBinaryStringInfo(buf, kexp, sz); + __appendKernExpMagicAndLength(buf, pos); } - /* try invocation with implicit type relabel */ - if (!dfunc && fuzzy_index_head >= 0) - { - dfunc = pgstrom_devfunc_construct_fuzzy(func_extension, - protup, - dfunc_rettype, - func_argtypes->dim1, - dfunc_argtypes, - func_collid, - fuzzy_index_head, - fuzzy_index_tail); - } - /* extra device function, if any */ - if (!dfunc && pgstrom_num_users_extra > 0) - { - dfunc = build_extra_devfunc_info(func_extension, - protup, - dfunc_rettype, - func_argtypes->dim1, - dfunc_argtypes, - func_collid); - } -not_found: - pfree(sig.data); - return dfunc; + return 0; } -static devfunc_info * -__pgstrom_devfunc_lookup(HeapTuple protup, - Oid func_rettype, - oidvector *func_argtypes, - Oid func_collid) +static int +codegen_param_expression(codegen_context *context, + StringInfo buf, Param *param) { - Oid func_oid = PgProcTupleGetOid(protup); - Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(protup); - devfunc_info *dfunc; + kern_expression kexp; devtype_info *dtype; - ListCell *lc; - cl_uint hashvalue; - int j, hindex; - dlist_iter iter; - bool consider_relabel = false; - - hashvalue = GetSysCacheHashValue(PROCOID, func_oid, 0, 0, 0); - hindex = hashvalue % lengthof(devfunc_info_slot); -retry: - dlist_foreach (iter, &devfunc_info_slot[hindex]) - { - dfunc = dlist_container(devfunc_info, chain, iter.cur); - if (dfunc->func_oid != func_oid) - continue; - if (OidIsValid(dfunc->func_collid) && - dfunc->func_collid != func_collid) - continue; + char typtype; + int pos; - dtype = dfunc->func_rettype; - if (dtype->type_oid != func_rettype && - (!consider_relabel || - !pgstrom_devtype_can_relabel(dtype->type_oid, func_rettype))) - continue; + if (param->paramkind != PARAM_EXTERN) + __Elog("Only PARAM_EXTERN is supported on device: %d", + (int)param->paramkind); - if (list_length(dfunc->func_args) == func_argtypes->dim1) - { - j = 0; - foreach (lc, dfunc->func_args) - { - dtype = lfirst(lc); - if (dtype->type_oid != func_argtypes->values[j] && - (!consider_relabel || - !pgstrom_devtype_can_relabel(func_argtypes->values[j], - dtype->type_oid))) - break; /* not match */ - j++; - } - if (!lc) - { - if (dfunc->func_is_negative) - return NULL; - return dfunc; - } - } - } - if (!consider_relabel) + typtype = get_typtype(param->paramtype); + if (typtype != TYPTYPE_BASE && + typtype != TYPTYPE_ENUM && + typtype != TYPTYPE_RANGE && + typtype != TYPTYPE_DOMAIN) + __Elog("unable to use type %s in Param expression (class: %c)", + format_type_be(param->paramtype), typtype); + + dtype = pgstrom_devtype_lookup(param->paramtype); + if (!dtype) + __Elog("type %s is not device supported", + format_type_be(param->paramtype)); + if (buf) { - consider_relabel = true; - goto retry; + memset(&kexp, 0, sizeof(kexp)); + kexp.opcode = FuncOpCode__ParamExpr; + kexp.exptype = dtype->type_code; + kexp.expflags = context->kexp_flags; + kexp.u.p.param_id = param->paramid; + pos = 
__appendBinaryStringInfo(buf, &kexp, + SizeOfKernExprParam); + __appendKernExpMagicAndLength(buf, pos); } + context->used_params = list_append_unique(context->used_params, param); - /* Not cached, construct a new entry of the device function */ - dfunc = pgstrom_devfunc_construct(protup, - func_rettype, - func_argtypes, - func_collid); - /* Not found, so this function should be a nagative entry */ - if (!dfunc) - { - MemoryContext oldcxt = MemoryContextSwitchTo(devinfo_memcxt); + return 0; +} - /* dummy devtype_info just for oid checks */ - dfunc = palloc0(sizeof(devfunc_info)); - dfunc->func_oid = func_oid; - dfunc->func_is_negative = true; - for (j=0; j < func_argtypes->dim1; j++) - { - dtype = palloc0(sizeof(devtype_info)); - dtype->type_oid = func_argtypes->values[j]; - dfunc->func_args = lappend(dfunc->func_args, dtype); - } - dtype = palloc0(sizeof(devtype_info)); - dtype->type_oid = func_rettype; - dfunc->func_rettype = dtype; - dfunc->func_sqlname = pstrdup(NameStr(proc->proname)); +static int +codegen_var_expression(codegen_context *context, + StringInfo buf, + Expr *expr, + int kvar_slot_id) +{ + Oid type_oid = exprType((Node *)expr); + devtype_info *dtype; - MemoryContextSwitchTo(oldcxt); + dtype = pgstrom_devtype_lookup(type_oid); + if (!dtype) + __Elog("type %s is not device supported", format_type_be(type_oid)); + + if (buf) + { + kern_expression kexp; + int pos; + + memset(&kexp, 0, sizeof(kexp)); + kexp.exptype = dtype->type_code; + kexp.expflags = context->kexp_flags; + kexp.opcode = FuncOpCode__VarExpr; + kexp.u.v.var_typlen = dtype->type_length; + kexp.u.v.var_typbyval = dtype->type_byval; + kexp.u.v.var_typalign = dtype->type_align; + kexp.u.v.var_slot_id = kvar_slot_id; + pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExprVar); + __appendKernExpMagicAndLength(buf, pos); } - dfunc->hashvalue = hashvalue; - dlist_push_head(&devfunc_info_slot[hindex], &dfunc->chain); - if (dfunc->func_is_negative) - return NULL; - return dfunc; + return 0; } -devfunc_info * -pgstrom_devfunc_lookup(Oid func_oid, - Oid func_rettype, - List *func_args, /* list of expressions */ - Oid func_collid) +static int +__codegen_func_expression(codegen_context *context, + StringInfo buf, + Oid func_oid, + List *func_args, + Oid func_collid) { - devfunc_info *result = NULL; - HeapTuple tup; + devfunc_info *dfunc; + devtype_info *dtype; + kern_expression kexp; + int pos = -1; + ListCell *lc; - tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(func_oid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for function %u", func_oid); - PG_TRY(); + dfunc = pgstrom_devfunc_lookup(func_oid, func_args, func_collid); + if (!dfunc || + (dfunc->func_flags & context->required_flags) != context->required_flags) + __Elog("function %s is not supported on the target device", + format_procedure(func_oid)); + dtype = dfunc->func_rettype; + context->device_cost += dfunc->func_cost; + + memset(&kexp, 0, sizeof(kexp)); + kexp.exptype = dtype->type_code; + kexp.expflags = context->kexp_flags; + kexp.opcode = dfunc->func_code; + kexp.nr_args = list_length(func_args); + kexp.args_offset = SizeOfKernExpr(0); + if (buf) + pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExpr(0)); + foreach (lc, func_args) { - int func_nargs = list_length(func_args); - oidvector *func_argtypes; - int i = 0; - ListCell *lc; - - func_argtypes = alloca(offsetof(oidvector, values[func_nargs])); - func_argtypes->ndim = 1; - func_argtypes->dataoffset = 0; - func_argtypes->elemtype = OIDOID; - func_argtypes->dim1 = func_nargs; - 
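For reference, the ConstExpr node serialized above inlines the datum itself: a by-value type copies constlen bytes of the Datum, a varlena type copies the whole VARSIZE_ANY() image, and a NULL constant carries no payload at all. Below is a minimal sketch of that size calculation, assuming u.c.const_value is the flexible payload member exactly as used above; the _sketch name is hypothetical and the function is an illustration, not part of the patch:

static size_t
const_node_size_sketch(const Const *con)
{
    size_t  sz = offsetof(kern_expression, u.c.const_value);

    if (con->constisnull)
        return sz;                      /* NULL carries no payload */
    if (con->constbyval)
        return sz + con->constlen;      /* inline copy of the Datum itself */
    if (con->constlen == -1)
        return sz + VARSIZE_ANY(con->constvalue);   /* whole varlena image */
    elog(ERROR, "unsupported type length: %d", con->constlen);
    return 0;                           /* unreachable */
}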
func_argtypes->lbound1 = 0; - foreach (lc, func_args) - { - Oid type_oid = exprType((Node *)lfirst(lc)); - - func_argtypes->values[i++] = type_oid; - } - SET_VARSIZE(func_argtypes, offsetof(oidvector, values[func_nargs])); + Expr *arg = lfirst(lc); - result = __pgstrom_devfunc_lookup(tup, - func_rettype, - func_argtypes, - func_collid); + if (codegen_expression_walker(context, buf, arg) < 0) + return -1; } - PG_CATCH(); - { - ReleaseSysCache(tup); - PG_RE_THROW(); - } - PG_END_TRY(); - ReleaseSysCache(tup); + if (buf) + __appendKernExpMagicAndLength(buf, pos); + return 0; +} - return result; +static int +codegen_func_expression(codegen_context *context, + StringInfo buf, FuncExpr *func) +{ + return __codegen_func_expression(context, + buf, + func->funcid, + func->args, + func->funccollid); } -devfunc_info * -pgstrom_devfunc_lookup_type_equal(devtype_info *dtype, Oid type_collid) +static int +codegen_oper_expression(codegen_context *context, + StringInfo buf, OpExpr *oper) +{ + return __codegen_func_expression(context, + buf, + get_opcode(oper->opno), + oper->args, + oper->opcollid); +} + +static int +codegen_bool_expression(codegen_context *context, + StringInfo buf, BoolExpr *b) { - devfunc_info *result = NULL; - char buffer[offsetof(oidvector, values[2])]; - oidvector *func_argtypes = (oidvector *)buffer; - HeapTuple tup; - Form_pg_proc proc __attribute__((unused)); + kern_expression kexp; + int pos = -1; + ListCell *lc; - if (!OidIsValid(dtype->type_eqfunc)) - return NULL; - tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(dtype->type_eqfunc)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for function %u", - dtype->type_eqfunc); - PG_TRY(); + memset(&kexp, 0, sizeof(kexp)); + switch (b->boolop) { - proc = (Form_pg_proc) GETSTRUCT(tup); - Assert(proc->pronargs == 2); - Assert(proc->prorettype == BOOLOID); - - memset(func_argtypes, 0, offsetof(oidvector, values[2])); - func_argtypes->ndim = 1; - func_argtypes->dataoffset = 0; - func_argtypes->elemtype = OIDOID; - func_argtypes->dim1 = 2; - func_argtypes->lbound1 = 0; - func_argtypes->values[0] = dtype->type_oid; - func_argtypes->values[1] = dtype->type_oid; - SET_VARSIZE(func_argtypes, offsetof(oidvector, values[2])); - - result = __pgstrom_devfunc_lookup(tup, - BOOLOID, - func_argtypes, - type_collid); + case AND_EXPR: + kexp.opcode = FuncOpCode__BoolExpr_And; + kexp.nr_args = list_length(b->args); + if (kexp.nr_args < 2) + __Elog("BoolExpr(AND) must have 2 or more arguments"); + break; + case OR_EXPR: + kexp.opcode = FuncOpCode__BoolExpr_Or; + kexp.nr_args = list_length(b->args); + if (kexp.nr_args < 2) + __Elog("BoolExpr(OR) must have 2 or more arguments"); + break; + case NOT_EXPR: + kexp.opcode = FuncOpCode__BoolExpr_Not; + kexp.nr_args = list_length(b->args); + if (kexp.nr_args != 1) + __Elog("BoolExpr(NOT) must have exactly one argument"); + break; + default: + __Elog("BoolExpr has unknown bool operation (%d)", (int)b->boolop); } - PG_CATCH(); + kexp.exptype = TypeOpCode__bool; + kexp.expflags = context->kexp_flags; + kexp.args_offset = SizeOfKernExpr(0); + if (buf) + pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExpr(0)); + foreach (lc, b->args) { - ReleaseSysCache(tup); - PG_RE_THROW(); - } - PG_END_TRY(); - ReleaseSysCache(tup); + Expr *arg = lfirst(lc); - return result; + if (codegen_expression_walker(context, buf, arg) < 0) + return -1; + } + if (buf) + __appendKernExpMagicAndLength(buf, pos); + return 0; } -devfunc_info * -pgstrom_devfunc_lookup_type_compare(devtype_info *dtype, Oid type_collid)
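Every builder in this hunk follows the same framing pattern: append the kern_expression header with __appendBinaryStringInfo() and remember its starting offset, emit the child nodes depth-first, then seal the node with __appendKernExpMagicAndLength(). Both helpers are defined elsewhere in codegen.c; the sketch below only illustrates the assumed convention, with hypothetical _sketch names, a hypothetical magic constant, and a trimmed header layout:

#include "postgres.h"
#include "lib/stringinfo.h"

typedef struct
{
    uint32_t    len;        /* total node length, back-patched at sealing */
    uint32_t    opcode;     /* FuncOpCode__... */
} kexp_head_sketch;

/* append a node header (or raw payload) and return its starting offset */
static int
append_node_sketch(StringInfo buf, const void *data, int sz)
{
    int     pos = buf->len;

    appendBinaryStringInfo(buf, (const char *)data, sz);
    return pos;
}

/* seal the node: append a trailing marker, then back-patch the length */
static void
seal_node_sketch(StringInfo buf, int pos)
{
    const uint32_t  magic = 0xc0dec0de;     /* hypothetical marker value */
    kexp_head_sketch *head;

    appendBinaryStringInfo(buf, (const char *)&magic, sizeof(magic));
    head = (kexp_head_sketch *)(buf->data + pos);   /* after any repalloc */
    head->len = buf->len - pos;
}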
+static int +codegen_nulltest_expression(codegen_context *context, + StringInfo buf, NullTest *nt) { - devfunc_info *result = NULL; - char buffer[offsetof(oidvector, values[2])]; - oidvector *func_argtypes = (oidvector *)buffer; - HeapTuple tup; - Form_pg_proc proc __attribute__((unused)); + kern_expression kexp; + int pos = -1; - if (!OidIsValid(dtype->type_cmpfunc)) - return NULL; - tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(dtype->type_cmpfunc)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for function %u", - dtype->type_cmpfunc); - PG_TRY(); + memset(&kexp, 0, sizeof(kexp)); + switch (nt->nulltesttype) { - proc = (Form_pg_proc) GETSTRUCT(tup); - Assert(proc->pronargs == 2); - Assert(proc->prorettype == INT4OID); - - memset(func_argtypes, 0, offsetof(oidvector, values[2])); - func_argtypes->ndim = 1; - func_argtypes->dataoffset = 0; - func_argtypes->elemtype = OIDOID; - func_argtypes->dim1 = 2; - func_argtypes->lbound1 = 0; - func_argtypes->values[0] = dtype->type_oid; - func_argtypes->values[1] = dtype->type_oid; - SET_VARSIZE(func_argtypes, offsetof(oidvector, values[2])); - - result = __pgstrom_devfunc_lookup(tup, - INT4OID, - func_argtypes, - type_collid); - } - PG_CATCH(); - { - ReleaseSysCache(tup); - PG_RE_THROW(); - } - PG_END_TRY(); - ReleaseSysCache(tup); - - return result; + case IS_NULL: + kexp.opcode = FuncOpCode__NullTestExpr_IsNull; + break; + case IS_NOT_NULL: + kexp.opcode = FuncOpCode__NullTestExpr_IsNotNull; + break; + default: + __Elog("NullTest has unknown NullTestType (%d)", (int)nt->nulltesttype); + } + kexp.exptype = TypeOpCode__bool; + kexp.expflags = context->kexp_flags; + kexp.nr_args = 1; + kexp.args_offset = SizeOfKernExpr(0); + if (buf) + pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExpr(0)); + if (codegen_expression_walker(context, buf, nt->arg) < 0) + return -1; + if (buf) + __appendKernExpMagicAndLength(buf, pos); + return 0; } -void -pgstrom_devfunc_track(codegen_context *context, devfunc_info *dfunc) +static int +codegen_booleantest_expression(codegen_context *context, + StringInfo buf, BooleanTest *bt) { - devtype_info *dtype = dfunc->func_rettype; - ListCell *lc; + kern_expression kexp; + int pos = -1; - /* track device function */ - context->extra_flags |= (dfunc->func_flags | dtype->type_flags); - foreach (lc, dfunc->func_args) + memset(&kexp, 0, sizeof(kexp)); + switch (bt->booltesttype) { - dtype = (devtype_info *) lfirst(lc); - context->extra_flags |= dtype->type_flags; - } + case IS_TRUE: + kexp.opcode = FuncOpCode__BoolTestExpr_IsTrue; + break; + case IS_NOT_TRUE: + kexp.opcode = FuncOpCode__BoolTestExpr_IsNotTrue; + break; + case IS_FALSE: + kexp.opcode = FuncOpCode__BoolTestExpr_IsFalse; + break; + case IS_NOT_FALSE: + kexp.opcode = FuncOpCode__BoolTestExpr_IsNotFalse; + break; + case IS_UNKNOWN: + kexp.opcode = FuncOpCode__BoolTestExpr_IsUnknown; + break; + case IS_NOT_UNKNOWN: + kexp.opcode = FuncOpCode__BoolTestExpr_IsNotUnknown; + break; + default: + __Elog("BooleanTest has unknown BoolTestType (%d)", + (int)bt->booltesttype); + } + kexp.exptype = TypeOpCode__bool; + kexp.expflags = context->kexp_flags; + kexp.nr_args = 1; + kexp.args_offset = SizeOfKernExpr(0); + if (buf) + pos = __appendBinaryStringInfo(buf, &kexp, SizeOfKernExpr(0)); + if (codegen_expression_walker(context, buf, bt->arg) < 0) + return -1; + if (buf) + __appendKernExpMagicAndLength(buf, pos); + return 0; } /* - * Device cast support + * is_expression_equals_tlist * - * In some cases, a function can be called with different argument 
types or - result type from its declaration, if these types are binary compatible. - PostgreSQL does not have any infrastructure to check data types, it relies - on the caller which shall give correct data types, and binary-compatible - types will work without any problems. - On the other hands, CUDA C++ has strict type checks for function invocation, - so we need to inject a thin type cast device function even if they are - binary compatible. - The thin device function has the following naming convention: - * - * STATIC_INLINE(DESTTYPE) to_DESTTYPE(kcxt, SOURCETYPE) - * - * We have no SQL function on host side because the above device function - * reflects binary-compatible type cast. If cast is COERCION_METHOD_FUNCTION, - * SQL function shall be explicitly used. - * - * In case of COERCION_METHOD_INOUT, expression tree have CoerceViaIO; that - * involves a pair of heavy operation (cstring-out/in). Usually, it is not - * supported on the device code except for small number of exceptions. - * dcast_coerceviaio_callback allows to inject special case handling to run - * the job of CoerceViaIO. + * + * It checks whether the supplied expression exactly matches any entry of + * the target-list. If found, it returns the kvars slot-id assigned to the + * expression (a new slot is allocated on its first reference); otherwise -1. */ -static struct { - Oid src_type_oid; - Oid dst_type_oid; - bool has_domain_checks; - devcast_coerceviaio_callback_f dcast_coerceviaio_callback; -} devcast_catalog[] = { - /* text, varchar, bpchar */ - { TEXTOID, BPCHAROID, false, NULL }, - { TEXTOID, VARCHAROID, false, NULL }, - { VARCHAROID, TEXTOID, false, NULL }, - { VARCHAROID, BPCHAROID, false, NULL }, - /* cidr -> inet, but no reverse type cast */ - { CIDROID, INETOID, false, NULL }, - /* text -> (intX/floatX/numeric), including (jsonb->>'key') reference */ - { TEXTOID, BOOLOID, false, devcast_text2numeric_callback }, - { TEXTOID, INT2OID, false, devcast_text2numeric_callback }, - { TEXTOID, INT4OID, false, devcast_text2numeric_callback }, - { TEXTOID, INT8OID, false, devcast_text2numeric_callback }, - { TEXTOID, FLOAT4OID, false, devcast_text2numeric_callback }, - { TEXTOID, FLOAT8OID, false, devcast_text2numeric_callback }, - { TEXTOID, NUMERICOID, false, devcast_text2numeric_callback }, -}; - -static devcast_info * -build_devcast_info(Oid src_type_oid, Oid dst_type_oid) +static int +is_expression_equals_tlist(codegen_context *context, Expr *expr) { - devcast_info *dcast = NULL; - devtype_info *dtype_s = NULL; - devtype_info *dtype_d = NULL; - int i; + ListCell *lc1, *lc2; + int depth = 0; + int resno; + int slot_id; + devtype_info *dtype = NULL; + + foreach (lc1, context->input_rels_tlist) + { + Node *node = lfirst(lc1); - dtype_s = pgstrom_devtype_lookup(src_type_oid); - if (!dtype_s) - goto not_found; - dtype_d = pgstrom_devtype_lookup(dst_type_oid); - if (!dtype_d) - goto not_found; + if (IsA(node, Integer)) + { + Index varno = intVal(node); + Var *var = (Var *)expr; + + if (IsA(var, Var) && var->varno == varno) + { + resno = var->varattno; + dtype = pgstrom_devtype_lookup(var->vartype); + goto found; + } + } + else if (IsA(node, PathTarget)) + { + PathTarget *reltarget = (PathTarget *)node; + + resno = 1; + foreach (lc2, reltarget->exprs) + { + if (equal(expr, lfirst(lc2))) + { + dtype = pgstrom_devtype_lookup(exprType((Node *)expr)); + goto found; + } + resno++; + } + } + else + { + elog(ERROR, "Bug? 
unexpected input_rels_tlist"); + } + depth++; + } + return -1; /* not found */ - for (i=0; i < lengthof(devcast_catalog); i++) +found: + slot_id = 0; + forboth (lc1, context->kvars_depth, + lc2, context->kvars_resno) { - if (dtype_s->type_oid == devcast_catalog[i].src_type_oid && - dtype_d->type_oid == devcast_catalog[i].dst_type_oid) + if (depth == lfirst_int(lc1) && + resno == lfirst_int(lc2)) { - dcast = MemoryContextAllocZero(devinfo_memcxt, - sizeof(devcast_info)); - dcast->src_type = dtype_s; - dcast->dst_type = dtype_d; - dcast->has_domain_checks = devcast_catalog[i].has_domain_checks; - dcast->dcast_coerceviaio_callback - = devcast_catalog[i].dcast_coerceviaio_callback; - break; + return slot_id; } + slot_id++; } - /* extra type cast */ - if (!dcast) + context->kvars_depth = lappend_int(context->kvars_depth, depth); + context->kvars_resno = lappend_int(context->kvars_resno, resno); + if (dtype && (dtype->type_flags & DEVTYPE__USE_KVARS_SLOTBUF) != 0) + context->kvars_types = lappend_oid(context->kvars_types, dtype->type_oid); + else + context->kvars_types = lappend_oid(context->kvars_types, InvalidOid); + context->kvars_exprs = lappend(context->kvars_exprs, expr); + + return slot_id; +} + +static int +codegen_expression_walker(codegen_context *context, + StringInfo buf, Expr *expr) +{ + int slot_id; + + if (!expr) + return 0; + /* check simple var references */ + slot_id = is_expression_equals_tlist(context, expr); + if (slot_id >= 0) + return codegen_var_expression(context, buf, expr, slot_id); + + switch (nodeTag(expr)) { - StringInfoData src_ident; - StringInfoData dst_ident; - devcast_info __dcast; + case T_Const: + return codegen_const_expression(context, buf, (Const *)expr); + case T_Param: + return codegen_param_expression(context, buf, (Param *)expr); + case T_FuncExpr: + return codegen_func_expression(context, buf, (FuncExpr *)expr); + case T_OpExpr: + case T_DistinctExpr: + return codegen_oper_expression(context, buf, (OpExpr *)expr); + case T_BoolExpr: + return codegen_bool_expression(context, buf, (BoolExpr *)expr); + case T_NullTest: + return codegen_nulltest_expression(context, buf, (NullTest *)expr); + case T_BooleanTest: + return codegen_booleantest_expression(context, buf, (BooleanTest *)expr); + case T_CoalesceExpr: + case T_MinMaxExpr: + case T_RelabelType: + case T_CoerceViaIO: + case T_CoerceToDomain: + case T_CaseExpr: + case T_CaseTestExpr: + case T_ScalarArrayOpExpr: + default: + __Elog("not a supported expression type: %s", nodeToString(expr)); + } + return -1; +} +#undef __Elog - initStringInfo(&src_ident); - initStringInfo(&dst_ident); - append_string_devtype_identifier(&src_ident, dtype_s->type_oid); - append_string_devtype_identifier(&dst_ident, dtype_d->type_oid); +/* + * codegen_build_loadvars + */ +static int +kern_vars_defitem_comp(const void *__a, const void *__b) +{ + const kern_vars_defitem *a = __a; + const kern_vars_defitem *b = __b; - memset(&__dcast, 0, sizeof(devcast_info)); - __dcast.src_type = dtype_s; - __dcast.dst_type = dtype_d; - __dcast.has_domain_checks = false; /* extra module must set, if any */ + if (a->var_resno < b->var_resno) + return -1; + if (a->var_resno > b->var_resno) + return 1; + return 0; +} - for (i=0; i < pgstrom_num_users_extra; i++) +static kern_expression * +__codegen_build_loadvars_one(codegen_context *context, int depth) +{ + kern_expression kexp; + StringInfoData buf; + int slot_id = 0; + int nloads = 0; + int nslots = list_length(context->kvars_depth); + uint32_t kvars_offset; + ListCell *lc1, *lc2, *lc3; + + 
initStringInfo(&buf); + buf.len = offsetof(kern_expression, u.load.kvars); + kvars_offset = (sizeof(kern_variable) * nslots + + sizeof(int) * nslots); + forthree (lc1, context->kvars_depth, + lc2, context->kvars_resno, + lc3, context->kvars_types) + { + kern_vars_defitem vitem; + int __depth = lfirst_int(lc1); + int __resno = lfirst_int(lc2); + Oid __type_oid = lfirst_oid(lc3); + + vitem.var_resno = __resno; + vitem.var_slot_id = slot_id++; + if (__depth == depth) { - pgstromUsersExtraDescriptor *extra = &pgstrom_users_extra_desc[i]; - - if (extra->lookup_extra_devcast && - extra->lookup_extra_devcast(src_ident.data, - dst_ident.data, - &__dcast)) + if (!OidIsValid(__type_oid)) + vitem.var_slot_off = 0; + else { - MemoryContext oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - - dcast = pmemdup(&__dcast, sizeof(devcast_info)); + devtype_info *dtype = pgstrom_devtype_lookup(__type_oid); - MemoryContextSwitchTo(oldcxt); - break; + Assert(dtype != NULL); + kvars_offset = TYPEALIGN(dtype->type_alignof, kvars_offset); + vitem.var_slot_off = kvars_offset; + kvars_offset += dtype->type_sizeof; } + appendBinaryStringInfo(&buf, (char *)&vitem, + sizeof(kern_vars_defitem)); + nloads++; } - pfree(src_ident.data); - pfree(dst_ident.data); } -not_found: - /* negative entry */ - if (!dcast) + if (nloads == 0) { - MemoryContext oldcxt = MemoryContextSwitchTo(devinfo_memcxt); - - if (!dtype_s) - { - dtype_s = palloc0(sizeof(devtype_info)); - dtype_s->type_oid = src_type_oid; - } - if (!dtype_d) - { - dtype_d = palloc0(sizeof(devtype_info)); - dtype_d->type_oid = dst_type_oid; - } - dcast = palloc0(sizeof(devcast_info)); - dcast->src_type = dtype_s; - dcast->dst_type = dtype_d; - dcast->cast_is_negative = true; - MemoryContextSwitchTo(oldcxt); + pfree(buf.data); + return NULL; } - /* sanity checks */ - if (dcast->has_domain_checks && - dcast->dcast_coerceviaio_callback != NULL) - __ELog("Bug? 
type cast %s -> %s with domain checks must be binary compatible", - format_type_be(dcast->src_type->type_oid), - format_type_be(dcast->dst_type->type_oid)); - return dcast; -} + qsort(buf.data + offsetof(kern_expression, u.load.kvars), + nloads, + sizeof(kern_vars_defitem), + kern_vars_defitem_comp); -devcast_info * -pgstrom_devcast_lookup(Oid src_type_oid, Oid dst_type_oid) -{ - uint32 hashvalue; - int hindex; - devcast_info *dcast; - dlist_iter iter; - - hashvalue = GetSysCacheHashValue(CASTSOURCETARGET, - src_type_oid, - dst_type_oid, - 0, 0); - hindex = hashvalue % lengthof(devcast_info_slot); - dlist_foreach (iter, &devcast_info_slot[hindex]) - { - dcast = dlist_container(devcast_info, chain, iter.cur); - if (dcast->src_type->type_oid == src_type_oid && - dcast->dst_type->type_oid == dst_type_oid) - { - if (dcast->cast_is_negative) - return NULL; - return dcast; - } - } - /* create a new one */ - dcast = build_devcast_info(src_type_oid, dst_type_oid); - dcast->hashvalue = hashvalue; - dlist_push_head(&devcast_info_slot[hindex], &dcast->chain); - if (dcast->cast_is_negative) - return NULL; - return dcast; + memset(&kexp, 0, sizeof(kexp)); + kexp.exptype = TypeOpCode__int4; + kexp.expflags = context->kexp_flags; + kexp.opcode = FuncOpCode__LoadVars; + kexp.args_offset = MAXALIGN(offsetof(kern_expression, + u.load.kvars[nloads])); + kexp.u.load.depth = depth; + kexp.u.load.nloads = nloads; + memcpy(buf.data, &kexp, offsetof(kern_expression, u.load.kvars)); + __appendKernExpMagicAndLength(&buf, 0); + + return (kern_expression *)buf.data; } -bool -pgstrom_devtype_can_relabel(Oid src_type_oid, - Oid dst_type_oid) +bytea * +codegen_build_scan_loadvars(codegen_context *context) { - devcast_info *dcast; - - dcast = pgstrom_devcast_lookup(src_type_oid, dst_type_oid); - if (dcast && dcast->dcast_coerceviaio_callback == NULL) - return true; + kern_expression *kexp = __codegen_build_loadvars_one(context, 0); + char *xpucode = NULL; - return false; + if (kexp) + { + xpucode = palloc(VARHDRSZ + kexp->len); + memcpy(xpucode + VARHDRSZ, kexp, kexp->len); + SET_VARSIZE(xpucode, VARHDRSZ + kexp->len); + } + return (bytea *)xpucode; } -/* - * Device index support - * - * devide index handler must be declared as: - * - * DEVICE_FUNCTION(cl_bool) - * pgindex_(kern_context *cxt, - * PageHeaderData *i_page, - * arg1, - * arg2); - */ -static struct { - const char *extname; - const char *signature; - const char *index_kind; - int opstrategy; - const char *index_fname; - const char *ivar_typname; - const char *iarg_typname; -} devindex_catalog[] = { - /* geometry overlap operator */ - { POSTGIS3, "geometry && geometry", - "gist", RTOverlapStrategyNumber, - "gist_geometry_overlap", - "box2df@postgis", - "geometry@postgis", - }, - { POSTGIS3, "box2df && geometry", - "gist", RTOverlapStrategyNumber, - "gist_geometry_overlap", - "box2df@postgis", - "geometry@postgis", - }, - { POSTGIS3, "geometry && box2df", - "gist", RTOverlapStrategyNumber, - "gist_box2df_overlap", - "box2df@postgis", - "box2df@postgis", - }, - { POSTGIS3, "box2df && box2df", - "gist", RTOverlapStrategyNumber, - "gist_box2df_overlap", - "box2df@postgis", - "box2df@postgis", - }, - /* geometry contains operator */ - { POSTGIS3, "geometry ~ geometry", - "gist", RTContainsStrategyNumber, - "gist_geometry_contains", - "box2df@postgis", - "geometry@postgis", - }, - { POSTGIS3, "box2df ~ geometry", - "gist", RTContainsStrategyNumber, - "gist_geometry_contains", - "box2df@postgis", - "geometry@postgis", - }, - { POSTGIS3, "geometry ~ box2df", - "gist", 
RTContainsStrategyNumber, - "gist_box2df_contains", - "box2df@postgis", - "box2df@postgis", - }, - { POSTGIS3, "box2df ~ box2df", - "gist", RTContainsStrategyNumber, - "gist_box2df_contains", - "box2df@postgis", - "box2df@postgis", - }, - /* geometry contained operator */ - { POSTGIS3, "geometry @ geometry", - "gist", RTContainedByStrategyNumber, - "gist_geometry_contained", - "box2df@postgis", - "geometry@postgis", - }, - { POSTGIS3, "box2df @ geometry", - "gist", RTContainedByStrategyNumber, - "gist_geometry_contained", - "box2df@postgis", - "geometry@postgis", - }, - { POSTGIS3, "geometry @ box2df", - "gist", RTContainedByStrategyNumber, - "gist_box2df_contained", - "box2df@postgis", - "box2df@postgis", - }, - { POSTGIS3, "box2df @ box2df", - "gist", RTContainedByStrategyNumber, - "gist_box2df_contained", - "box2df@postgis", - "box2df@postgis", - }, -}; - -devindex_info * -pgstrom_devindex_lookup(Oid opcode, Oid opfamily) +bytea * +codegen_build_join_loadvars(codegen_context *context) { - devindex_info *dindex = NULL; - uint32 hashvalue; - uint32 hindex; - HeapTuple htup; - Form_pg_amop amop; - dlist_iter iter; - const char *extname; - char signature[3*NAMEDATALEN + 100]; - int i; + kern_expression *kexp; + StringInfoData buf; + int max_depth = -1; + uint32_t sz; + char *result = NULL; + ListCell *lc; - hashvalue = GetSysCacheHashValue(AMOPOPID, - ObjectIdGetDatum(opcode), - CharGetDatum(AMOP_SEARCH), - ObjectIdGetDatum(opfamily), 0); - hindex = hashvalue % lengthof(devindex_info_slot); - dlist_foreach(iter, &devindex_info_slot[hindex]) + foreach (lc, context->kvars_depth) { - dindex = dlist_container(devindex_info, chain, iter.cur); - if (dindex->opcode == opcode && - dindex->opfamily == opfamily) - goto found; - } + int depth = lfirst_int(lc); - extname = get_extension_name_by_object(OperatorRelationId, opcode); - htup = SearchSysCache3(AMOPOPID, - ObjectIdGetDatum(opcode), - CharGetDatum(AMOP_SEARCH), - ObjectIdGetDatum(opfamily)); - if (!HeapTupleIsValid(htup)) - elog(ERROR, "operator %u is not a member of opfamily %u", - opcode, opfamily); - amop = (Form_pg_amop) GETSTRUCT(htup); - snprintf(signature, sizeof(signature), "%s %s %s", - get_type_name(amop->amoplefttype, false), - get_opname(opcode), - get_type_name(amop->amoprighttype, false)); - - dindex = NULL; - for (i=0; i < lengthof(devindex_catalog); i++) - { - const char *__extname = devindex_catalog[i].extname; - const char *__signature = devindex_catalog[i].signature; - const char *__ivar_typname = devindex_catalog[i].ivar_typname; - const char *__iarg_typname = devindex_catalog[i].iarg_typname; - devtype_info *ivar_dtype; - devtype_info *iarg_dtype; - - if (__extname) + if (depth >= 0) + max_depth = Max(max_depth, depth); + } + if (max_depth < 1) + return NULL; + sz = MAXALIGN(offsetof(kern_expression, u.pack.offset[max_depth+1])); + kexp = alloca(sz); + memset(kexp, 0, sz); + kexp->exptype = TypeOpCode__int4; + kexp->expflags = context->kexp_flags; + kexp->opcode = FuncOpCode__Packed; + kexp->args_offset = sz; + kexp->u.pack.npacked = max_depth; + + initStringInfo(&buf); + buf.len = sz; + for (int i=0; i < max_depth; i++) + { + kern_expression *karg = __codegen_build_loadvars_one(context, i+1); + + if (karg) { - if (!extname || strcmp(__extname, extname) != 0) - continue; + kexp->u.pack.offset[i] + = __appendBinaryStringInfo(&buf, karg, karg->len); + kexp->nr_args++; + pfree(karg); } - else if (extname != NULL) - continue; - - if (strcmp(__signature, signature) != 0) - continue; - - ivar_dtype = 
pgstrom_devtype_lookup_by_name(__ivar_typname); - if (!ivar_dtype) - continue; - iarg_dtype = pgstrom_devtype_lookup_by_name(__iarg_typname); - if (!iarg_dtype) - continue; - - dindex = MemoryContextAllocZero(devinfo_memcxt, sizeof(devindex_info)); - dindex->oper_extension = extname; - dindex->opcode = opcode; - dindex->opfamily = opfamily; - dindex->opstrategy = amop->amopstrategy; - dindex->index_kind = devindex_catalog[i].index_kind; - dindex->index_fname = devindex_catalog[i].index_fname; - dindex->ivar_dtype = ivar_dtype; - dindex->iarg_dtype = iarg_dtype; - break; } - //TODO: call extra module - - /* not supported, add negative entry */ - if (!dindex) + if (kexp->nr_args > 0) { - dindex = MemoryContextAllocZero(devinfo_memcxt, sizeof(devindex_info)); - dindex->oper_extension = extname; - dindex->opcode = opcode; - dindex->opfamily = opfamily; - dindex->opstrategy = amop->amopstrategy; - dindex->index_is_negative = true; + memcpy(buf.data, kexp, sz); + __appendKernExpMagicAndLength(&buf, 0); + result = palloc(VARHDRSZ + buf.len); + memcpy(result + VARHDRSZ, buf.data, buf.len); + SET_VARSIZE(result, VARHDRSZ + buf.len); } - ReleaseSysCache(htup); - - dindex->hashvalue = hashvalue; - dlist_push_head(&devindex_info_slot[hindex], &dindex->chain); -found: - if (dindex->index_is_negative) - return NULL; - return dindex; + pfree(buf.data); + return (bytea *)result; } /* - * codegen_expression_walker - main logic of run-time code generator + * codegen_build_scan_quals */ -static void codegen_expression_walker(codegen_context *context, - StringInfo body, - Node *node, int *p_varlena_sz); - -static Node *__codegen_current_node = NULL; -static void -__appendStringInfo(StringInfo str, const char *fmt,...) - pg_attribute_printf(2, 3); - -static void -__appendStringInfo(StringInfo str, const char *fmt,...) 
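The FuncOpCode__Packed node built by codegen_build_join_loadvars() above is a plain container: u.pack.npacked per-depth slots, where u.pack.offset[i] holds the byte offset of the child expression for depth i+1, and 0 means that depth contributes nothing. A hedged reader-side sketch, assuming the offsets are relative to the head of the Packed node (which matches how the buffer is laid out above); the helper name is hypothetical:

static kern_expression *
packed_lookup_sketch(kern_expression *pack, int depth)
{
    uint32_t    off;

    Assert(pack->opcode == FuncOpCode__Packed);
    if (depth < 1 || depth > pack->u.pack.npacked)
        return NULL;
    off = pack->u.pack.offset[depth - 1];
    if (off == 0)
        return NULL;                /* nothing packed for this depth */
    return (kern_expression *)((char *)pack + off);
}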
-{ - int save_errno = errno; - - if (!str) - return; - for (;;) - { - va_list va_args; - int needed; - - errno = save_errno; - va_start(va_args, fmt); - needed = appendStringInfoVA(str, fmt, va_args); - va_end(va_args); - - if (needed == 0) - break; - enlargeStringInfo(str, needed); - } -} - -static inline void -__appendStringInfoChar(StringInfo str, char c) -{ - if (str) - appendStringInfoChar(str, c); -} - -static int -codegen_const_expression(codegen_context *context, - StringInfo body, - Const *con) +bytea * +codegen_build_scan_quals(codegen_context *context, List *dev_quals) { - devtype_info *dtype; - cl_int index; - cl_int width; + StringInfoData buf; + Expr *expr; + char *result = NULL; - dtype = pgstrom_devtype_lookup_and_track(con->consttype, context); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(con->consttype)); - context->used_params = lappend(context->used_params, - copyObject(con)); - index = list_length(context->used_params) - 1; - - __appendStringInfo(body, - "pg_%s_param(kcxt,%d)", - dtype->type_name, index); - if (con->constisnull) - width = 0; - else if (con->constlen > 0) - width = con->constlen; - else if (con->constlen == -1) - width = VARSIZE_ANY_EXHDR(con->constvalue); + Assert(context->elevel >= ERROR); + if (dev_quals == NIL) + return NULL; + if (list_length(dev_quals) == 1) + expr = linitial(dev_quals); else - elog(ERROR, "unexpected type length: %d", con->constlen); - return width; -} - -static int -codegen_param_expression(codegen_context *context, - StringInfo body, - Param *param) -{ - devtype_info *dtype; - ListCell *lc; - int index = 0; - int width; - - if (param->paramkind != PARAM_EXTERN) - __ELog("ParamKind is not PARAM_EXTERN: %d", - (int)param->paramkind); - - dtype = pgstrom_devtype_lookup_and_track(param->paramtype, context); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(param->paramtype)); + expr = make_andclause(dev_quals); - foreach (lc, context->used_params) + initStringInfo(&buf); + if (codegen_expression_walker(context, &buf, expr) == 0) { - if (equal(param, lfirst(lc))) - goto found; - index++; + result = palloc(VARHDRSZ + buf.len); + memcpy(result + VARHDRSZ, buf.data, buf.len); + SET_VARSIZE(result, VARHDRSZ+buf.len); } - context->used_params = lappend(context->used_params, - copyObject(param)); - index = list_length(context->used_params) - 1; - -found: - __appendStringInfo(body, - "pg_%s_param(kcxt,%d)", - dtype->type_name, index); - if (dtype->type_length > 0) - width = dtype->type_length; - else if (dtype->type_length == -1) - width = type_maximum_size(param->paramtype, - param->paramtypmod) - VARHDRSZ; - else - elog(ERROR, "unexpected type length: %d", dtype->type_length); + pfree(buf.data); - return width; + return (bytea *)result; } +/* + * __try_inject_projection_expression + */ static int -codegen_varnode_expression(codegen_context *context, - StringInfo body, Var *var) +__try_inject_projection_expression(codegen_context *context, + StringInfo buf, + Expr *expr, + bool write_kexp_if_exists, + bool *p_inject_new) { - AttrNumber varattno = var->varattno; - devtype_info *dtype; - ListCell *lc; - int width; + ListCell *lc1, *lc2; + int slot_id; - dtype = pgstrom_devtype_lookup_and_track(var->vartype, context); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(var->vartype)); /* - * NOTE: Expression tree at the path-construction time can contain - * references to other tables; which can be eventually replaced by - * replace_nestloop_params(). 
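Note that codegen_build_scan_quals() above implicitly AND-combines two or more quals before walking them, so callers never need to wrap the list themselves. A hedged usage sketch (qual_a and qual_b stand for hypothetical boolean expressions; both calls emit an identical byte stream):

#include "nodes/makefuncs.h"        /* make_andclause() */

static void
scan_quals_usage_sketch(codegen_context *context, Expr *qual_a, Expr *qual_b)
{
    List   *quals = list_make2(qual_a, qual_b);
    bytea  *xpucode;

    /* the two quals are AND-combined internally ... */
    xpucode = codegen_build_scan_quals(context, quals);
    /* ... so the explicit form below produces the same result */
    xpucode = codegen_build_scan_quals(context,
                                       list_make1(make_andclause(quals)));
    (void)xpucode;
}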
So, this Var-node shall not be visible - * when we generate the device code. - * We may be able to handle the check well, however, we simply - * prohibit the Var-node which references out of the current scope - * of the relations. - * - * If var->varno == INDEX_VAR, it is obvious that caller is - * responsible to build custom_scan_tlist with adequate source. + * When 'expr' is simple Var-reference on the input relations, + * we don't need to inject expression node here. */ - if (context->baserel && !IS_SPECIAL_VARNO(var->varno)) + slot_id = is_expression_equals_tlist(context, expr); + if (slot_id >= 0) { - RelOptInfo *baserel = context->baserel; - - if (!bms_is_member(var->varno, baserel->relids)) - elog(ERROR, "Var (varno=%d) referred out of expected range %s", - var->varno, bms_to_cstring(baserel->relids)); + if (write_kexp_if_exists) + codegen_var_expression(context, buf, expr, slot_id); + *p_inject_new = false; + return slot_id; } /* - * Fixup varattno when pseudo-scan tlist exists, because varattno - * shall be adjusted on setrefs.c, so we have to adjust variable - * name according to the expected attribute number is kernel- - * source shall be constructed prior to setrefs.c / subselect.c + * Try to find out the expression which already has kvars-slot. + * If exists, we can reuse it. */ - if (context->pseudo_tlist != NIL) + slot_id = 0; + forboth (lc1, context->kvars_depth, + lc2, context->kvars_resno) { - foreach (lc, context->pseudo_tlist) - { - TargetEntry *tle = lfirst(lc); - Var *ptv = (Var *) tle->expr; + int depth = lfirst_int(lc1); + int resno = lfirst_int(lc2); - if (!IsA(tle->expr, Var) || - ptv->varno != var->varno || - ptv->varattno != var->varattno || - ptv->varlevelsup != var->varlevelsup) - continue; + if (depth < 0 && + resno > 0 && + resno <= list_length(context->tlist_dev)) + { + TargetEntry *tle = list_nth(context->tlist_dev, resno-1); - varattno = tle->resno; - break; + if (equal(tle->expr, expr)) + { + if (write_kexp_if_exists) + codegen_var_expression(context, buf, expr, slot_id); + *p_inject_new = false; + return slot_id; + } } - if (!lc) - elog(ERROR, "failed on map Var (%s) on ps_tlist: %s", - nodeToString(var), - nodeToString(context->pseudo_tlist)); } - if (varattno < 0) - __appendStringInfo(body, "KVAR_S%u", -varattno); - else - __appendStringInfo(body, "KVAR_%u", varattno); - if (!list_member(context->used_vars, var)) - context->used_vars = lappend(context->used_vars, - copyObject(var)); - if (dtype->type_length >= 0) - width = dtype->type_length; - else - width = type_maximum_size(var->vartype, - var->vartypmod) - VARHDRSZ; - return width; -} -static int -codegen_function_expression(codegen_context *context, - StringInfo body, - devfunc_info *dfunc, List *args) -{ - ListCell *lc1, *lc2; - Expr **fn_args = alloca(sizeof(Expr *) * list_length(args)); - int *vl_width = alloca(sizeof(int) * list_length(args)); - int index = 0; - - __appendStringInfo(body, - "pgfn_%s(kcxt", - dfunc->func_devname); - forboth (lc1, dfunc->func_args, - lc2, args) + /* + * Try to assign a new kvars-slot, if 'expr' exists on tlist_dev. 
+ */ + foreach (lc1, context->tlist_dev) { - devtype_info *dtype = lfirst(lc1); - Node *expr = lfirst(lc2); - Oid expr_type_oid = exprType(expr); + TargetEntry *tle = lfirst(lc1); - __appendStringInfo(body, ", "); - - if (dtype->type_oid == expr_type_oid) - codegen_expression_walker(context, body, expr, &vl_width[index]); - else if (pgstrom_devtype_can_relabel(expr_type_oid, - dtype->type_oid)) - { - /* - * NOTE: PostgreSQL may pass binary compatible arguments - * without explicit RelabelType, like varchar(N) values - * onto text arguments. - * It is quite right implementation from the PostgreSQL - * function invocation API, however, unable to describe - * the relevant device code, because CUDA C++ has strict - * type checks. So, we have to inject an explicit type - * relabel in this case. - */ - __appendStringInfo(body, "to_%s(", dtype->type_name); - codegen_expression_walker(context, body, expr, &vl_width[index]); - __appendStringInfoChar(body, ')'); - } - else + if (equal(tle->expr, expr)) { - __ELog("Bug? unsupported implicit type cast (%s)->(%s)", - format_type_be(expr_type_oid), - format_type_be(dtype->type_oid)); + kern_expression kexp; + devtype_info *dtype; + Oid type_oid; + int pos; + + slot_id = list_length(context->kvars_depth); + context->kvars_depth = lappend_int(context->kvars_depth, -1); + context->kvars_resno = lappend_int(context->kvars_resno, tle->resno); + context->kvars_types = lappend_oid(context->kvars_types, InvalidOid); + + type_oid = exprType((Node *)expr); + dtype = pgstrom_devtype_lookup(type_oid); + if (!dtype) + elog(ERROR, "type %s is not device supported", + format_type_be(type_oid)); + memset(&kexp, 0, sizeof(kexp)); + kexp.exptype = dtype->type_code; + kexp.expflags = context->kexp_flags; + kexp.opcode = FuncOpCode__SaveExpr; + kexp.nr_args = 1; + kexp.args_offset = MAXALIGN(offsetof(kern_expression, + u.save.data)); + kexp.u.save.slot_id = slot_id; + pos = __appendBinaryStringInfo(buf, &kexp, kexp.args_offset); + codegen_expression_walker(context, buf, expr); + __appendKernExpMagicAndLength(buf, pos); + + *p_inject_new = true; + + return slot_id; } - fn_args[index++] = (Expr *)expr; } - __appendStringInfoChar(body, ')'); - /* estimation of function result width */ - return dfunc->devfunc_result_sz(context, dfunc, fn_args, vl_width); + return -1; /* not found */ } -static int -codegen_nulltest_expression(codegen_context *context, - StringInfo body, - NullTest *nulltest) -{ - devtype_info *dtype; - Oid typeoid = exprType((Node *)nulltest->arg); - - if (nulltest->argisrow) - __ELog("NullTest towards RECORD data"); - - dtype = pgstrom_devtype_lookup_and_track(typeoid, context); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(typeoid)); - switch (nulltest->nulltesttype) - { - case IS_NULL: - __appendStringInfo(body, "PG_ISNULL"); - break; - case IS_NOT_NULL: - __appendStringInfo(body, "PG_ISNOTNULL"); - break; - default: - elog(ERROR, "unknown NullTestType: %d", - (int)nulltest->nulltesttype); - } - __appendStringInfo(body, "(kcxt, "); - codegen_expression_walker(context, body, (Node *) nulltest->arg, NULL); - __appendStringInfoChar(body, ')'); - context->devcost += 1; +/* + * codegen_build_projection + */ +bytea * +codegen_build_projection(codegen_context *context) +{ + kern_expression *kexp; + StringInfoData arg; + StringInfoData buf; + bool meet_resjunk = false; + ListCell *lc; + int nexprs = 0; + int nattrs = 0; + int n, sz, pos; + char *result; + + n = list_length(context->tlist_dev); + sz = MAXALIGN(offsetof(kern_expression, 
u.proj.desc[n])); + kexp = alloca(sz); + memset(kexp, 0, sz); + + initStringInfo(&arg); + foreach (lc, context->tlist_dev) + { + TargetEntry *tle = lfirst(lc); + int slot_id; + bool inject_new; + Oid type_oid; + devtype_info *dtype; + kern_projection_desc *desc; - return sizeof(cl_bool); + if (tle->resjunk) + { + meet_resjunk = true; + continue; + } + else if (meet_resjunk) + elog(ERROR, "Bug? a valid TLE after junk TLEs"); + + slot_id = __try_inject_projection_expression(context, + &arg, + tle->expr, + false, + &inject_new); + if (slot_id < 0) + elog(ERROR, "Bug? expression is missing on tlist_dev: %s", + nodeToString(tle->expr)); + if (inject_new) + nexprs++; + + type_oid = exprType((Node *)tle->expr); + dtype = pgstrom_devtype_lookup(type_oid); + if (!dtype) + elog(ERROR, "type %s is not device supported", + format_type_be(type_oid)); + + desc = &kexp->u.proj.desc[nattrs++]; + desc->slot_id = slot_id; + } + kexp->exptype = TypeOpCode__int4; + kexp->expflags = context->kexp_flags; + kexp->opcode = FuncOpCode__Projection; + kexp->nr_args = nexprs; + kexp->args_offset = MAXALIGN(offsetof(kern_expression, + u.proj.desc[nattrs])); + kexp->u.proj.nattrs = nattrs; + initStringInfo(&buf); + pos = __appendBinaryStringInfo(&buf, kexp, kexp->args_offset); + if (nexprs > 0) + __appendBinaryStringInfo(&buf, arg.data, arg.len); + __appendKernExpMagicAndLength(&buf, pos); + + result = palloc(VARHDRSZ + buf.len); + memcpy(result + VARHDRSZ, buf.data, buf.len); + SET_VARSIZE(result, VARHDRSZ + buf.len); + + pfree(arg.data); + pfree(buf.data); + + return (bytea *)result; } -static int -codegen_booleantest_expression(codegen_context *context, - StringInfo body, - BooleanTest *booltest) +/* + * __codegen_build_joinquals + */ +static kern_expression * +__codegen_build_joinquals(codegen_context *context, + List *join_quals, + List *other_quals) { - const char *func_name; + StringInfoData buf; + kern_expression kexp; + ListCell *lc; + uint32_t kexp_flags__saved; - if (exprType((Node *)booltest->arg) != BOOLOID) - elog(ERROR, "argument type of BooleanTest is not bool"); + if (join_quals == NIL && other_quals == NIL) + return NULL; - /* choose one of built-in functions */ - switch (booltest->booltesttype) - { - case IS_TRUE: - func_name = "bool_is_true"; - break; - case IS_NOT_TRUE: - func_name = "bool_is_not_true"; - break; - case IS_FALSE: - func_name = "bool_is_false"; - break; - case IS_NOT_FALSE: - func_name = "bool_is_not_false"; - break; - case IS_UNKNOWN: - func_name = "bool_is_unknown"; - break; - case IS_NOT_UNKNOWN: - func_name = "bool_is_not_unknown"; - break; - default: - elog(ERROR, "unknown BoolTestType: %d", - (int)booltest->booltesttype); - break; - } - __appendStringInfo(body, "pgfn_%s(kcxt, ", func_name); - codegen_expression_walker(context, body, - (Node *) booltest->arg, NULL); - __appendStringInfoChar(body, ')'); - context->devcost += 1; + initStringInfo(&buf); + memset(&kexp, 0, sizeof(kexp)); + kexp.exptype = TypeOpCode__bool; + kexp.expflags = context->kexp_flags; + kexp.opcode = FuncOpCode__JoinQuals; + kexp.nr_args = list_length(join_quals) + list_length(other_quals); + kexp.args_offset = SizeOfKernExpr(0); + __appendBinaryStringInfo(&buf, &kexp, SizeOfKernExpr(0)); - return sizeof(cl_bool); -} + foreach (lc, join_quals) + { + Expr *qual = lfirst(lc); -static int -codegen_bool_expression(codegen_context *context, - StringInfo body, BoolExpr *b) -{ - Node *node; + if (exprType((Node *)qual) != BOOLOID) + elog(ERROR, "Bug? 
JOIN quals must be boolean"); + if (codegen_expression_walker(context, &buf, qual) < 0) + return NULL; + } - if (b->boolop == NOT_EXPR) + kexp_flags__saved = context->kexp_flags; + context->kexp_flags |= KEXP_FLAG__IS_PUSHED_DOWN; + foreach (lc, other_quals) { - Assert(list_length(b->args) == 1); - node = linitial(b->args); + Expr *qual = lfirst(lc); - __appendStringInfo(body, "NOT("); - codegen_expression_walker(context, body, node, NULL); - __appendStringInfoChar(body, ')'); + if (exprType((Node *)qual) != BOOLOID) + elog(ERROR, "Bub? JOIN quals must be boolean"); + if (codegen_expression_walker(context, &buf, qual) < 0) + return NULL; } - else if (b->boolop == AND_EXPR || - b->boolop == OR_EXPR) - { - StringInfoData temp; - List *used_vars_saved; - ListCell *lc; + context->kexp_flags = kexp_flags__saved; + __appendKernExpMagicAndLength(&buf, 0); - initStringInfo(&temp); + return (kern_expression *)buf.data; +} - used_vars_saved = context->used_vars; - context->used_vars = NIL; - foreach (lc, b->args) - { - Node *node = lfirst(lc); - - if (lc != list_head(b->args)) - __appendStringInfo(&temp, " has_null |= status.isnull;\n"); - __appendStringInfo(&temp, - " status = "); - codegen_expression_walker(context, &temp, node, NULL); - __appendStringInfo(&temp, ";\n" - " if (PG_BOOL_%s(status))\n" - " return status;\n", - (b->boolop == AND_EXPR ? "ISFALSE" : "ISTRUE")); - } - context->decl_count++; - __appendStringInfo( - &context->decl, - "DEVICE_INLINE(pg_bool_t)\n" - "__exprBoolOp_%u(kern_context *kcxt", - context->decl_count); - __appendStringInfo(body, - "__exprBoolOp_%u(kcxt", context->decl_count); - foreach (lc, context->used_vars) +/* + * codegen_build_packed_joinquals + */ +bytea * +codegen_build_packed_joinquals(codegen_context *context, + List *stacked_join_quals, + List *stacked_other_quals) +{ + kern_expression *kexp; + StringInfoData buf; + int i, nrels; + size_t sz; + ListCell *lc1, *lc2; + char *result = NULL; + + nrels = list_length(stacked_join_quals); + sz = MAXALIGN(offsetof(kern_expression, u.pack.offset[nrels])); + kexp = alloca(sz); + memset(kexp, 0, sz); + kexp->exptype = TypeOpCode__int4; + kexp->expflags = context->kexp_flags; + kexp->opcode = FuncOpCode__Packed; + kexp->args_offset = sz; + kexp->u.pack.npacked = nrels; + + initStringInfo(&buf); + buf.len = sz; + + i = 0; + forboth (lc1, stacked_join_quals, + lc2, stacked_other_quals) + { + List *join_quals = lfirst(lc1); + List *other_quals = lfirst(lc2); + kern_expression *karg; + + karg = __codegen_build_joinquals(context, + join_quals, + other_quals); + if (karg) { - devtype_info *dtype; - Var *var = lfirst(lc); - - dtype = pgstrom_devtype_lookup(var->vartype); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(var->vartype)); - __appendStringInfo( - &context->decl, - ", pg_%s_t &", - dtype->type_name); - codegen_expression_walker(context, - &context->decl, - (Node *)var, NULL); - __appendStringInfo(body, ", "); - codegen_expression_walker(context, body, (Node *)var, NULL); - - if (!list_member(used_vars_saved, var)) - used_vars_saved = lappend(used_vars_saved, var); + kexp->u.pack.offset[i] + = __appendBinaryStringInfo(&buf, karg, karg->len); + kexp->nr_args++; + pfree(karg); } - __appendStringInfo( - &context->decl, - ")\n" - "{\n" - " pg_bool_t status __attribute__((unused));\n" - " cl_bool has_null = false;\n" - "\n" - "%s" - " status.isnull |= has_null;\n" - " return status;\n" - "}\n\n", - temp.data); - __appendStringInfo(body, ")"); - context->used_vars = used_vars_saved; - - 
pfree(temp.data); + i++; } - else + Assert(nrels == i); + + if (kexp->nr_args > 0) { - elog(ERROR, "unknown BoolExprType: %d", (int) b->boolop); + memcpy(buf.data, kexp, sz); + __appendKernExpMagicAndLength(&buf, 0); + result = palloc(VARHDRSZ + buf.len); + memcpy(result + VARHDRSZ, buf.data, buf.len); + SET_VARSIZE(result, VARHDRSZ + buf.len); } - context->devcost += list_length(b->args); - return sizeof(cl_bool); + pfree(buf.data); + return (bytea *)result; } -static int -codegen_coalesce_expression(codegen_context *context, - StringInfo body, - CoalesceExpr *coalesce) +/* + * codegen_build_packed_hashkeys + */ +static kern_expression * +__codegen_build_hash_value(codegen_context *context, + List *hash_keys) { - devtype_info *dtype; - StringInfoData temp; - List *used_vars_saved; - ListCell *lc; - int maxlen = 0; + kern_expression *kexp; + StringInfoData buf; + size_t sz = MAXALIGN(SizeOfKernExpr(0)); + ListCell *lc; - initStringInfo(&temp); - dtype = pgstrom_devtype_lookup(coalesce->coalescetype); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(coalesce->coalescetype)); + if (hash_keys == NIL) + return NULL; - used_vars_saved = context->used_vars; - context->used_vars = NIL; - foreach (lc, coalesce->args) - { - Node *expr = lfirst(lc); - Oid type_oid = exprType(expr); - int width; - - if (dtype->type_oid != type_oid) - __ELog("device type mismatch in COALESCE: %s / %s", - format_type_be(dtype->type_oid), - format_type_be(type_oid)); - __appendStringInfo(&temp, - " retval = "); - codegen_expression_walker(context, &temp, expr, &width); - __appendStringInfo(&temp, - ";\n" - " if (!retval.isnull)\n" - " return retval;\n"); - if (width < 0) - maxlen = -1; - else if (maxlen >= 0) - maxlen = Max(maxlen, width); - context->devcost += 1; - } + kexp = alloca(sz); + memset(kexp, 0, sz); + kexp->exptype = TypeOpCode__int4; + kexp->expflags = context->kexp_flags; + kexp->opcode = FuncOpCode__HashValue; + kexp->nr_args = list_length(hash_keys); + kexp->args_offset = sz; - context->decl_count++; - __appendStringInfo( - &context->decl, - "DEVICE_INLINE(pg_%s_t)\n" - "__exprCoalesce_%u(kern_context *kcxt", - dtype->type_name, - context->decl_count); - __appendStringInfo( - body, - "__exprCoalesce_%u(kcxt", - context->decl_count); - - foreach (lc, context->used_vars) + initStringInfo(&buf); + buf.len = sz; + foreach (lc, hash_keys) { - devtype_info *__dtype; - Var *var = lfirst(lc); - - __dtype = pgstrom_devtype_lookup(var->vartype); - if (!__dtype) - __ELog("type %s is not device supported", - format_type_be(var->vartype)); - __appendStringInfo( - &context->decl, - ", pg_%s_t &", - __dtype->type_name); - codegen_expression_walker(context, - &context->decl, - (Node *)var, NULL); - __appendStringInfo(body, ", "); - codegen_expression_walker(context, body, (Node *)var, NULL); - - if (!list_member(used_vars_saved, var)) - used_vars_saved = lappend(used_vars_saved, var); + Expr *expr = lfirst(lc); + + codegen_expression_walker(context, &buf, expr); } - __appendStringInfo( - &context->decl, - ")\n" - "{\n" - " pg_%s_t retval __attribute__((unused));\n" - "\n" - " retval.isnull = true;\n" - "%s" - " return retval;\n" - "}\n\n", - dtype->type_name, - temp.data); - __appendStringInfo(body, ")"); - context->used_vars = used_vars_saved; - - pfree(temp.data); - - return maxlen; + memcpy(buf.data, kexp, sz); + __appendKernExpMagicAndLength(&buf, 0); + + return (kern_expression *)buf.data; } -static int -codegen_minmax_expression(codegen_context *context, - StringInfo body, - MinMaxExpr 
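codegen_build_packed_joinquals() and codegen_build_packed_hashkeys() both take one list per join depth, stacked in depth order: stack entry i ends up in u.pack.offset[i] as the expression for depth i+1. A hedged sketch of the expected call pattern for a two-depth join (all variable names are hypothetical):

static void
packed_join_usage_sketch(codegen_context *context,
                         List *d1_join_quals, List *d1_hash_keys,
                         List *d2_join_quals, List *d2_hash_keys)
{
    bytea  *kexp_join_quals;
    bytea  *kexp_hash_keys;

    /* one stack entry per depth; NIL means nothing at that depth */
    kexp_join_quals =
        codegen_build_packed_joinquals(context,
                                       list_make2(d1_join_quals,
                                                  d2_join_quals),
                                       list_make2(NIL, NIL));
    kexp_hash_keys =
        codegen_build_packed_hashkeys(context,
                                      list_make2(d1_hash_keys,
                                                 d2_hash_keys));
    (void)kexp_join_quals;
    (void)kexp_hash_keys;
}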
*minmax) +bytea * +codegen_build_packed_hashkeys(codegen_context *context, + List *stacked_hash_keys) { - devtype_info *dtype; - devfunc_info *dfunc; - List *used_vars_saved; - ListCell *lc; - StringInfoData temp; - int maxlen = 0; + kern_expression *kexp; + StringInfoData buf; + int i, nrels; + size_t sz; + ListCell *lc; + char *result = NULL; - dtype = pgstrom_devtype_lookup(minmax->minmaxtype); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(minmax->minmaxtype)); - context->extra_flags |= dtype->type_flags; + nrels = list_length(stacked_hash_keys); + sz = MAXALIGN(offsetof(kern_expression, u.pack.offset[nrels])); + kexp = alloca(sz); + memset(kexp, 0, sz); + kexp->exptype = TypeOpCode__int4; + kexp->expflags = context->kexp_flags; + kexp->opcode = FuncOpCode__Packed; + kexp->args_offset = sz; + kexp->u.pack.npacked = nrels; - dfunc = pgstrom_devfunc_lookup_type_compare(dtype, minmax->inputcollid); - if (!dfunc) - __ELog("device type %s has no comparison operator", - format_type_be(minmax->minmaxtype)); - context->extra_flags |= dfunc->func_flags; - - initStringInfo(&temp); - used_vars_saved = context->used_vars; - context->used_vars = NIL; - foreach (lc, minmax->args) + initStringInfo(&buf); + buf.len = sz; + + i = 0; + foreach (lc, stacked_hash_keys) { - Node *expr = lfirst(lc); - Oid type_oid = exprType(expr); - int width; - - if (dtype->type_oid != type_oid) - __ELog("device type mismatch in LEAST/GREATEST: %s / %s", - format_type_be(dtype->type_oid), - format_type_be(exprType(expr))); - if (lc == list_head(minmax->args)) - __appendStringInfo(&temp, " r = "); - else - __appendStringInfo(&temp, " x = "); - codegen_expression_walker(context, &temp, expr, &width); - __appendStringInfo(&temp, ";\n"); + List *hash_keys = lfirst(lc); + kern_expression *karg; - if (lc != list_head(minmax->args)) + karg = __codegen_build_hash_value(context, hash_keys); + if (karg) { - __appendStringInfo( - &temp, - " if (r.isnull)\n" - " r = x;\n" - " else if (!x.isnull && PG_%s_THAN(pgfn_%s(kcxt, x, r)))\n" - " r = x;\n", - minmax->op == IS_GREATEST ? 
"GREATER" : "LESS", - dfunc->func_devname); + kexp->u.pack.offset[i] + = __appendBinaryStringInfo(&buf, karg, karg->len); + kexp->nr_args++; } - if (width < 0) - maxlen = -1; - else if (maxlen >= 0) - maxlen = Max(maxlen, width); - context->devcost += 1; + i++; } + Assert(i == nrels); - context->decl_count++; - __appendStringInfo( - &context->decl, - "DEVICE_INLINE(pg_%s_t)\n" - "__exprMinMax_%u(kern_context *kcxt", - dtype->type_name, - context->decl_count); - __appendStringInfo( - body, - "__exprMinMax_%u(kcxt", - context->decl_count); - - foreach (lc, context->used_vars) - { - devtype_info *__dtype; - Var *var = lfirst(lc); - - __dtype = pgstrom_devtype_lookup(var->vartype); - if (!__dtype) - __ELog("type %s is not device supported", - format_type_be(var->vartype)); - __appendStringInfo( - &context->decl, - ", pg_%s_t &", - __dtype->type_name); - codegen_expression_walker(context, - &context->decl, - (Node *)var, NULL); - __appendStringInfo(body, ", "); - codegen_expression_walker(context, body, (Node *)var, NULL); - - if (!list_member(used_vars_saved, var)) - used_vars_saved = lappend(used_vars_saved, var); - } - __appendStringInfo( - &context->decl, - ")\n" - "{\n" - " pg_%s_t r, x __attribute__((unused));\n" - " pg_int4_t cmp __attribute__((unused));\n" - "\n" - "%s" - " return r;\n" - "}\n\n", - dtype->type_name, - temp.data); - __appendStringInfo(body, ")"); - context->used_vars = used_vars_saved; - - pfree(temp.data); - - return maxlen; -} - -static int -codegen_relabel_expression(codegen_context *context, - StringInfo body, - RelabelType *relabel) -{ - devtype_info *dtype; - Oid stype_oid = exprType((Node *)relabel->arg); - int width; - - dtype = pgstrom_devtype_lookup_and_track(stype_oid, context); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(stype_oid)); - - dtype = pgstrom_devtype_lookup_and_track(relabel->resulttype, context); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(relabel->resulttype)); - if (!pgstrom_devtype_can_relabel(stype_oid, dtype->type_oid)) - __ELog("type %s->%s cannot be relabeled on device", - format_type_be(stype_oid), - format_type_be(relabel->resulttype)); - - __appendStringInfo(body, "to_%s(", dtype->type_name); - codegen_expression_walker(context, body, (Node *)relabel->arg, &width); - __appendStringInfoChar(body, ')'); - - return width; -} + if (kexp->nr_args > 0) + { + memcpy(buf.data, kexp, sz); + __appendKernExpMagicAndLength(&buf, 0); + result = palloc(VARHDRSZ + buf.len); + memcpy(result + VARHDRSZ, buf.data, buf.len); + SET_VARSIZE(result, VARHDRSZ + buf.len); + } + pfree(buf.data); -static int -codegen_coerceviaio_expression(codegen_context *context, - StringInfo body, - CoerceViaIO *coerce) -{ - devcast_info *dcast; - Oid stype_oid = exprType((Node *)coerce->arg); - Oid dtype_oid = coerce->resulttype; - - dcast = pgstrom_devcast_lookup(stype_oid, dtype_oid); - if (!dcast || dcast->dcast_coerceviaio_callback == NULL) - __ELog("no device support of coerceviaio (%s -> %s)", - format_type_be(stype_oid), - format_type_be(dtype_oid)); - context->devcost += 8; /* just a rough estimation */ - - return dcast->dcast_coerceviaio_callback(context, body, dcast, coerce); + return (bytea *)result; } +/* + * __try_inject_groupby_expression + */ static int -codegen_coercetodomain_expression(codegen_context *context, - StringInfo body, - CoerceToDomain *coerce_d) +__try_inject_groupby_expression(codegen_context *context, + StringInfo buf, + Expr *expr, + bool *p_found) { - devcast_info *dcast; - Oid 
stype_oid = exprType((Node *)coerce_d->arg); - Oid dtype_oid = coerce_d->resulttype; - int width; - - dcast = pgstrom_devcast_lookup(stype_oid, dtype_oid); - if (!dcast || dcast->dcast_coerceviaio_callback != NULL) - __ELog("type cast (%s -> %s) is not binary compatible", - format_type_be(stype_oid), - format_type_be(dtype_oid)); - if (!dcast->has_domain_checks) - __ELog("type cast (%s -> %s) has no domain constraint", - format_type_be(stype_oid), - format_type_be(dtype_oid)); - __appendStringInfo(body, "to_%s_domain(kcxt,", - dcast->dst_type->type_name); - codegen_expression_walker(context, body, (Node *)coerce_d->arg, &width); - __appendStringInfoChar(body, ')'); - - return width; -} + int slot_id; + bool found = false; -static int -codegen_casewhen_expression(codegen_context *context, - StringInfo body, - CaseExpr *caseexpr) -{ - devtype_info *rtype; /* result type */ - devtype_info *dtype; - StringInfoData temp; - Node *defresult; - List *used_vars_saved; - ListCell *lc; - Oid type_oid; - int width, maxlen = 0; - - /* check result type */ - rtype = pgstrom_devtype_lookup(caseexpr->casetype); - if (!rtype) - __ELog("type %s is not device supported", - format_type_be(caseexpr->casetype)); - if (caseexpr->defresult) - defresult = (Node *)caseexpr->defresult; - else + slot_id = is_expression_equals_tlist(context, expr); + if (slot_id >= 0) { - defresult = (Node *)makeConst(rtype->type_oid, - -1, - InvalidOid, - rtype->type_length, - 0UL, - true, /* NULL */ - rtype->type_byval); + found = true; } - - initStringInfo(&temp); - used_vars_saved = context->used_vars; - context->used_vars = NIL; - if (caseexpr->arg) + else { - /* type compare function internally used */ - type_oid = exprType((Node *) caseexpr->arg); - dtype = pgstrom_devtype_lookup(type_oid); - if (!dtype) - __ELog("type %s is not device supported", - format_type_be(type_oid)); + ListCell *lc1, *lc2; - __appendStringInfo(&temp, " pg_%s_t CARG = ", dtype->type_name); - codegen_expression_walker(context, &temp, - (Node *)caseexpr->arg, NULL); - __appendStringInfo(&temp, ";\n\n"); - } + slot_id = 0; + forboth (lc1, context->kvars_depth, + lc2, context->kvars_resno) + { + int depth = lfirst_int(lc1); + int resno = lfirst_int(lc2); - foreach (lc, caseexpr->args) - { - CaseWhen *casewhen = (CaseWhen *) lfirst(lc); - Expr *expr = casewhen->expr; - - Assert(IsA(casewhen, CaseWhen) && - exprType((Node *)expr) == BOOLOID && - exprType((Node *)casewhen->result) == rtype->type_oid); - __appendStringInfo(&temp, " if (EVAL("); - codegen_expression_walker(context, &temp, (Node *)expr, NULL); - __appendStringInfo(&temp, "))\n" " return "); - codegen_expression_walker(context, &temp, - (Node *)casewhen->result, - &width); - __appendStringInfo(&temp, ";\n"); - if (width < 0) - maxlen = -1; - else if (maxlen >= 0) - maxlen = Max(maxlen, width); - context->devcost += 1; - } - __appendStringInfo(&temp, " return "); - codegen_expression_walker(context, &temp, defresult, NULL); - __appendStringInfo(&temp, ";\n"); - - context->decl_count++; - __appendStringInfo( - &context->decl, - "DEVICE_INLINE(pg_%s_t)\n" - "__exprCaseWhen_%u(kern_context *kcxt", - rtype->type_name, - context->decl_count); - __appendStringInfo( - body, - "__exprCaseWhen_%u(kcxt", - context->decl_count); - - foreach (lc, context->used_vars) - { - devtype_info *__dtype; - Var *var = lfirst(lc); - - __dtype = pgstrom_devtype_lookup(var->vartype); - if (!__dtype) - __ELog("type %s is not device supported", - format_type_be(var->vartype)); - __appendStringInfo( - &context->decl, - ", 
pg_%s_t &", - __dtype->type_name); - codegen_expression_walker(context, - &context->decl, - (Node *)var, NULL); - __appendStringInfo(body, ", "); - codegen_expression_walker(context, body, (Node *)var, NULL); - - if (!list_member(used_vars_saved, var)) - used_vars_saved = lappend(used_vars_saved, var); - } - __appendStringInfo( - &context->decl, - ")\n" - "{\n" - "%s" - "}\n\n", - temp.data); - __appendStringInfo(body, ")"); - context->used_vars = used_vars_saved; - - pfree(temp.data); - - return maxlen; -} + if (depth >= 0) + continue; + if (resno > 0 && + resno <= list_length(context->tlist_dev)) + { + TargetEntry *tle = list_nth(context->tlist_dev, resno-1); -static int -codegen_casetest_expression(codegen_context *context, - StringInfo body, - CaseTestExpr *ctest) -{ - __appendStringInfo(body, "CARG"); - return 0; + if (equal(expr, tle->expr)) + { + found = true; + break; + } + } + slot_id++; + } + + if (!found) + { + TargetEntry *tle; + kern_expression kexp; + StringInfoData temp; + devtype_info *dtype; + Oid type_oid; + + /* inject expression */ + slot_id = list_length(context->kvars_depth); + tle = makeTargetEntry(copyObject(expr), + list_length(context->tlist_dev) + 1, + psprintf("slot_%u", slot_id), + true); + context->kvars_depth = lappend_int(context->kvars_depth, -1); + context->kvars_resno = lappend_int(context->kvars_resno, tle->resno); + context->kvars_types = lappend_int(context->kvars_types, InvalidOid); + context->tlist_dev = lappend(context->tlist_dev, tle); + + /* SaveExpr */ + type_oid = exprType((Node *)expr); + dtype = pgstrom_devtype_lookup(type_oid); + if (!dtype) + elog(ERROR, "type %s is not device supported", + format_type_be(type_oid)); + + initStringInfo(&temp); + memset(&kexp, 0, sizeof(kexp)); + kexp.exptype = dtype->type_code; + kexp.expflags = context->kexp_flags; + kexp.opcode = FuncOpCode__SaveExpr; + kexp.nr_args = 1; + kexp.args_offset = MAXALIGN(offsetof(kern_expression, + u.save.data)); + kexp.u.save.slot_id = slot_id; + __appendBinaryStringInfo(&temp, &kexp, kexp.args_offset); + codegen_expression_walker(context, &temp, expr); + __appendKernExpMagicAndLength(&temp, 0); + __appendBinaryStringInfo(buf, temp.data, temp.len); + pfree(temp.data); + } + } + *p_found = found; + + return slot_id; } +/* + * __codegen_groupby_expression + */ static int -codegen_scalar_array_op_expression(codegen_context *context, - StringInfo body, - ScalarArrayOpExpr *opexpr) +__codegen_groupby_expression(codegen_context *context, + StringInfo buf, + Expr *expr) { - devfunc_info *dfunc; - devtype_info *dtype_s; - devtype_info *dtype_a; - devtype_info *dtype_e; - Node *node_s; - Node *node_a; - HeapTuple fn_tup; - oidvector *fn_argtypes = alloca(offsetof(oidvector, values[2])); - - Assert(list_length(opexpr->args) == 2); - node_s = linitial(opexpr->args); - node_a = lsecond(opexpr->args); - dtype_s = pgstrom_devtype_lookup_and_track(exprType(node_s), context); - if (!dtype_s) - __ELog("type %s is not device supported", - format_type_be(exprType(node_s))); - dtype_a = pgstrom_devtype_lookup_and_track(exprType(node_a), context); - if (!dtype_a) - __ELog("type %s is not device supported", - format_type_be(exprType(node_a))); - dtype_e = dtype_a->type_element; - if (!dtype_e) - __ELog("type %s is not an array data type", - format_type_be(exprType(node_a))); - - /* lookup operator function */ - fn_tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(opexpr->opfuncid)); - if (!HeapTupleIsValid(fn_tup)) - elog(ERROR, "cache lookup failed for function %u", opexpr->opfuncid); - PG_TRY(); - 
{ - memset(fn_argtypes, 0, offsetof(oidvector, values[2])); - fn_argtypes->ndim = 1; - fn_argtypes->dataoffset = 0; - fn_argtypes->elemtype = OIDOID; - fn_argtypes->dim1 = 2; - fn_argtypes->lbound1 = 0; - fn_argtypes->values[0] = dtype_s->type_oid; - fn_argtypes->values[1] = dtype_e->type_oid; - SET_VARSIZE(fn_argtypes, offsetof(oidvector, values[2])); - - dfunc = __pgstrom_devfunc_lookup(fn_tup, - BOOLOID, - fn_argtypes, - opexpr->inputcollid); - if (!dfunc) - __ELog("function %s is not device supported", - format_procedure(opexpr->opfuncid)); - pgstrom_devfunc_track(context, dfunc); - } - PG_CATCH(); - { - ReleaseSysCache(fn_tup); - PG_RE_THROW(); - } - PG_END_TRY(); - ReleaseSysCache(fn_tup); - - __appendStringInfo(body, - "PG_SCALAR_ARRAY_OP(kcxt, pgfn_%s, ", - dfunc->func_devname); - codegen_expression_walker(context, body, node_s, NULL); - __appendStringInfo(body, ", "); - codegen_expression_walker(context, body, node_a, NULL); - __appendStringInfo(body, ", %s, %d, %d)", - opexpr->useOr ? "true" : "false", - dtype_e->type_length, - dtype_e->type_align); - /* - * Cost for PG_SCALAR_ARRAY_OP - It repeats on number of invocation - * of the operator function for each array elements. Tentatively, - * we assume one array has 32 elements in average. - */ - context->devcost += 32 * dfunc->func_devcost; + int slot_id; + bool found; - return sizeof(cl_bool); + slot_id = __try_inject_groupby_expression(context, buf, expr, &found); + if (found) + codegen_var_expression(context, buf, expr, slot_id); + return slot_id; } -static void -codegen_expression_walker(codegen_context *context, - StringInfo body, - Node *node, int *p_width) +/* + * codegen_build_groupby_keyhash + */ +static List * +codegen_build_groupby_keyhash(codegen_context *context, + pgstromPlanInfo *pp_info) { - devfunc_info *dfunc; - int width = 0; - Node *__codegen_saved_node; + StringInfoData buf; + List *groupby_keys_input_slot = NIL; + kern_expression kexp; + char *xpucode; + ListCell *cell; - if (node == NULL) - return; - /* save the current node for error message */ - __codegen_saved_node = __codegen_current_node; - __codegen_current_node = node; + Assert(pp_info->groupby_keys != NIL); - switch (nodeTag(node)) + /* + * Add variable slots to reference grouping-keys from the input and + * kds_final buffer. 
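+ * Each grouping-key thus ends up with a pair of kvars slots: an input-side
+ * slot (kvars_depth = -1; injected by __try_inject_groupby_expression above
+ * unless the key is already on the kvars list), and a final-side slot
+ * (kvars_depth = -2; added by codegen_build_groupby_keyload below).
+ * codegen_build_groupby_keycomp then joins each pair with the device equal
+ * function of the key type. For illustration, a single int4 key yields
+ * roughly int4eq(Var[i_slot_id], Var[f_slot_id]); the BoolExpr_And wrapper
+ * is added only when two or more grouping-keys exist.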
+ */ + initStringInfo(&buf); + memset(&kexp, 0, sizeof(kexp)); + kexp.exptype = TypeOpCode__int4; + kexp.expflags = context->kexp_flags; + kexp.opcode = FuncOpCode__HashValue; + kexp.nr_args = list_length(pp_info->groupby_keys); + kexp.args_offset = MAXALIGN(SizeOfKernExpr(0)); + __appendBinaryStringInfo(&buf, &kexp, kexp.args_offset); + foreach (cell, pp_info->groupby_keys) { - case T_Const: - width = codegen_const_expression(context, body, (Const *) node); - break; - - case T_Param: - width = codegen_param_expression(context, body, (Param *) node); - break; - - case T_Var: - width = codegen_varnode_expression(context, body, (Var *) node); - break; - - case T_FuncExpr: - { - FuncExpr *func = (FuncExpr *) node; - - dfunc = pgstrom_devfunc_lookup(func->funcid, - func->funcresulttype, - func->args, - func->inputcollid); - if (!dfunc) - __ELog("function %s is not device supported", - format_procedure(func->funcid)); - pgstrom_devfunc_track(context, dfunc); - width = codegen_function_expression(context, - body, - dfunc, - func->args); - context->devcost += dfunc->func_devcost; - } - break; - - case T_OpExpr: - case T_DistinctExpr: - { - OpExpr *op = (OpExpr *) node; - Oid func_oid = get_opcode(op->opno); - - dfunc = pgstrom_devfunc_lookup(func_oid, - op->opresulttype, - op->args, - op->inputcollid); - if (!dfunc) - __ELog("function %s is not device supported", - format_procedure(func_oid)); - pgstrom_devfunc_track(context, dfunc); - width = codegen_function_expression(context, - body, - dfunc, - op->args); - context->devcost += dfunc->func_devcost; - } - break; - - case T_NullTest: - width = codegen_nulltest_expression(context, - body, - (NullTest *) node); - break; - - case T_BooleanTest: - width = codegen_booleantest_expression(context, - body, - (BooleanTest *) node); - break; - - case T_BoolExpr: - width = codegen_bool_expression(context, - body, - (BoolExpr *) node); - break; + Expr *key = lfirst(cell); + int slot_id = __codegen_groupby_expression(context, &buf, key); - case T_CoalesceExpr: - width = codegen_coalesce_expression(context, - body, - (CoalesceExpr *) node); - break; + groupby_keys_input_slot = lappend_int(groupby_keys_input_slot, slot_id); + } + __appendKernExpMagicAndLength(&buf, 0); - case T_MinMaxExpr: - width = codegen_minmax_expression(context, - body, - (MinMaxExpr *) node); - break; + xpucode = palloc(VARHDRSZ + buf.len); + memcpy(xpucode + VARHDRSZ, buf.data, buf.len); + SET_VARSIZE(xpucode, VARHDRSZ + buf.len); + pp_info->kexp_groupby_keyhash = (bytea *)xpucode; - case T_RelabelType: - width = codegen_relabel_expression(context, - body, - (RelabelType *) node); - break; + return groupby_keys_input_slot; +} - case T_CoerceViaIO: - width = codegen_coerceviaio_expression(context, - body, - (CoerceViaIO *) node); - break; +/* + * codegen_build_groupby_keyload + */ +static List * +codegen_build_groupby_keyload(codegen_context *context, + pgstromPlanInfo *pp_info) +{ + kern_expression *kexp; + List *groupby_keys_final_slot = NIL; + char *xpucode = NULL; + ListCell *lc1, *lc2; - case T_CoerceToDomain: - width = codegen_coercetodomain_expression(context, - body, - (CoerceToDomain *) node); - break; + Assert(pp_info->groupby_keys != NIL); - case T_CaseExpr: - width = codegen_casewhen_expression(context, - body, - (CaseExpr *) node); - break; + foreach (lc1, pp_info->groupby_keys) + { + Expr *key = lfirst(lc1); - case T_CaseTestExpr: - width = codegen_casetest_expression(context, - body, - (CaseTestExpr *) node); - break; + foreach (lc2, context->tlist_dev) + { + TargetEntry 
*tle = lfirst(lc2); + int slot_id; - case T_ScalarArrayOpExpr: - width = codegen_scalar_array_op_expression(context, - body, - (ScalarArrayOpExpr *) node); - break; - default: - __ELog("Bug? unsupported expression: %s", nodeToString(node)); + if (tle->resjunk || !equal(key, tle->expr)) + continue; + slot_id = list_length(context->kvars_depth); + groupby_keys_final_slot = lappend_int(groupby_keys_final_slot, slot_id); + context->kvars_depth = lappend_int(context->kvars_depth, -2); + context->kvars_resno = lappend_int(context->kvars_resno, tle->resno); + context->kvars_types = lappend_oid(context->kvars_types, InvalidOid); break; + } + if (!lc2) + elog(ERROR, "Bug? group-by key is missing on the tlist_dev"); + } + kexp = __codegen_build_loadvars_one(context, -2); + if (kexp) + { + xpucode = palloc(VARHDRSZ + kexp->len); + memcpy(xpucode + VARHDRSZ, kexp, kexp->len); + SET_VARSIZE(xpucode, VARHDRSZ + kexp->len); + pfree(kexp); } - if (p_width) - *p_width = width; - /* restore */ - __codegen_current_node = __codegen_saved_node; + pp_info->kexp_groupby_keyload = (bytea *)xpucode; + + return groupby_keys_final_slot; } -char * -pgstrom_codegen_expression(Node *expr, codegen_context *context) +/* + * codegen_build_groupby_keycomp + */ +static void +codegen_build_groupby_keycomp(codegen_context *context, + pgstromPlanInfo *pp_info, + List *groupby_keys_input_slot, + List *groupby_keys_final_slot) { - StringInfoData body; - devtype_info *dtype; + StringInfoData buf; + kern_expression kexp; + size_t sz; + char *xpucode; + ListCell *lc1, *lc2, *lc3; + + Assert(pp_info->groupby_keys != NIL); + + initStringInfo(&buf); + forthree (lc1, pp_info->groupby_keys, + lc2, groupby_keys_input_slot, + lc3, groupby_keys_final_slot) + { + Expr *key = lfirst(lc1); + int i_slot_id = lfirst_int(lc2); + int f_slot_id = lfirst_int(lc3); + Oid type_oid = exprType((Node *)key); + Oid coll_oid = exprCollation((Node *)key); + int pos, __pos; + devtype_info *dtype; + devfunc_info *dfunc; - initStringInfo(&body); - if (IsA(expr, List)) - { - if (list_length((List *)expr) == 1) - expr = (Node *)linitial((List *)expr); + dtype = pgstrom_devtype_lookup(type_oid); + if (!dtype) + elog(ERROR, "type %s is not device supported", + format_type_be(type_oid)); + dfunc = devtype_lookup_equal_func(dtype, coll_oid); + if (!dfunc) + elog(ERROR, "type %s has no device executable equal function", + format_type_be(type_oid)); + Assert(dfunc->func_rettype->type_code == TypeOpCode__bool && + dfunc->func_nargs == 2 && + dfunc->func_argtypes[0]->type_oid == type_oid && + dfunc->func_argtypes[1]->type_oid == type_oid); + memset(&kexp, 0, sizeof(kern_expression)); + kexp.exptype = dfunc->func_rettype->type_code; + kexp.expflags = context->kexp_flags; + kexp.opcode = dfunc->func_code; + kexp.nr_args = 2; + kexp.args_offset = SizeOfKernExpr(0); + pos = __appendBinaryStringInfo(&buf, &kexp, kexp.args_offset); + + /* input variable */ + memset(&kexp, 0, sizeof(kern_expression)); + kexp.exptype = dtype->type_code; + kexp.expflags = context->kexp_flags; + kexp.opcode = FuncOpCode__VarExpr; + kexp.u.v.var_typlen = dtype->type_length; + kexp.u.v.var_typbyval = dtype->type_byval; + kexp.u.v.var_typalign = dtype->type_align; + kexp.u.v.var_slot_id = i_slot_id; + __pos = __appendBinaryStringInfo(&buf, &kexp, SizeOfKernExprVar); + __appendKernExpMagicAndLength(&buf, __pos); /* end of VarExpr */ + + /* final variable */ + memset(&kexp, 0, sizeof(kern_expression)); + kexp.exptype = dtype->type_code; + kexp.expflags = context->kexp_flags; + kexp.opcode = 
FuncOpCode__VarExpr; + kexp.u.v.var_typlen = dtype->type_length; + kexp.u.v.var_typbyval = dtype->type_byval; + kexp.u.v.var_typalign = dtype->type_align; + kexp.u.v.var_slot_id = f_slot_id; + __pos = __appendBinaryStringInfo(&buf, &kexp, SizeOfKernExprVar); + __appendKernExpMagicAndLength(&buf, __pos); /* end of VarExpr */ + + __appendKernExpMagicAndLength(&buf, pos); /* end of FuncExpr */ + } + + if (list_length(pp_info->groupby_keys) > 1) + { + kern_expression *payload = (kern_expression *)buf.data; + int payload_sz = buf.len; + + initStringInfo(&buf); + memset(&kexp, 0, sizeof(kexp)); + kexp.exptype = TypeOpCode__bool; + kexp.expflags = context->kexp_flags; + kexp.opcode = FuncOpCode__BoolExpr_And; + kexp.nr_args = list_length(pp_info->groupby_keys); + kexp.args_offset = SizeOfKernExpr(0); + __appendBinaryStringInfo(&buf, &kexp, SizeOfKernExpr(0)); + __appendBinaryStringInfo(&buf, payload, payload_sz); + __appendKernExpMagicAndLength(&buf, 0); + pfree(payload); + } + sz = ((kern_expression *)buf.data)->len; + + xpucode = palloc(VARHDRSZ + sz); + memcpy(xpucode + VARHDRSZ, buf.data, sz); + SET_VARSIZE(xpucode, VARHDRSZ + sz); + pfree(buf.data); + + pp_info->kexp_groupby_keycomp = (bytea *)xpucode; +} + +/* + * __codegen_build_groupby_actions + */ +static void +__codegen_build_groupby_actions(codegen_context *context, + pgstromPlanInfo *pp_info) +{ + StringInfoData buf; + int nattrs = list_length(pp_info->groupby_actions); + int nexprs = 0; + int index = 0; + size_t head_sz = MAXALIGN(offsetof(kern_expression, u.pagg.desc[nattrs])); + char *xpucode; + ListCell *lc1, *lc2; + kern_expression *kexp; + + kexp = alloca(head_sz); + memset(kexp, 0, head_sz); + kexp->exptype = TypeOpCode__int4; + kexp->expflags = context->kexp_flags; + kexp->opcode = FuncOpCode__AggFuncs; + kexp->nr_args = 0; + kexp->args_offset = head_sz; + kexp->u.pagg.nattrs = nattrs; + + initStringInfo(&buf); + foreach (lc1, pp_info->groupby_actions) + { + /* MEMO: context->tlist_dev may be updated in the loop, so we cannot use + * forboth() macro here. + */ + TargetEntry *tle = list_nth(context->tlist_dev, index); + int action = lfirst_int(lc1); + int slot_id; + bool inject_new; + kern_aggregate_desc *desc; + + Assert(!tle->resjunk); + desc = &kexp->u.pagg.desc[index++]; + desc->action = action; + if (action == KAGG_ACTION__VREF) + { + slot_id = __try_inject_projection_expression(context, + &buf, + tle->expr, + false, + &inject_new); + if (slot_id < 0) + elog(ERROR, "Bug? grouping-key is not on the kvars-slot"); + if (inject_new) + nexprs++; + desc->arg0_slot_id = slot_id; + } else - expr = (Node *)make_andclause((List *)expr); + { + FuncExpr *func = (FuncExpr *)tle->expr; + int count = 0; + + Assert(IsA(func, FuncExpr) && list_length(func->args) <= 2); + foreach (lc2, func->args) + { + Expr *fn_arg = lfirst(lc2); + + slot_id = __try_inject_projection_expression(context, + &buf, + fn_arg, + false, + &inject_new); + if (slot_id < 0) + elog(ERROR, "Bug? partial-aggregate-function argument is missing"); + if (inject_new) + nexprs++; + if (count == 0) + desc->arg0_slot_id = slot_id; + else if (count == 1) + desc->arg1_slot_id = slot_id; + else + elog(ERROR, "Bug? 
too many partial function arguments"); + count++; + } + } } + Assert(index == nattrs); - PG_TRY(); + if (nexprs == 0) { - codegen_expression_walker(context, &body, expr, NULL); + Assert(buf.len == 0); + __appendBinaryStringInfo(&buf, kexp, head_sz); + __appendKernExpMagicAndLength(&buf, 0); } - PG_CATCH(); + else { - errdetail("problematic expression: %s", nodeToString(expr)); - PG_RE_THROW(); - } - PG_END_TRY(); + char *payload = buf.data; + size_t payload_sz = buf.len; - /* - * Even if expression itself needs no varlena extra buffer, projection - * code may require the buffer to construct a temporary datum. - * E.g) Numeric datum is encoded to 128bit at the GPU kernel, however, - * projection needs to decode to varlena again. - */ - dtype = pgstrom_devtype_lookup(exprType((Node *) expr)); - if (dtype) - context->extra_bufsz += MAXALIGN(dtype->extra_sz); + kexp->nr_args = nexprs; + initStringInfo(&buf); + __appendBinaryStringInfo(&buf, kexp, head_sz); + __appendBinaryStringInfo(&buf, payload, payload_sz); + __appendKernExpMagicAndLength(&buf, 0); + pfree(payload); + } + xpucode = palloc(VARHDRSZ + buf.len); + memcpy(xpucode + VARHDRSZ, buf.data, buf.len); + SET_VARSIZE(xpucode, VARHDRSZ + buf.len); + pfree(buf.data); - return body.data; + pp_info->kexp_groupby_actions = (bytea *)xpucode; } /* - * pgstrom_union_type_declarations - * - * put declaration of a union type which contains all the types in type_oid_list, - * as follows. OID of device types should be unique, must not duplicated. - * - * union { - * pg_bool_t bool_v; - * pg_text_t text_v; - * : - * } NAME; + * codegen_build_groupby_actions */ void -pgstrom_union_type_declarations(StringInfo buf, - const char *name, - List *type_oid_list) +codegen_build_groupby_actions(codegen_context *context, + pgstromPlanInfo *pp_info) { - ListCell *lc; - devtype_info *dtype; - bool meet_array_v = false; + List *groupby_keys_input_slot = NIL; + List *groupby_keys_final_slot = NIL; - if (type_oid_list == NIL) - return; - appendStringInfo(buf, " union {\n"); - foreach (lc, type_oid_list) + if (pp_info->groupby_keys != NIL) { - Oid type_oid = lfirst_oid(lc); - - dtype = pgstrom_devtype_lookup(type_oid); - if (!dtype) - __ELog("failed to lookup device type: %u", type_oid); - /* - * All the array types have same device type name (pg_array_t) - * regardless of the element type. So, we have to avoid duplication - * of the field name in union, by special handling. - */ - if (dtype->type_element) - { - if (meet_array_v) - continue; - meet_array_v = true; - } - appendStringInfo(buf, - " pg_%s_t %s_v;\n", - dtype->type_name, - dtype->type_name); + groupby_keys_input_slot = codegen_build_groupby_keyhash(context, pp_info); + groupby_keys_final_slot = codegen_build_groupby_keyload(context, pp_info); + codegen_build_groupby_keycomp(context, pp_info, + groupby_keys_input_slot, + groupby_keys_final_slot); } - appendStringInfo(buf, " } %s __attribute__((unused));\n", name); + __codegen_build_groupby_actions(context, pp_info); } /* - * __pgstrom_device_expression - * - * It shows a quick decision whether the provided expression tree is - * available to run on CUDA device, or not. 
+ * pgstrom_xpu_expression */ bool -__pgstrom_device_expression(PlannerInfo *root, - RelOptInfo *baserel, - Expr *expr, - int *p_devcost, int *p_extra_sz, - const char *filename, int lineno) +pgstrom_xpu_expression(Expr *expr, + uint32_t task_kind, + List *input_rels_tlist, + int *p_devcost) { - MemoryContext memcxt = CurrentMemoryContext; - codegen_context con; - int dummy = 0; - bool result = true; + codegen_context context; + + Assert((task_kind & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU || + (task_kind & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU); + memset(&context, 0, sizeof(context)); + context.elevel = DEBUG2; + context.top_expr = expr; + context.required_flags = (task_kind & DEVKIND__ANY); + context.input_rels_tlist = input_rels_tlist; if (!expr) return false; - pgstrom_init_codegen_context(&con, root, baserel); - PG_TRY(); + if (IsA(expr, List)) { - if (IsA(expr, List)) - { - List *exprsList = (List *)expr; - ListCell *lc; - - foreach (lc, exprsList) - { - Node *node = (Node *)lfirst(lc); + List *l = (List *)expr; - codegen_expression_walker(&con, NULL, node, &dummy); - } - } + if (list_length(l) == 1) + expr = linitial(l); else - { - codegen_expression_walker(&con, NULL, (Node *)expr, &dummy); - } - } - PG_CATCH(); - { - ErrorData *edata; - - MemoryContextSwitchTo(memcxt); - edata = CopyErrorData(); - if (edata->sqlerrcode != ERRCODE_FEATURE_NOT_SUPPORTED) - PG_RE_THROW(); - - FlushErrorState(); - - ereport(DEBUG2, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("%s:%d %s, at %s:%d", - filename, lineno, - edata->message, - edata->filename, edata->lineno), - errdetail("expression: %s", - nodeToString(__codegen_current_node)))); - __codegen_current_node = NULL; - FreeErrorData(edata); - result = false; + expr = make_andclause(l); } - PG_END_TRY(); + if (codegen_expression_walker(&context, NULL, expr) < 0) + return false; + if (p_devcost) + *p_devcost = context.device_cost; + return true; +} - if (result) - { - if (con.extra_bufsz > KERN_CONTEXT_VARLENA_BUFSZ_LIMIT) - { - elog(DEBUG2, "Expression consumes too much buffer (%u): %s", - con.extra_bufsz, nodeToString(expr)); - return false; - } - Assert(con.devcost >= 0); - if (p_devcost) - *p_devcost = con.devcost; - if (p_extra_sz) - *p_extra_sz = con.extra_bufsz; - } - return result; +/* + * pgstrom_gpu_expression + * + * checks whether the expression is executable on GPU devices. + */ +bool +pgstrom_gpu_expression(Expr *expr, + List *input_rels_tlist, + int *p_devcost) +{ + return pgstrom_xpu_expression(expr, + DEVKIND__NVIDIA_GPU, + input_rels_tlist, + p_devcost); } /* - * devcast_text2numeric_callback - * ------ - * Special case handling of text->numeric values, including the case of - * jsonb key references. + * pgstrom_dpu_expression + * + * checks whether the expression is executable on DPU devices. */ -static int -devcast_text2numeric_callback(codegen_context *context, - StringInfo body, - devcast_info *dcast, - CoerceViaIO *node) +bool +pgstrom_dpu_expression(Expr *expr, + List *input_rels_tlist, + int *p_devcost) { - devtype_info *dtype = dcast->dst_type; - Expr *arg = node->arg; - Oid func_oid = InvalidOid; - List *func_args = NIL; - char dfunc_name[100]; - int width; - ListCell *lc; + return pgstrom_xpu_expression(expr, + DEVKIND__NVIDIA_DPU, + input_rels_tlist, + p_devcost); +} - /* check special case if jsonb key reference */ - if (IsA(arg, FuncExpr)) - { - FuncExpr *func = (FuncExpr *)arg; +/* + * pgstrom_xpucode_to_string + * + * transforms xPU code to human readable form. 
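+ *
+ * For illustration, a qual like "aid = 42" (an int4 key on kvars slot 0)
+ * would be rendered roughly as:
+ *   {Func::int4eq(bool) args=[{Var(int4): slot_id=0}, {Const(int4): value='42'}]}
+ * (see __xpucode_to_cstring below for the exact notation of each opcode)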
+ */ +static void +__xpucode_to_cstring(StringInfo buf, + const kern_expression *kexp, + const CustomScanState *css, /* optional */ + ExplainState *es, /* optional */ + List *dcontext); /* optionsl */ - func_oid = func->funcid; - func_args = func->args; - } - else if (IsA(arg, OpExpr) || IsA(arg, DistinctExpr)) - { - OpExpr *op = (OpExpr *)arg; +static void +__xpucode_const_cstring(StringInfo buf, const kern_expression *kexp) +{ + devtype_info *dtype = devtype_lookup_by_opcode(kexp->exptype); - func_oid = get_opcode(op->opno); - func_args = op->args; + if (kexp->u.c.const_isnull) + { + appendStringInfo(buf, "{Const(%s): value=NULL}", dtype->type_name); } else - __ELog("Not supported CoerceViaIO with jsonb key reference"); - - switch (func_oid) { - case F_JSONB_OBJECT_FIELD_TEXT: - snprintf(dfunc_name, sizeof(dfunc_name), - "jsonb_object_field_as_%s", dtype->type_name); - break; - case F_JSONB_ARRAY_ELEMENT_TEXT: - snprintf(dfunc_name, sizeof(dfunc_name), - "jsonb_array_element_as_%s", dtype->type_name); - break; - default: - __ELog("Not supported CoerceViaIO with jsonb key reference"); + int16 type_len; + bool type_byval; + char type_align; + char type_delim; + Oid type_ioparam; + Oid type_outfunc; + Datum datum = 0; + Datum label; + + get_type_io_data(kexp->u.c.const_type, + IOFunc_output, + &type_len, + &type_byval, + &type_align, + &type_delim, + &type_ioparam, + &type_outfunc); + if (type_byval) + memcpy(&datum, kexp->u.c.const_value, type_len); + else + datum = PointerGetDatum(kexp->u.c.const_value); + label = OidFunctionCall1(type_outfunc, datum); + appendStringInfo(buf, "{Const(%s): value='%s'}", + dtype->type_name, + DatumGetCString(label)); } - context->extra_flags |= DEVKERNEL_NEEDS_JSONLIB; - __appendStringInfo(body, "pgfn_%s(kcxt", dfunc_name); - foreach (lc, func_args) - { - Node *expr = lfirst(lc); - int dummy; +} - __appendStringInfo(body, ", "); - codegen_expression_walker(context, body, expr, &dummy); - } - __appendStringInfoChar(body, ')'); - if (dtype->type_length > 0) - width = dtype->type_length; - else if (dtype->type_length == -1) - width = -1; /* we don't know max length of a jsonb field */ - else - elog(ERROR, "unexpected type length: %d", dtype->type_length); +static void +__xpucode_param_cstring(StringInfo buf, const kern_expression *kexp) +{ + devtype_info *dtype = devtype_lookup_by_opcode(kexp->exptype); - return width; + appendStringInfo(buf, "{Param(%s): param_id=%u}", + dtype->type_name, + kexp->u.p.param_id); } static void -devtype_cache_invalidator(Datum arg, int cacheid, uint32 hashvalue) +__xpucode_var_cstring(StringInfo buf, const kern_expression *kexp) { - dlist_mutable_iter iter; - int hindex; + devtype_info *dtype = devtype_lookup_by_opcode(kexp->exptype); + + appendStringInfo(buf, "{Var(%s): slot_id=%d}", + dtype->type_name, + kexp->u.v.var_slot_id); +} + +static void +__xpucode_loadvars_cstring(StringInfo buf, + const kern_expression *kexp, + const CustomScanState *css, + ExplainState *es, + List *dcontext) +{ + bool verbose = false; + int depth = kexp->u.load.depth; + int i; - Assert(cacheid == TYPEOID); - if (hashvalue == 0) + Assert(kexp->nr_args == 0); + appendStringInfo(buf, "{LoadVars: depth=%d", depth); + if (kexp->u.load.nloads > 0) + appendStringInfo(buf, " kvars=["); + + if (css) { - for (hindex=0; hindex < lengthof(devtype_info_slot); hindex++) - dlist_init(&devtype_info_slot[hindex]); - return; + CustomScan *cscan = (CustomScan *)css->ss.ps.plan; + verbose = (cscan->custom_plans != NIL); } - hindex = hashvalue % 
lengthof(devtype_info_slot); - dlist_foreach_modify (iter, &devtype_info_slot[hindex]) + for (i=0; i < kexp->u.load.nloads; i++) { - devtype_info *dtype = dlist_container(devtype_info, - chain, iter.cur); - if (dtype->hashvalue == hashvalue) + const kern_vars_defitem *vitem = &kexp->u.load.kvars[i]; + + if (i > 0) + appendStringInfo(buf, ", "); + if (!css) + { + appendStringInfo(buf, "(slot_id=%u, resno=%d)", + vitem->var_slot_id, + vitem->var_resno); + } + else if (depth == 0) + { + TupleDesc tupdesc = RelationGetDescr(css->ss.ss_currentRelation); + Form_pg_attribute attr = TupleDescAttr(tupdesc, vitem->var_resno - 1); + CustomScan *cscan = (CustomScan *)css->ss.ps.plan; + Var *kvar; + + kvar = makeVar(cscan->scan.scanrelid, + attr->attnum, + attr->atttypid, + attr->atttypmod, + attr->attcollation, 0); + appendStringInfo(buf, "%u:%s", + vitem->var_slot_id, + deparse_expression((Node *)kvar, + dcontext, + verbose, false)); + pfree(kvar); + } + else if (depth < 0) + { + CustomScan *cscan = (CustomScan *)css->ss.ps.plan; + + if (vitem->var_resno >= 1 && + vitem->var_resno <= list_length(cscan->custom_scan_tlist)) + { + TargetEntry *tle = list_nth(cscan->custom_scan_tlist, + vitem->var_resno-1); + appendStringInfo(buf, "%u:%s", + vitem->var_slot_id, + deparse_expression((Node *)tle->expr, + dcontext, + verbose, false)); + } + else + { + appendStringInfo(buf, "var(slot_id=%u)", vitem->var_slot_id); + } + } + else { - dlist_delete(&dtype->chain); - memset(&dtype->chain, 0, sizeof(dlist_node)); + CustomScan *cscan = (CustomScan *)css->ss.ps.plan; + Plan *plan; + TargetEntry *tle; + + plan = list_nth(cscan->custom_plans, depth - 1); + tle = list_nth(plan->targetlist, vitem->var_resno - 1); + appendStringInfo(buf, "%u:%s", + vitem->var_slot_id, + deparse_expression((Node *)tle->expr, + dcontext, + verbose, false)); } } + if (kexp->u.load.nloads > 0) + appendStringInfo(buf, "]"); } +#if 0 static void -devfunc_cache_invalidator(Datum arg, int cacheid, uint32 hashvalue) +__xpucode_projection_cstring(StringInfo buf, + const kern_expression *kexp, + const CustomScanState *css, /* optional */ + ExplainState *es, /* optional */ + List *dcontext) { - dlist_mutable_iter iter; - int hindex; + int i, nexprs = kexp->u.proj.nexprs; - Assert(cacheid == PROCOID); - if (hashvalue == 0) - { - for (hindex=0; hindex < lengthof(devfunc_info_slot); hindex++) - dlist_init(&devfunc_info_slot[hindex]); - return; - } - hindex = hashvalue % lengthof(devfunc_info_slot); - dlist_foreach_modify (iter, &devfunc_info_slot[hindex]) + if (kexp->nr_args > 0) { - devfunc_info *dfunc = dlist_container(devfunc_info, - chain, iter.cur); - if (dfunc->hashvalue == hashvalue) + const kern_expression *karg; + + if (kexp->nr_args == 1) + appendStringInfo(buf, " arg="); + else + appendStringInfo(buf, " args=["); + for (i=0, karg = KEXP_FIRST_ARG(kexp); + i < kexp->nr_args; + i++, karg = KEXP_NEXT_ARG(karg)) { - dlist_delete(&dfunc->chain); - memset(&dfunc->chain, 0, sizeof(dlist_node)); + const kern_projection_desc *desc = &kexp->u.proj.desc[i]; + + if (!__KEXP_IS_VALID(kexp, karg)) + elog(ERROR, "XpuCode looks corrupted"); + if (i > 0) + appendStringInfo(buf, ", "); + appendStringInfo(buf, "%d:", desc->slot_id); + __xpucode_to_cstring(buf, karg, css, es, dcontext); } + if (kexp->nr_args > 1) + appendStringInfoChar(buf, ']'); } + appendStringInfoChar(buf, '}'); } +#endif static void -devcast_cache_invalidator(Datum arg, int cacheid, uint32 hashvalue) +__xpucode_aggfuncs_cstring(StringInfo buf, + const kern_expression *kexp, + const 
CustomScanState *css, /* optional */ + ExplainState *es, /* optional */ + List *dcontext) { - dlist_mutable_iter iter; - int hindex; - - Assert(cacheid == CASTSOURCETARGET); - if (hashvalue == 0) + appendStringInfo(buf, "{AggFuncs <"); + for (int j=0; j < kexp->u.pagg.nattrs; j++) { - for (hindex=0; hindex < lengthof(devcast_info_slot); hindex++) - dlist_init(&devcast_info_slot[hindex]); - return; - } + const kern_aggregate_desc *desc = &kexp->u.pagg.desc[j]; - hindex = hashvalue % lengthof(devcast_info_slot); - dlist_foreach_modify (iter, &devcast_info_slot[hindex]) - { - devcast_info *dcast = dlist_container(devcast_info, - chain, iter.cur); - if (dcast->hashvalue == hashvalue) + if (j > 0) + appendStringInfo(buf, ", "); + switch (desc->action) { - dlist_delete(&dcast->chain); - memset(&dcast->chain, 0, sizeof(dlist_node)); + case KAGG_ACTION__VREF: + appendStringInfo(buf, "vref[%d]", desc->arg0_slot_id); + break; + case KAGG_ACTION__NROWS_ANY: + appendStringInfo(buf, "nrows[*]"); + break; + case KAGG_ACTION__NROWS_COND: + appendStringInfo(buf, "nrows[%d]", + desc->arg0_slot_id); + break; + case KAGG_ACTION__PMIN_INT: + case KAGG_ACTION__PMIN_FP: + appendStringInfo(buf, "pmin[%d]", + desc->arg0_slot_id); + break; + case KAGG_ACTION__PMAX_INT: + case KAGG_ACTION__PMAX_FP: + appendStringInfo(buf, "pmax[%d]", + desc->arg0_slot_id); + break; + case KAGG_ACTION__PSUM_INT: + case KAGG_ACTION__PSUM_FP: + appendStringInfo(buf, "psum[%d]", + desc->arg0_slot_id); + break; + case KAGG_ACTION__PAVG_INT: + case KAGG_ACTION__PAVG_FP: + appendStringInfo(buf, "pavg[%d]", + desc->arg0_slot_id); + break; + case KAGG_ACTION__STDDEV: + appendStringInfo(buf, "stddev[%d]", + desc->arg0_slot_id); + break; + case KAGG_ACTION__COVAR: + appendStringInfo(buf, "covar[%d,%d]", + desc->arg0_slot_id, + desc->arg1_slot_id); + break; + default: + appendStringInfo(buf, "unknown[%d,%d]", + desc->arg0_slot_id, + desc->arg1_slot_id); + break; } } + appendStringInfo(buf, ">"); } static void -devindex_cache_invalidator(Datum arg, int cacheid, uint32 hashvalue) -{ - dlist_mutable_iter iter; - int hindex; - - Assert(cacheid == AMOPOPID); - if (hashvalue == 0) - { - for (hindex=0; hindex < lengthof(devindex_info_slot); hindex++) - dlist_init(&devindex_info_slot[hindex]); - return; +__xpucode_to_cstring(StringInfo buf, + const kern_expression *kexp, + const CustomScanState *css, /* optional */ + ExplainState *es, /* optional */ + List *dcontext) /* optional */ +{ + const kern_expression *karg; + int i, pos; + + switch (kexp->opcode) + { + case FuncOpCode__ConstExpr: + __xpucode_const_cstring(buf, kexp); + return; + case FuncOpCode__ParamExpr: + __xpucode_param_cstring(buf, kexp); + return; + case FuncOpCode__VarExpr: + __xpucode_var_cstring(buf, kexp); + return; + case FuncOpCode__Projection: + appendStringInfo(buf, "{Projection <"); + for (int j=0; j < kexp->u.proj.nattrs; j++) + { + const kern_projection_desc *desc = &kexp->u.proj.desc[j]; + if (j > 0) + appendStringInfoChar(buf, ','); + appendStringInfo(buf, "%d", desc->slot_id); + } + appendStringInfo(buf, ">"); + break; + case FuncOpCode__LoadVars: + __xpucode_loadvars_cstring(buf, kexp, css, es, dcontext); + break; + case FuncOpCode__HashValue: + appendStringInfo(buf, "{HashValue"); + break; + case FuncOpCode__JoinQuals: + appendStringInfo(buf, "{JoinQuals: "); + for (i=0, karg=KEXP_FIRST_ARG(kexp); + i < kexp->nr_args; + i++, karg=KEXP_NEXT_ARG(karg)) + { + if (!__KEXP_IS_VALID(kexp,karg)) + elog(ERROR, "XpuCode looks corrupted"); + appendStringInfo(buf, "%s ", i > 0 ? 
"," : ""); + if ((karg->expflags & KEXP_FLAG__IS_PUSHED_DOWN) != 0) + appendStringInfoChar(buf, '<'); + __xpucode_to_cstring(buf, karg, css, es, dcontext); + if ((karg->expflags & KEXP_FLAG__IS_PUSHED_DOWN) != 0) + appendStringInfoChar(buf, '>'); + } + appendStringInfo(buf, "}"); + return; + case FuncOpCode__SaveExpr: + appendStringInfo(buf, "{SaveExpr slot=%d:", + kexp->u.save.slot_id); + break; + case FuncOpCode__AggFuncs: + __xpucode_aggfuncs_cstring(buf, kexp, css, es, dcontext); + break; + case FuncOpCode__Packed: + appendStringInfo(buf, "{Packed"); + pos = buf->len; + for (i=0; i < kexp->u.pack.npacked; i++) + { + karg = __PICKUP_PACKED_KEXP(kexp, i); + if (!karg) + continue; + if (!__KEXP_IS_VALID(kexp,karg)) + elog(ERROR, "XpuCode looks corrupted"); + if (buf->len > pos) + appendStringInfoChar(buf,','); + appendStringInfo(buf, " items[%u]=", i); + __xpucode_to_cstring(buf, karg, css, es, dcontext); + } + appendStringInfo(buf, "}"); + return; + case FuncOpCode__BoolExpr_And: + appendStringInfo(buf, "{Bool::AND"); + break; + case FuncOpCode__BoolExpr_Or: + appendStringInfo(buf, "{Bool::OR"); + break; + case FuncOpCode__BoolExpr_Not: + appendStringInfo(buf, "{Bool::NOT"); + break; + case FuncOpCode__NullTestExpr_IsNull: + appendStringInfo(buf, "{IsNull"); + break; + case FuncOpCode__NullTestExpr_IsNotNull: + appendStringInfo(buf, "{IsNotNull"); + break; + case FuncOpCode__BoolTestExpr_IsTrue: + appendStringInfo(buf, "{BoolTest::IsTrue"); + break; + case FuncOpCode__BoolTestExpr_IsNotTrue: + appendStringInfo(buf, "{BoolTest::IsNotTrue"); + break; + case FuncOpCode__BoolTestExpr_IsFalse: + appendStringInfo(buf, "{BoolTest::IsFalse"); + break; + case FuncOpCode__BoolTestExpr_IsNotFalse: + appendStringInfo(buf, "{BoolTest::IsNotFalse"); + break; + case FuncOpCode__BoolTestExpr_IsUnknown: + appendStringInfo(buf, "{BoolTest::IsUnknown"); + break; + case FuncOpCode__BoolTestExpr_IsNotUnknown: + appendStringInfo(buf, "{BoolTest::IsNotUnknown"); + break; + default: + { + devtype_info *dtype = devtype_lookup_by_opcode(kexp->exptype); + devfunc_info *dfunc = devfunc_lookup_by_opcode(kexp->opcode); + + appendStringInfo(buf, "{Func::%s(%s)", + dfunc->func_name, + dtype->type_name); + } + break; } - - hindex = hashvalue % lengthof(devcast_info_slot); - dlist_foreach_modify (iter, &devcast_info_slot[hindex]) + if (kexp->nr_args > 0) { - devindex_info *dindex = dlist_container(devindex_info, - chain, iter.cur); - if (dindex->hashvalue == hashvalue) + if (kexp->nr_args == 1) + appendStringInfo(buf, " arg="); + else + appendStringInfo(buf, " args=["); + + for (i=0, karg=KEXP_FIRST_ARG(kexp); + i < kexp->nr_args; + i++, karg=KEXP_NEXT_ARG(karg)) { - dlist_delete(&dindex->chain); - memset(&dindex->chain, 0, sizeof(dlist_node)); + if (!__KEXP_IS_VALID(kexp,karg)) + elog(ERROR, "XpuCode looks corrupted"); + if (i > 0) + appendStringInfo(buf, ", "); + __xpucode_to_cstring(buf, karg, css, es, dcontext); } + if (kexp->nr_args > 1) + appendStringInfoChar(buf, ']'); } + appendStringInfoChar(buf, '}'); } void -pgstrom_init_codegen_context(codegen_context *context, - PlannerInfo *root, - RelOptInfo *baserel) +pgstrom_explain_xpucode(const CustomScanState *css, + ExplainState *es, + List *dcontext, + const char *label, + bytea *xpucode) { - memset(context, 0, sizeof(codegen_context)); - initStringInfo(&context->decl); - context->root = root; - context->baserel = baserel; + StringInfoData buf; + + if (xpucode) + { + const kern_expression *kexp = (const kern_expression *)VARDATA(xpucode); + + initStringInfo(&buf); + 
__xpucode_to_cstring(&buf, kexp, css, es, dcontext); + ExplainPropertyText(label, buf.data, es); + pfree(buf.data); + } } -void -pgstrom_init_codegen(void) +char * +pgstrom_xpucode_to_string(bytea *xpu_code) { - int i; + StringInfoData buf; + + initStringInfo(&buf); + __xpucode_to_cstring(&buf, (const kern_expression *)VARDATA(xpu_code), + NULL, NULL, NIL); - for (i=0; i < lengthof(devtype_info_slot); i++) - dlist_init(&devtype_info_slot[i]); - for (i=0; i < lengthof(devfunc_info_slot); i++) - dlist_init(&devfunc_info_slot[i]); - for (i=0; i < lengthof(devcast_info_slot); i++) - dlist_init(&devcast_info_slot[i]); - for (i=0; i < lengthof(devindex_info_slot); i++) - dlist_init(&devindex_info_slot[i]); + return buf.data; +} + +static void +pgstrom_devcache_invalidator(Datum arg, int cacheid, uint32 hashvalue) +{ + MemoryContextReset(devinfo_memcxt); + memset(devtype_info_slot, 0, sizeof(List *) * DEVTYPE_INFO_NSLOTS); + memset(devtype_code_slot, 0, sizeof(List *) * DEVTYPE_INFO_NSLOTS); + memset(devfunc_info_slot, 0, sizeof(List *) * DEVFUNC_INFO_NSLOTS); + memset(devfunc_code_slot, 0, sizeof(List *) * DEVFUNC_INFO_NSLOTS); +} +void +pgstrom_init_codegen(void) +{ devinfo_memcxt = AllocSetContextCreate(CacheMemoryContext, "device type/func info cache", ALLOCSET_DEFAULT_SIZES); - CacheRegisterSyscacheCallback(PROCOID, devfunc_cache_invalidator, 0); - CacheRegisterSyscacheCallback(TYPEOID, devtype_cache_invalidator, 0); - CacheRegisterSyscacheCallback(CASTSOURCETARGET, devcast_cache_invalidator, 0); - CacheRegisterSyscacheCallback(AMOPOPID, devindex_cache_invalidator, 0); + pgstrom_devcache_invalidator(0, 0, 0); + CacheRegisterSyscacheCallback(TYPEOID, pgstrom_devcache_invalidator, 0); + CacheRegisterSyscacheCallback(PROCOID, pgstrom_devcache_invalidator, 0); } diff --git a/src/cuda_common.h b/src/cuda_common.h index 0a593cda2..aa7e25881 100644 --- a/src/cuda_common.h +++ b/src/cuda_common.h @@ -1,283 +1,25 @@ /* * cuda_common.h * - * A common header for CUDA device code - * -- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Common header for CUDA device code, in addition to xPU common definitions. + * ---- + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. */ #ifndef CUDA_COMMON_H #define CUDA_COMMON_H +#include "xpu_common.h" -/* ---- Check minimum required CUDA version ---- */ -#ifdef __CUDACC__ -#if __CUDACC_VER_MAJOR__ < 9 || \ - (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ < 2) -#error PG-Strom requires CUDA 9.2 or later. Use newer version. -#endif /* >=CUDA9.2 */ -#include -#include -#endif /* __CUDACC__ */ - -/* check MAXIMUM_ALIGNOF */ -#if MAXIMUM_ALIGNOF == 8 -#define MAXIMUM_ALIGNOF_SHIFT 3 -#else -#error Unexpected MAXIMUM_ALIGNOF definition -#endif +#define WARPSIZE 32 +#define MAXTHREADS_PER_BLOCK 1024 +#define MAXWARPS_PER_BLOCK (MAXTHREADS_PER_BLOCK / WARPSIZE) +#if defined(__CUDACC__) /* - * Basic type definition - because of historical reason, we use "cl_" - * prefix for the definition of data types below. It might imply - * something related to OpenCL, but what we intend at this moment is - * "CUDA Language". 
- */ -typedef char cl_bool; -typedef char cl_char; -typedef unsigned char cl_uchar; -typedef short cl_short; -typedef unsigned short cl_ushort; -typedef int cl_int; -typedef unsigned int cl_uint; -#ifdef __CUDACC__ -typedef long long cl_long; -typedef unsigned long long cl_ulong; -#else /* __CUDACC__ */ -typedef long cl_long; -typedef unsigned long cl_ulong; -#endif /* !__CUDACC__ */ -#ifdef __CUDACC__ -#include -typedef __half cl_half; -#else -/* Host code has no __half definition, so put dummy definition */ -typedef unsigned short cl_half; -#endif /* __CUDACC__ */ -typedef float cl_float; -typedef double cl_double; - -#define CL_SHORT_NBITS (sizeof(cl_short) * BITS_PER_BYTE) -#define CL_INT_NBITS (sizeof(cl_int) * BITS_PER_BYTE) -#define CL_LONG_NBITS (sizeof(cl_long) * BITS_PER_BYTE) - -/* PG's utility macros */ -#ifdef __CUDACC__ -#ifdef offsetof -#undef offsetof -#endif /* offsetof */ -#define offsetof(TYPE,FIELD) ((long) &((TYPE *)0UL)->FIELD) - -/* - * At CUDA10, we found nvcc replaces the offsetof above by __builtin_offsetof - * regardless of our macro definitions. It is mostly equivalent, however, it - * does not support offset calculation which includes run-time values. - * E.g) offsetof(kds, colmeta[kds->ncols]) made an error. - */ -#ifdef __NVCC__ -#define __builtin_offsetof(TYPE,FIELD) ((long) &((TYPE *)0UL)->FIELD) -#endif /* __NVCC__ */ - -#ifdef lengthof -#undef lengthof -#endif -#define lengthof(ARRAY) (sizeof(ARRAY) / sizeof((ARRAY)[0])) - -#ifdef container_of -#undef container_of -#endif -#define container_of(TYPE,FIELD,PTR) \ - ((TYPE *)((char *) (PTR) - offsetof(TYPE, FIELD))) - -#ifndef true -#define true ((cl_bool) 1) -#endif -#ifndef false -#define false ((cl_bool) 0) -#endif -#ifdef __CUDACC__ -#undef FLEXIBLE_ARRAY_MEMBER -#define FLEXIBLE_ARRAY_MEMBER 1 -#elif !defined(FLEXIBLE_ARRAY_MEMBER) -#define FLEXIBLE_ARRAY_MEMBER 1 -#endif /* __CUDACC__ */ - -/* - * If NVCC includes this file, some inline function needs declarations of - * basic utility functions. - */ -#ifndef __CUDACC_RTC__ -#include -#include -#endif /* __CUDACC_RTC__ */ - -#define Assert(cond) assert(cond) - -/* Another basic type definitions */ -typedef cl_ulong hostptr_t; -typedef cl_ulong Datum; -typedef struct nameData -{ - char data[NAMEDATALEN]; -} NameData; - -#define PointerGetDatum(X) ((Datum) (X)) -#define DatumGetPointer(X) ((char *) (X)) - -#define SET_1_BYTE(value) (((Datum) (value)) & 0x000000ffL) -#define SET_2_BYTES(value) (((Datum) (value)) & 0x0000ffffL) -#define SET_4_BYTES(value) (((Datum) (value)) & 0xffffffffL) -#define SET_8_BYTES(value) ((Datum) (value)) - -#define READ_INT8_PTR(addr) SET_1_BYTE(*((cl_uchar *)(addr))) -#define READ_INT16_PTR(addr) SET_2_BYTES(*((cl_ushort *)(addr))) -#define READ_INT32_PTR(addr) SET_4_BYTES(*((cl_uint *)(addr))) -#define READ_INT64_PTR(addr) SET_8_BYTES(*((cl_ulong *)(addr))) - -#define INT64CONST(x) ((cl_long) x##L) -#define UINT64CONST(x) ((cl_ulong) x##UL) - -#define Max(a,b) ((a) > (b) ? (a) : (b)) -#define Max3(a,b,c) ((a) > (b) ? Max((a),(c)) : Max((b),(c))) -#define Max4(a,b,c,d) Max(Max((a),(b)),Max((c),(d))) - -#define Min(a,b) ((a) < (b) ? (a) : (b)) -#define Min3(a,b,c) ((a) < (b) ? Min((a),(c)) : Min((b),(c))) -#define Min4(a,b,c,d) Min(Min((a),(b)),Min((c),(d))) - -#define Add(a,b) ((a) + (b)) -#define Add3(a,b,c) ((a) + (b) + (c)) -#define Add4(a,b,c,d) ((a) + (b) + (c) + (d)) - -#define Compare(a,b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0)) - -/* same as host side get_next_log2() */ -#define get_next_log2(value) \ - ((value) == 0 ? 
0 : (sizeof(cl_ulong) * BITS_PER_BYTE - \ - __clzll((cl_ulong)(value) - 1))) -/* - * Limitation of types - */ -#ifndef SCHAR_MAX -#define SCHAR_MAX 127 -#endif -#ifndef SCHAR_MIN -#define SCHAR_MIN (-128) -#endif -#ifndef UCHAR_MAX -#define UCHAR_MAX 255 -#endif -#ifndef SHRT_MAX -#define SHRT_MAX 32767 -#endif -#ifndef SHRT_MIN -#define SHRT_MIN (-32767-1) -#endif -#ifndef USHRT_MAX -#define USHRT_MAX 65535 -#endif -#ifndef INT_MAX -#define INT_MAX 2147483647 -#endif -#ifndef INT_MIN -#define INT_MIN (-INT_MAX - 1) -#endif -#ifndef UINT_MAX -#define UINT_MAX 4294967295U -#endif -#ifndef LONG_MAX -#define LONG_MAX 0x7FFFFFFFFFFFFFFFLL -#endif -#ifndef LONG_MIN -#define LONG_MIN (-LONG_MAX - 1LL) -#endif -#ifndef ULONG_MAX -#define ULONG_MAX 0xFFFFFFFFFFFFFFFFULL -#endif -#ifndef HALF_MAX -#define HALF_MAX __short_as_half(0x7bff) -#endif -#ifndef HALF_MIN -#define HALF_MIN __short_as_half(0x0400) -#endif -#ifndef HALF_INFINITY -#define HALF_INFINITY __short_as_half(0x0x7c00) -#endif -#ifndef FLT_MAX -#define FLT_MAX __int_as_float(0x7f7fffffU) -#endif -#ifndef FLT_MIN -#define FLT_MIN __int_as_float(0x00800000U) -#endif -#ifndef FLT_INFINITY -#define FLT_INFINITY __int_as_float(0x7f800000U) -#endif -#ifndef FLT_NAN -#define FLT_NAN __int_as_float(0x7fffffffU) -#endif -#ifndef DBL_MAX -#define DBL_MAX __longlong_as_double(0x7fefffffffffffffULL) -#endif -#ifndef DBL_MIN -#define DBL_MIN __longlong_as_double(0x0010000000000000ULL) -#endif -#ifndef DBL_INFINITY -#define DBL_INFINITY __longlong_as_double(0x7ff0000000000000ULL) -#endif -#ifndef DBL_NAN -#define DBL_NAN __longlong_as_double(0x7fffffffffffffffULL) -#endif - -/* - * Alignment macros - */ -#define TYPEALIGN(ALIGNVAL,LEN) \ - (((cl_ulong) (LEN) + ((ALIGNVAL) - 1)) & ~((cl_ulong) ((ALIGNVAL) - 1))) -#define TYPEALIGN_DOWN(ALIGNVAL,LEN) \ - (((cl_ulong) (LEN)) & ~((cl_ulong) ((ALIGNVAL) - 1))) -#define INTALIGN(LEN) TYPEALIGN(sizeof(cl_int), (LEN)) -#define INTALIGN_DOWN(LEN) TYPEALIGN_DOWN(sizeof(cl_int), (LEN)) -#define LONGALIGN(LEN) TYPEALIGN(sizeof(cl_long), (LEN)) -#define LONGALIGN_DOWN(LEN) TYPEALIGN_DOWN(sizeof(cl_long), (LEN)) -#define MAXALIGN(LEN) TYPEALIGN(MAXIMUM_ALIGNOF, (LEN)) -#define MAXALIGN_DOWN(LEN) TYPEALIGN_DOWN(MAXIMUM_ALIGNOF, (LEN)) -#endif /* __CUDACC__ */ - -/* wider alignment */ -#define STROMALIGN_LEN 16 -#define STROMALIGN(LEN) TYPEALIGN(STROMALIGN_LEN,(LEN)) -#define STROMALIGN_DOWN(LEN) TYPEALIGN_DOWN(STROMALIGN_LEN,(LEN)) - -#define GPUMEMALIGN_LEN 1024 -#define GPUMEMALIGN(LEN) TYPEALIGN(GPUMEMALIGN_LEN,(LEN)) -#define GPUMEMALIGN_DOWN(LEN) TYPEALIGN_DOWN(GPUMEMALIGN_LEN,(LEN)) - -#define BLCKALIGN(LEN) TYPEALIGN(BLCKSZ,(LEN)) -#define BLCKALIGN_DOWN(LEN) TYPEALIGN_DOWN(BLCKSZ,(LEN)) - -#ifdef __CUDACC__ -/* - * MEMO: We takes dynamic local memory using cl_ulong data-type because of - * alignment problem. The nvidia's driver adjust alignment of local memory - * according to the data type; 1byte for cl_char, 4bytes for cl_uint and - * so on. Unexpectedly, void * pointer has 1byte alignment even if it is - * expected to be casted another data types. - * A pragma option __attribute__((aligned)) didn't work at least driver - * version 340.xx. So, we declared the local_workmem as cl_ulong * pointer - * as a workaround. - */ -#define SHARED_WORKMEM(TYPE) ((TYPE *) __pgstrom_dynamic_shared_workmem) -extern __shared__ cl_ulong __pgstrom_dynamic_shared_workmem[]; - -/* - * Thread index like OpenCL style. 
- * - * Be careful to use this convenient alias if grid/block size may become - * larger than INT_MAX, because threadIdx and blockDim are declared as - * 32bit integer, thus, it makes overflow during intermediation results - * if it is larger than INT_MAX. + * Thread index at CUDA C */ #define get_group_id() (blockIdx.x) #define get_num_groups() (gridDim.x) @@ -285,1553 +27,248 @@ extern __shared__ cl_ulong __pgstrom_dynamic_shared_workmem[]; #define get_local_size() (blockDim.x) #define get_global_id() (threadIdx.x + blockIdx.x * blockDim.x) #define get_global_size() (blockDim.x * gridDim.x) -#define get_global_base() (blockIdx.x * blockDim.x) -#define get_warp_id() (threadIdx.x / warpSize) -#define get_lane_id() (threadIdx.x & (warpSize-1)) -#else /* __CUDACC__ */ -typedef cl_ulong hostptr_t; -#endif /* !__CUDACC__ */ -/* - * Template of static function declarations - * - * CUDA compilar raises warning if static functions are not used, but - * we can restain this message with"unused" attribute of function/values. - * STATIC_INLINE / STATIC_FUNCTION packs common attributes to be - * assigned on host/device functions - */ -#define MAXTHREADS_PER_BLOCK 1024 -#define MAXWARPS_PER_BLOCK (MAXTHREADS_PER_BLOCK / 32) -#ifdef __CUDACC__ -#define STATIC_INLINE(RET_TYPE) \ - __device__ __host__ __forceinline__ \ - static RET_TYPE __attribute__ ((unused)) -#define STATIC_FUNCTION(RET_TYPE) \ - __device__ __host__ \ - static RET_TYPE -#define DEVICE_INLINE(RET_TYPE) \ - __device__ __forceinline__ \ - static RET_TYPE __attribute__ ((unused)) -#define DEVICE_FUNCTION(RET_TYPE) \ - __device__ RET_TYPE __attribute__ ((unused)) -#define PUBLIC_FUNCTION(RET_TYPE) \ - __device__ __host__ RET_TYPE -#define KERNEL_FUNCTION(RET_TYPE) \ - extern "C" __global__ RET_TYPE -#else /* __CUDACC__ */ -#define STATIC_INLINE(RET_TYPE) static inline RET_TYPE -#define STATIC_FUNCTION(RET_TYPE) static inline RET_TYPE -#define PUBLIC_FUNCTION(RET_TYPE) RET_TYPE -#endif /* !__CUDACC__ */ +/* Dynamic shared memory entrypoint */ +extern __shared__ char __pgstrom_dynamic_shared_workmem[] __MAXALIGNED__; +#define SHARED_WORKMEM(UNITSZ,INDEX) \ + (__pgstrom_dynamic_shared_workmem + (UNITSZ)*(INDEX)) -/* - * __Fetch - access macro regardless of memory alignment - */ -#ifdef __CUDA_ARCH__ -template -DEVICE_INLINE(T) -__Fetch(const T *ptr) +INLINE_FUNCTION(uint32_t) LaneId(void) { - T temp; - /* - * (2019/06/01) Originally, this function used direct pointer access - * using *ptr, if pointer is aligned. However, it looks NVCC/NVRTC - * optimization generates binary code that accesses unaligned pointer. - * '--device-debug' eliminates the strange behavior, and 'volatile' - * qualification also stop the behavior. - * Maybe, future version of CUDA and NVCC/NVRTC will fix the problem. - */ - memcpy(&temp, ptr, sizeof(T)); + uint32_t rv; - return temp; -} -#else /* __CUDA_ARCH__ */ -#define __Fetch(PTR) (*(PTR)) -#endif /* !__CUDA_ARCH__ */ + asm volatile("mov.u32 %0, %laneid;" : "=r"(rv) ); -#ifdef __CUDA_ARCH__ -template -DEVICE_INLINE(T) -__volatileRead(const volatile T *ptr) -{ - return *ptr; + return rv; } -template -DEVICE_INLINE(void) -__volatileWrite(volatile T *ptr, T val) +INLINE_FUNCTION(uint32_t) DynamicShmemSize(void) { - *ptr = val; -} -#endif - -/* - * Error code definition - * - * MEMO: SQL ERRCODE_* uses 0-29bits. We also use 30bit for a flag of - * CPU fallback. Host code tries CPU fallback if this flag is set and - * pg_strom.cpu_fallback_enabled is set. 
- */ -#ifndef MAKE_SQLSTATE -#define PGSIXBIT(ch) (((ch) - '0') & 0x3F) -#define MAKE_SQLSTATE(ch1,ch2,ch3,ch4,ch5) \ - (PGSIXBIT(ch1) + (PGSIXBIT(ch2) << 6) + (PGSIXBIT(ch3) << 12) + \ - (PGSIXBIT(ch4) << 18) + (PGSIXBIT(ch5) << 24)) -#endif /* MAKE_SQLSTATE */ -#include "utils/errcodes.h" -#define ERRCODE_FLAGS_CPU_FALLBACK (1U<<30) -#define ERRCODE_STROM_SUCCESS 0 -#define ERRCODE_STROM_DATASTORE_NOSPACE MAKE_SQLSTATE('H','D','B','0','4') -#define ERRCODE_STROM_WRONG_CODE_GENERATION MAKE_SQLSTATE('H','D','B','0','5') -#define ERRCODE_STROM_DATA_CORRUPTION MAKE_SQLSTATE('H','D','B','0','7') -#define ERRCODE_STROM_VARLENA_UNSUPPORTED MAKE_SQLSTATE('H','D','B','0','8') -#define ERRCODE_STROM_RECURSION_TOO_DEEP MAKE_SQLSTATE('H','D','B','0','9') + uint32_t rv; -#define KERN_ERRORBUF_FILENAME_LEN 24 -#define KERN_ERRORBUF_FUNCNAME_LEN 64 -#define KERN_ERRORBUF_MESSAGE_LEN 200 -typedef struct -{ - cl_int errcode; /* one of the ERRCODE_* */ - cl_int lineno; - char filename[KERN_ERRORBUF_FILENAME_LEN]; - char funcname[KERN_ERRORBUF_FUNCNAME_LEN]; - char message[KERN_ERRORBUF_MESSAGE_LEN]; -} kern_errorbuf; + asm volatile("mov.u32 %0, %dynamic_smem_size;" : "=r"(rv) ); -/* - * kern_context - a set of run-time information - */ -struct kern_parambuf; + return rv; +} -typedef struct +INLINE_FUNCTION(uint32_t) TotalShmemSize(void) { - cl_int errcode; - const char *error_filename; - cl_int error_lineno; - const char *error_funcname; - const char *error_message; /* !!only const static cstring!! */ - struct kern_parambuf *kparams; - void *stack_bounds; - cl_char *vlpos; - cl_char *vlend; - cl_char vlbuf[1]; -} kern_context; - -/* - * Usually, kern_context is declared at the auto-generated portion, - * then its pointer shall be passed to the pre-built GPU binary part. - * Its vlbuf length shall be determined on run-time compilation using - * the macro below. - */ -#define KERN_CONTEXT_VARLENA_BUFSZ_LIMIT 8192 -#ifdef __CUDACC_RTC__ -#define DECL_KERNEL_CONTEXT(NAME) \ - union { \ - kern_context kcxt; \ - char __dummy__[offsetof(kern_context, vlbuf) + \ - MAXALIGN(KERN_CONTEXT_VARLENA_BUFSZ)]; \ - } NAME -#endif /* __CUDACC_RTC__ */ - -#define INIT_KERNEL_CONTEXT(kcxt,__kparams) \ - do { \ - memset(kcxt, 0, offsetof(kern_context, vlbuf)); \ - (kcxt)->kparams = (__kparams); \ - assert((cl_ulong)(__kparams) == MAXALIGN(__kparams)); \ - (kcxt)->stack_bounds = (char *)(kcxt) - KERN_CONTEXT_STACK_LIMIT; \ - (kcxt)->vlpos = (kcxt)->vlbuf; \ - (kcxt)->vlend = (kcxt)->vlbuf + KERN_CONTEXT_VARLENA_BUFSZ; \ - } while(0) + uint32_t rv; -#define PTR_ON_VLBUF(kcxt,ptr,len) \ - ((char *)(ptr) >= (kcxt)->vlbuf && \ - (char *)(ptr) + (len) <= (kcxt)->vlend) + asm volatile("mov.u32 %0, %total_smem_size;" : "=r"(rv) ); -STATIC_INLINE(void *) -kern_context_alloc(kern_context *kcxt, size_t len) -{ - char *pos = (char *)MAXALIGN(kcxt->vlpos); - - if (pos >= kcxt->vlbuf && pos + len <= kcxt->vlend) - { - kcxt->vlpos = pos + len; - return pos; - } - return NULL; + return rv; } -#define CHECK_KERNEL_STACK_DEPTH(kcxt) \ - (((cl_ulong)((kcxt)->stack_bounds)) > ((cl_ulong)(&(kcxt)))) - -#ifdef __CUDA_ARCH__ -/* - * It sets an error code unless no significant error code is already set. - * Also, CpuReCheck has higher priority than RowFiltered because CpuReCheck - * implies device cannot run the given expression completely. 
- * (Usually, due to compressed or external varlena datum) - */ -STATIC_INLINE(void) -__STROM_EREPORT(kern_context *kcxt, cl_int errcode, - const char *filename, cl_int lineno, - const char *funcname, const char *message) +template +INLINE_FUNCTION(T) +__reduce_stair_add_sync(T value, T *p_total_sum = NULL) { - cl_int oldcode = kcxt->errcode; + uint32_t lane_id = LaneId(); + uint32_t mask; + T temp; - if (oldcode == ERRCODE_STROM_SUCCESS && - errcode != ERRCODE_STROM_SUCCESS) + assert(__activemask() == 0xffffffffU); + for (mask = 1; mask < warpSize; mask <<= 1) { - const char *pos; - - for (pos=filename; *pos != '\0'; pos++) - { - if (pos[0] == '/' && pos[1] != '\0') - filename = pos + 1; - } - if (!message) - message = "GPU kernel internal error"; - kcxt->errcode = errcode; - kcxt->error_filename = filename; - kcxt->error_lineno = lineno; - kcxt->error_funcname = funcname; - kcxt->error_message = message; + temp = __shfl_sync(__activemask(), value, (lane_id & ~mask) | (mask - 1)); + if (lane_id & mask) + value += temp; } + temp = __shfl_sync(__activemask(), value, warpSize - 1); + if (p_total_sum) + *p_total_sum = temp; + return value; } -#define STROM_ELOG(kcxt, message) \ - __STROM_EREPORT((kcxt),ERRCODE_INTERNAL_ERROR, \ - __FILE__,__LINE__,__FUNCTION__,(message)) -#define STROM_EREPORT(kcxt, errcode, message) \ - __STROM_EREPORT((kcxt),(errcode), \ - __FILE__,__LINE__,__FUNCTION__,(message)) -#define STROM_CPU_FALLBACK(kcxt, errcode, message) \ - __STROM_EREPORT((kcxt),(errcode) | ERRCODE_FLAGS_CPU_FALLBACK, \ - __FILE__,__LINE__,__FUNCTION__,(message)) - -STATIC_INLINE(void) -__strncpy(char *d, const char *s, cl_uint n) -{ - cl_uint i, m = n-1; - - for (i=0; i < m && s[i] != '\0'; i++) - d[i] = s[i]; - while (i < n) - d[i++] = '\0'; -} - -/* - * kern_writeback_error_status - */ -STATIC_INLINE(void) -kern_writeback_error_status(kern_errorbuf *result, kern_context *kcxt) +INLINE_FUNCTION(void) +STROM_WRITEBACK_ERROR_STATUS(kern_errorbuf *ebuf, kern_context *kcxt) { - /* - * It writes back a thread local error status only when the global - * error status is not set yet and the caller thread contains any - * error status. Elsewhere, we don't involves any atomic operation - * in the most of code path. 
-#define STROM_ELOG(kcxt, message)                       \
-    __STROM_EREPORT((kcxt),ERRCODE_INTERNAL_ERROR,      \
-                    __FILE__,__LINE__,__FUNCTION__,(message))
-#define STROM_EREPORT(kcxt, errcode, message)           \
-    __STROM_EREPORT((kcxt),(errcode),                   \
-                    __FILE__,__LINE__,__FUNCTION__,(message))
-#define STROM_CPU_FALLBACK(kcxt, errcode, message)      \
-    __STROM_EREPORT((kcxt),(errcode) | ERRCODE_FLAGS_CPU_FALLBACK, \
-                    __FILE__,__LINE__,__FUNCTION__,(message))
-
-STATIC_INLINE(void)
-__strncpy(char *d, const char *s, cl_uint n)
-{
-    cl_uint     i, m = n-1;
-
-    for (i=0; i < m && s[i] != '\0'; i++)
-        d[i] = s[i];
-    while (i < n)
-        d[i++] = '\0';
-}
-
-/*
- * kern_writeback_error_status
- */
-STATIC_INLINE(void)
-kern_writeback_error_status(kern_errorbuf *result, kern_context *kcxt)
+INLINE_FUNCTION(void)
+STROM_WRITEBACK_ERROR_STATUS(kern_errorbuf *ebuf, kern_context *kcxt)
 {
-    /*
-     * It writes back a thread local error status only when the global
-     * error status is not set yet and the caller thread contains any
-     * error status. Elsewhere, we don't involves any atomic operation
-     * in the most of code path.
-     */
     if (kcxt->errcode != ERRCODE_STROM_SUCCESS &&
-        atomicCAS(&result->errcode,
+        atomicCAS(&ebuf->errcode,
                   ERRCODE_STROM_SUCCESS,
                   kcxt->errcode) == ERRCODE_STROM_SUCCESS)
     {
-        result->errcode = kcxt->errcode;
-        result->lineno = kcxt->error_lineno;
-        __strncpy(result->filename,
+        ebuf->errcode = kcxt->errcode;
+        ebuf->lineno = kcxt->error_lineno;
+        __strncpy(ebuf->filename,
                   kcxt->error_filename, KERN_ERRORBUF_FILENAME_LEN);
-        __strncpy(result->funcname,
+        __strncpy(ebuf->funcname,
                   kcxt->error_funcname, KERN_ERRORBUF_FUNCNAME_LEN);
-        __strncpy(result->message,
+        __strncpy(ebuf->message,
                   kcxt->error_message, KERN_ERRORBUF_MESSAGE_LEN);
     }
 }
-#elif defined(__CUDACC__)
-#define STROM_EREPORT(kcxt, errcode, message)           \
-    do {                                                \
-        fprintf(stderr, "%s:%d %s (code=%d)\n",         \
-                __FUNCTION__, __LINE__,                 \
-                message, errcode);                      \
-        exit(1);                                        \
-    } while(0)
-#define STROM_CPU_FALLBACK(a,b,c)   STROM_EREPORT((a),(b),(c))
-#else /* !__CUDA_ARCH__ && !__CUDACC__ == gcc by pg_config */
-#define STROM_EREPORT(kcxt, errcode, message)           \
-    elog(ERROR, "%s:%d %s (code=%d)",                   \
-         __FUNCTION__, __LINE__,                        \
-         message, errcode)
-#define STROM_CPU_FALLBACK(a,b,c)   STROM_EREPORT((a),(b),(c))
-#endif /* !__CUDA_ARCH__ && !__CUDACC__ */
-
-#ifdef __CUDACC__
-/* definitions at storage/block.h */
-typedef cl_uint BlockNumber;
-#define InvalidBlockNumber      ((BlockNumber) 0xFFFFFFFF)
-#define MaxBlockNumber          ((BlockNumber) 0xFFFFFFFE)
-
-/* details are defined at cuda_gpuscan.h */
-struct PageHeaderData;
-
-/* definitions at access/htup_details.h */
-typedef struct {
-    struct {
-        cl_ushort   bi_hi;
-        cl_ushort   bi_lo;
-    } ip_blkid;
-    cl_ushort       ip_posid;
-} ItemPointerData;
-
-DEVICE_INLINE(cl_bool)
-ItemPointerEquals(ItemPointerData *ip1, ItemPointerData *ip2)
-{
-    return (ip1->ip_blkid.bi_hi == ip2->ip_blkid.bi_hi &&
-            ip1->ip_blkid.bi_lo == ip2->ip_blkid.bi_lo &&
-            ip1->ip_posid == ip2->ip_posid);
-}
-
-typedef struct HeapTupleFields
-{
-    cl_uint     t_xmin;     /* inserting xact ID */
-    cl_uint     t_xmax;     /* deleting or locking xact ID */
-    union
-    {
-        cl_uint t_cid;      /* inserting or deleting command ID, or both */
-        cl_uint t_xvac;     /* old-style VACUUM FULL xact ID */
-    } t_field3;
-} HeapTupleFields;
-
-typedef struct DatumTupleFields
-{
-    cl_int      datum_len_;     /* varlena header (do not touch directly!) */
-    cl_int      datum_typmod;   /* -1, or identifier of a record type */
-    cl_uint     datum_typeid;   /* composite type OID, or RECORDOID */
-} DatumTupleFields;
-
-typedef struct {
-    union {
-        HeapTupleFields t_heap;
-        DatumTupleFields t_datum;
-    } t_choice;
-
-    ItemPointerData t_ctid;     /* current TID of this or newer tuple */
-
-    cl_ushort   t_infomask2;    /* number of attributes + various flags */
-    cl_ushort   t_infomask;     /* various flag bits, see below */
-    cl_uchar    t_hoff;         /* sizeof header incl. bitmap, padding */
-    /* ^ - 23 bytes - ^ */
-    cl_uchar    t_bits[1];      /* bitmap of NULLs -- VARIABLE LENGTH */
-} HeapTupleHeaderData;
-
-#define att_isnull(ATT, BITS)   (!((BITS)[(ATT) >> 3] & (1 << ((ATT) & 0x07))))
-#define BITMAPLEN(NATTS)        (((int)(NATTS) + BITS_PER_BYTE - 1) / BITS_PER_BYTE)
-
-/*
- * information stored in t_infomask:
- */
-#define HEAP_HASNULL            0x0001  /* has null attribute(s) */
-#define HEAP_HASVARWIDTH        0x0002  /* has variable-width attribute(s) */
-#define HEAP_HASEXTERNAL        0x0004  /* has external stored attribute(s) */
-#define HEAP_HASOID             0x0008  /* has an object-id field */
-#define HEAP_XMAX_KEYSHR_LOCK   0x0010  /* xmax is a key-shared locker */
-#define HEAP_COMBOCID           0x0020  /* t_cid is a combo cid */
-#define HEAP_XMAX_EXCL_LOCK     0x0040  /* xmax is exclusive locker */
-#define HEAP_XMAX_LOCK_ONLY     0x0080  /* xmax, if valid, is only a locker */
-
-#define HEAP_XMIN_COMMITTED     0x0100  /* t_xmin committed */
-#define HEAP_XMIN_INVALID       0x0200  /* t_xmin invalid/aborted */
-#define HEAP_XMAX_COMMITTED     0x0400  /* t_xmax committed */
-#define HEAP_XMAX_INVALID       0x0800  /* t_xmax invalid/aborted */
-#define HEAP_XMAX_IS_MULTI      0x1000  /* t_xmax is a MultiXactId */
-#define HEAP_UPDATED            0x2000  /* this is UPDATEd version of row */
-#define HEAP_MOVED_OFF          0x4000  /* unused in GPU */
-#define HEAP_MOVED_IN           0x8000  /* unused in GPU */
-
-/*
- * information stored in t_infomask2:
- */
-#define HEAP_NATTS_MASK         0x07FF  /* 11 bits for number of attributes */
-#define HEAP_KEYS_UPDATED       0x2000  /* tuple was updated and key cols
-                                         * modified, or tuple deleted */
-#define HEAP_HOT_UPDATED        0x4000  /* tuple was HOT-updated */
-#define HEAP_ONLY_TUPLE         0x8000  /* this is heap-only tuple */
-#define HEAP2_XACT_MASK         0xE000  /* visibility-related bits */
-
-/*
- * Index tuple header structure
- *
- * All index tuples start with IndexTupleData. If the HasNulls bit is set,
- * this is followed by an IndexAttributeBitMapData. The index attribute
- * values follow, beginning at a MAXALIGN boundary.
- */
-typedef struct IndexTupleData
-{
-    ItemPointerData t_tid;      /* reference TID to heap tuple */
-
-    /* ---------------
-     * t_info is laid out in the following fashion:
-     *
-     * 15th (high) bit: has nulls
-     * 14th bit: has var-width attributes
-     * 13th bit: AM-defined meaning
-     * 12-0 bit: size of tuple
-     * ---------------
-     */
-    unsigned short  t_info;
-
-    char            data[1];    /* data or IndexAttributeBitMapData */
-} IndexTupleData;
-
-typedef struct IndexAttributeBitMapData
-{
-    cl_uchar    bits[(INDEX_MAX_KEYS + 8 - 1) / 8];
-} IndexAttributeBitMapData;
-
-#define INDEX_SIZE_MASK     0x1fff
-#define INDEX_VAR_MASK      0x4000
-#define INDEX_NULL_MASK     0x8000
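The `att_isnull()` / `BITMAPLEN()` pair above encodes NULLs one bit per attribute, LSB first, where a zero bit means the attribute is NULL (PostgreSQL's t_bits convention). A small host-side sketch of the same arithmetic; the sample bitmap values are made up for illustration:

#include <assert.h>
#include <stdio.h>

#define BITS_PER_BYTE          8
#define ATT_ISNULL(att, bits)  (!((bits)[(att) >> 3] & (1 << ((att) & 0x07))))
#define BITMAPLEN(natts)       (((int)(natts) + BITS_PER_BYTE - 1) / BITS_PER_BYTE)

int main(void)
{
    /* 10 attributes need 2 bitmap bytes; attributes 0, 3 and 9 are NOT NULL */
    unsigned char t_bits[2] = { 0x09, 0x02 };   /* bits 0,3 set; bit 9 set */

    assert(BITMAPLEN(10) == 2);
    for (int att = 0; att < 10; att++)
        printf("attr %d: %s\n", att,
               ATT_ISNULL(att, t_bits) ? "NULL" : "not null");
    return 0;
}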
-
-/*
- * Below is routines to support KDS_FORMAT_BLOCKS - This KDS format is used
- * to load raw PostgreSQL heap blocks to GPU without modification by CPU.
- * All CPU has to pay attention is, not to load rows which should not be
- * visible to the current scan snapshot.
- */
-typedef cl_uint TransactionId;
-#define InvalidTransactionId    ((TransactionId) 0)
-#define FrozenTransactionId     ((TransactionId) 2)
-#define InvalidCommandId        (~0U)
-#else
-#include "access/htup_details.h"
-#endif /* __CUDACC__ */
-
-typedef struct
-{
-    cl_int      vl_len_;
-    cl_int      ndim;           /* always 1 for xidvector */
-    cl_int      dataoffset;     /* always 0 for xidvector */
-    cl_uint     elemtype;       /* XIDOID */
-    cl_int      dim1;           /* number of items */
-    cl_int      lbound1;        /* always 1 for xidvector */
-    TransactionId values[FLEXIBLE_ARRAY_MEMBER];
-} xidvector;
-
-#ifdef __CUDACC__
-/* definitions at storage/itemid.h */
-typedef struct ItemIdData
-{
-    unsigned    lp_off:15,      /* offset to tuple (from start of page) */
-                lp_flags:2,     /* state of item pointer, see below */
-                lp_len:15;      /* byte length of tuple */
-} ItemIdData;
-
-#define LP_UNUSED       0       /* unused (should always have lp_len=0) */
-#define LP_NORMAL       1       /* used (should always have lp_len>0) */
-#define LP_REDIRECT     2       /* HOT redirect (should have lp_len=0) */
-#define LP_DEAD         3       /* dead, may or may not have storage */
-
-#define ItemIdGetOffset(itemId)     ((itemId)->lp_off)
-#define ItemIdGetLength(itemId)     ((itemId)->lp_len)
-#define ItemIdIsUsed(itemId)        ((itemId)->lp_flags != LP_UNUSED)
-#define ItemIdIsNormal(itemId)      ((itemId)->lp_flags == LP_NORMAL)
-#define ItemIdIsRedirected(itemId)  ((itemId)->lp_flags == LP_REDIRECT)
-#define ItemIdIsDead(itemId)        ((itemId)->lp_flags == LP_DEAD)
-#define ItemIdHasStorage(itemId)    ((itemId)->lp_len != 0)
-#define ItemIdSetUnused(itemId)         \
-    do {                                \
-        (itemId)->lp_flags = LP_UNUSED; \
-        (itemId)->lp_off = 0;           \
-        (itemId)->lp_len = 0;           \
-    } while(0)
-
-/* definitions at storage/off.h */
-typedef cl_ushort OffsetNumber;
-
-#define InvalidOffsetNumber     ((OffsetNumber) 0)
-#define FirstOffsetNumber       ((OffsetNumber) 1)
-#define MaxOffsetNumber         ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData)))
-#define OffsetNumberMask        (0xffff)    /* valid uint16 bits */
-
-#define OffsetNumberNext(offsetNumber)  \
-    ((OffsetNumber) (1 + (offsetNumber)))
-
-/* definitions at storage/bufpage.h */
-typedef cl_ushort LocationIndex;
-
-typedef struct PageHeaderData
-{
-#if 0
-    /*
-     * NOTE: device code (ab-)uses this field to track parent block/item
-     * when GiST index is loaded. Without this hack, hard to implement
-     * depth-first search at GpuJoin.
-     */
-    PageXLogRecPtr pd_lsn;      /* LSN: next byte after last byte of xlog
-                                 * record for last change to this page */
-#else
-    cl_uint     pd_parent_blkno;
-    cl_uint     pd_parent_item;
-#endif
-    cl_ushort   pd_checksum;    /* checksum */
-    cl_ushort   pd_flags;       /* flag bits, see below */
-    LocationIndex pd_lower;     /* offset to start of free space */
-    LocationIndex pd_upper;     /* offset to end of free space */
-    LocationIndex pd_special;   /* offset to start of special space */
-    cl_ushort   pd_pagesize_version;
-    TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
-    ItemIdData  pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */
-} PageHeaderData;
-
-#define SizeOfPageHeaderData    (offsetof(PageHeaderData, pd_linp))
-
-#define PD_HAS_FREE_LINES   0x0001  /* are there any unused line pointers? */
-#define PD_PAGE_FULL        0x0002  /* not enough free space for new tuple? */
-#define PD_ALL_VISIBLE      0x0004  /* all tuples on page are visible to
-                                     * everyone */
-#define PD_VALID_FLAG_BITS  0x0007  /* OR of all valid pd_flags bits */
-
-#define PageGetItemId(page, offsetNumber)               \
-    (&((PageHeaderData *)(page))->pd_linp[(offsetNumber) - 1])
-#define PageGetItem(page, lpp)                          \
-    ((HeapTupleHeaderData *)((char *)(page) + ItemIdGetOffset(lpp)))
-STATIC_INLINE(cl_uint)
-PageGetMaxOffsetNumber(PageHeaderData *page)
-{
-    cl_uint     pd_lower = page->pd_lower;
-
-    return (pd_lower <= SizeOfPageHeaderData ? 0 :
-            (pd_lower - SizeOfPageHeaderData) / sizeof(ItemIdData));
-}
-
-/*
- * GiST index specific structures and labels
- */
-#define F_LEAF              (1 << 0)    /* leaf page */
-#define F_DELETED           (1 << 1)    /* the page has been deleted */
-#define F_TUPLES_DELETED    (1 << 2)    /* some tuples on the page were deleted */
-#define F_FOLLOW_RIGHT      (1 << 3)    /* page to the right has no downlink */
-#define F_HAS_GARBAGE       (1 << 4)    /* some tuples on the page are dead */
-
-#define GIST_PAGE_ID        0xFF81
-
-typedef struct GISTPageOpaqueData
-{
-    struct {
-        cl_uint     xlogid;
-        cl_uint     xrecoff;
-    } nsn;
-    BlockNumber rightlink;      /* next page if any */
-    cl_ushort   flags;          /* see bit definitions above */
-    cl_ushort   gist_page_id;   /* for identification of GiST indexes */
-} GISTPageOpaqueData;
-
-STATIC_INLINE(GISTPageOpaqueData *)
-GistPageGetOpaque(PageHeaderData *page)
-{
-    return (GISTPageOpaqueData *)((char *)page + page->pd_special);
-}
-
-STATIC_INLINE(cl_bool)
-GistPageIsLeaf(PageHeaderData *page)
-{
-    return (GistPageGetOpaque(page)->flags & F_LEAF) != 0;
-}
-
-STATIC_INLINE(cl_bool)
-GistPageIsDeleted(PageHeaderData *page)
-{
-    return (GistPageGetOpaque(page)->flags & F_DELETED) != 0;
-}
-
-STATIC_INLINE(cl_bool)
-GistFollowRight(PageHeaderData *page)
-{
-    return (GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT) != 0;
-}
-
-/* root page of a gist index */
-#define GIST_ROOT_BLKNO     0
-
 #endif /* __CUDACC__ */
 
-/*
- * kern_data_store
- */
-#include "arrow_defs.h"
-
-#define TYPE_KIND__NULL         'n'     /* unreferenced column */
-#define TYPE_KIND__BASE         'b'
-#define TYPE_KIND__ARRAY        'a'
-#define TYPE_KIND__COMPOSITE    'c'
-#define TYPE_KIND__DOMAIN       'd'
-#define TYPE_KIND__ENUM         'e'
-#define TYPE_KIND__PSEUDO       'p'
-#define TYPE_KIND__RANGE        'r'
-
-struct kern_colmeta {
-    /* true, if column is held by value. Elsewhere, a reference */
-    cl_char     attbyval;
-    /* alignment; 1,2,4 or 8, not characters in pg_attribute */
-    cl_char     attalign;
-    /* length of attribute */
-    cl_short    attlen;
-    /* attribute number */
-    cl_short    attnum;
-    /* offset of attribute location, if deterministic */
-    cl_short    attcacheoff;
-    /* oid of the SQL data type */
-    cl_uint     atttypid;
-    /* typmod of the SQL data type */
-    cl_int      atttypmod;
-    /* one of TYPE_KIND__* */
-    cl_char     atttypkind;
-    /*
-     * (for array and composite types)
-     * Some of types contain sub-fields like array or composite type.
-     * We carry type definition information (kern_colmeta) using the
-     * kds->colmeta[] array next to the top-level fields.
-     * An array type has relevant element type. So, its @num_subattrs
-     * is always 1, and kds->colmeta[@idx_subattrs] informs properties
-     * of the element type.
-     * A composite type has several fields.
-     * kds->colmeta[@idx_subattrs ... @idx_subattrs + @num_subattrs -1]
-     * carries its sub-fields properties.
-     */
-    cl_ushort   idx_subattrs;
-    cl_ushort   num_subattrs;
-
-    /* column name */
-    NameData    attname;
-
-    /*
-     * (only arrow/column format)
-     * @attoptions keeps extra information of Apache Arrow type. Unlike
-     * PostgreSQL types, it can have variation of data accuracy in time
-     * related data types, or precision in decimal data type.
-     */
-    ArrowTypeOptions attopts;
-    cl_uint     nullmap_offset;
-    cl_uint     nullmap_length;
-    cl_uint     values_offset;
-    cl_uint     values_length;
-    cl_uint     extra_offset;
-    cl_uint     extra_length;
-};
-typedef struct kern_colmeta kern_colmeta;
-
-/*
- * kern_tupitem - individual items for KDS_FORMAT_ROW
- */
-struct kern_tupitem
-{
-    cl_uint     t_len;      /* length of tuple */
-    cl_uint     rowid;      /* unique Id of this item */
-    HeapTupleHeaderData htup;
-};
-typedef struct kern_tupitem kern_tupitem;
-
-/*
- * kern_hashitem - individual items for KDS_FORMAT_HASH
- */
-struct kern_hashitem
-{
-    cl_uint     hash;       /* 32-bit hash value */
-    cl_uint     next;       /* offset of the next (PACKED) */
-    kern_tupitem t;         /* HeapTuple of this entry */
-};
-typedef struct kern_hashitem kern_hashitem;
-
-#define KDS_FORMAT_ROW      1
-#define KDS_FORMAT_SLOT     2
-#define KDS_FORMAT_HASH     3   /* inner hash table for GpuHashJoin */
-#define KDS_FORMAT_BLOCK    4   /* raw blocks for direct loading */
-#define KDS_FORMAT_COLUMN   5   /* columnar based storage format */
-#define KDS_FORMAT_ARROW    6   /* apache arrow format */
-
-struct kern_data_store {
-    size_t      length;     /* length of this data-store */
-    /*
-     * NOTE: {nitems + usage} must be aligned to 64bit because these pair of
-     * values can be updated atomically using cmpxchg.
-     */
-    cl_uint     nitems;     /* number of rows in this store */
-    cl_uint     usage;      /* usage of this data-store (PACKED) */
-    cl_uint     nrooms;     /* number of available rows in this store */
-    cl_uint     ncols;      /* number of columns in this store */
-    cl_char     format;     /* one of KDS_FORMAT_* above */
-    cl_char     has_varlena; /* true, if any varlena attribute */
-    cl_char     tdhasoid;   /* copy of TupleDesc.tdhasoid */
-    cl_uint     tdtypeid;   /* copy of TupleDesc.tdtypeid */
-    cl_int      tdtypmod;   /* copy of TupleDesc.tdtypmod */
-    cl_uint     table_oid;  /* OID of the table (only if GpuScan) */
-    cl_uint     nslots;     /* width of hash-slot (only HASH format) */
-    cl_uint     nrows_per_block; /* average number of rows per
-                                  * PostgreSQL block (only BLOCK format) */
-    cl_uint     nr_colmeta; /* number of colmeta[] array elements;
-                             * maybe, >= ncols, if any composite types */
-    kern_colmeta colmeta[FLEXIBLE_ARRAY_MEMBER]; /* metadata of columns */
-};
-typedef struct kern_data_store kern_data_store;
-
-/*
- * kern_data_extra - extra buffer of KDS_FORMAT_COLUMN
- */
-struct kern_data_extra
-{
-    cl_ulong    length;
-    cl_ulong    usage;
-    char        data[FLEXIBLE_ARRAY_MEMBER];
-};
-typedef struct kern_data_extra kern_data_extra;
-
-/* attribute number of system columns */
-#ifndef SYSATTR_H
-#define SelfItemPointerAttributeNumber      (-1)
-#define ObjectIdAttributeNumber             (-2)
-#define MinTransactionIdAttributeNumber     (-3)
-#define MinCommandIdAttributeNumber         (-4)
-#define MaxTransactionIdAttributeNumber     (-5)
-#define MaxCommandIdAttributeNumber         (-6)
-#define TableOidAttributeNumber             (-7)
-#define FirstLowInvalidHeapAttributeNumber  (-8)
-#endif /* !SYSATTR_H */
-
-/*
- * MEMO: Support of 32GB KDS - KDS with row-, hash- and column-format
- * internally uses 32bit offset value from the head or base address.
- * We have assumption here - any objects pointed by the offset value
- * is always aligned to MAXIMUM_ALIGNOF boundary (64bit).
- * It means we can use 32bit offset to represent up to 32GB range (35bit).
- */
-STATIC_INLINE(cl_uint)
-__kds_packed(size_t offset)
-{
-    Assert((offset & ~(0xffffffffUL << MAXIMUM_ALIGNOF_SHIFT)) == 0);
-    return (cl_uint)(offset >> MAXIMUM_ALIGNOF_SHIFT);
-}
-
-STATIC_INLINE(size_t)
-__kds_unpack(cl_uint offset)
-{
-    return (size_t)offset << MAXIMUM_ALIGNOF_SHIFT;
-}
-#define KDS_OFFSET_MAX_SIZE     ((size_t)UINT_MAX << MAXIMUM_ALIGNOF_SHIFT)
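The MEMO above is the whole trick behind `__kds_packed()` / `__kds_unpack()`: because every pointed-to object is MAXALIGN'ed, the low three bits of an offset are always zero and can be shifted away, so a 32-bit word covers a 35-bit (32GB) range. A hedged, host-side sketch assuming MAXIMUM_ALIGNOF_SHIFT == 3 (8-byte MAXALIGN):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAXIMUM_ALIGNOF_SHIFT   3

static uint32_t kds_packed(size_t offset)
{
    /* both alignment (low 3 bits) and range (bits above 34) must be clean */
    assert((offset & ~(0xffffffffUL << MAXIMUM_ALIGNOF_SHIFT)) == 0);
    return (uint32_t)(offset >> MAXIMUM_ALIGNOF_SHIFT);
}

static size_t kds_unpack(uint32_t packed)
{
    return (size_t)packed << MAXIMUM_ALIGNOF_SHIFT;
}

int main(void)
{
    size_t offset = 24UL * 1024 * 1024 * 1024;      /* 24GB, beyond UINT32_MAX */

    assert(kds_unpack(kds_packed(offset)) == offset);
    printf("max addressable: %zu bytes\n", kds_unpack(UINT32_MAX) + 8);
    return 0;
}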
-
-/* 'nslots' estimation; 25% larger than nitems, but 128 at least */
-#define __KDS_NSLOTS(nitems)        \
-    Max(128, ((nitems) * 5) >> 2)
-/*
- * NOTE: For strict correctness, header portion of kern_data_store may
- * have larger number of colmeta[] items than 'ncols', if array or composite
- * types are in the field definition.
- * However, it is relatively rare, and 'ncols' == 'nr_colmeta' in most cases.
- * The macros below are used for just cost estimation; no need to be strict
- * connect for size estimatino.
- */
-// use KDS_calculateHeadSize() instead
-#define KDS_ESTIMATE_HEAD_LENGTH(ncols)                 \
-    STROMALIGN(offsetof(kern_data_store, colmeta[(ncols)]))
-#define KDS_ESTIMATE_ROW_LENGTH(ncols,nitems,htup_sz)           \
-    (KDS_ESTIMATE_HEAD_LENGTH(ncols) +                          \
-     STROMALIGN(sizeof(cl_uint) * (nitems)) +                   \
-     STROMALIGN(MAXALIGN(offsetof(kern_tupitem,                 \
-                                  htup) + htup_sz) * (nitems)))
-#define KDS_ESTIMATE_HASH_LENGTH(ncols,nitems,htup_sz)          \
-    (KDS_ESTIMATE_HEAD_LENGTH(ncols) +                          \
-     STROMALIGN(sizeof(cl_uint) * (nitems)) +                   \
-     STROMALIGN(sizeof(cl_uint) * __KDS_NSLOTS(nitems)) +       \
-     STROMALIGN(MAXALIGN(offsetof(kern_hashitem,                \
-                                  t.htup) + htup_sz) * (nitems)))
-
-/* Length of the header postion of kern_data_store */
-STATIC_INLINE(size_t)
-KERN_DATA_STORE_HEAD_LENGTH(kern_data_store *kds)
-{
-    return STROMALIGN(offsetof(kern_data_store,
-                               colmeta[kds->nr_colmeta]));
-}
-/* Base address of the data body */
-STATIC_INLINE(char *)
-KERN_DATA_STORE_BODY(kern_data_store *kds)
-{
-    return (char *)kds + KERN_DATA_STORE_HEAD_LENGTH(kds);
-}
-
-/* access function for row- and hash-format */
-STATIC_INLINE(cl_uint *)
-KERN_DATA_STORE_ROWINDEX(kern_data_store *kds)
-{
-    Assert(kds->format == KDS_FORMAT_ROW ||
-           kds->format == KDS_FORMAT_HASH);
-    return (cl_uint *)KERN_DATA_STORE_BODY(kds);
-}
-
-/* access function for hash-format */
-STATIC_INLINE(cl_uint *)
-KERN_DATA_STORE_HASHSLOT(kern_data_store *kds)
-{
-    Assert(kds->format == KDS_FORMAT_HASH);
-    return (cl_uint *)(KERN_DATA_STORE_BODY(kds) +
-                       STROMALIGN(sizeof(cl_uint) * kds->nrooms));
-}
-
-/* access function for row- and hash-format */
-STATIC_INLINE(kern_tupitem *)
-KERN_DATA_STORE_TUPITEM(kern_data_store *kds, cl_uint kds_index)
-{
-    size_t      offset = KERN_DATA_STORE_ROWINDEX(kds)[kds_index];
-
-    if (!offset)
-        return NULL;
-    return (kern_tupitem *)((char *)kds + __kds_unpack(offset));
-}
-
-/* access macro for row-format by tup-offset */
-STATIC_INLINE(HeapTupleHeaderData *)
-KDS_ROW_REF_HTUP(kern_data_store *kds,
-                 cl_uint tup_offset,
-                 ItemPointerData *p_self,
-                 cl_uint *p_len)
-{
-    kern_tupitem   *tupitem;
-
-    Assert(kds->format == KDS_FORMAT_ROW ||
-           kds->format == KDS_FORMAT_HASH);
-    if (tup_offset == 0)
-        return NULL;
-    tupitem = (kern_tupitem *)((char *)(kds)
-                               + __kds_unpack(tup_offset)
-                               - offsetof(kern_tupitem, htup));
-    if (p_self)
-        *p_self = tupitem->htup.t_ctid;
-    if (p_len)
-        *p_len = tupitem->t_len;
-    return &tupitem->htup;
-}
-
-STATIC_INLINE(kern_hashitem *)
-KERN_HASH_FIRST_ITEM(kern_data_store *kds, cl_uint hash)
-{
-    cl_uint    *slot = KERN_DATA_STORE_HASHSLOT(kds);
-    size_t      offset = __kds_unpack(slot[hash % kds->nslots]);
-
-    if (offset == 0)
-        return NULL;
-    Assert(offset < kds->length);
-    return (kern_hashitem *)((char *)kds + offset);
-}
-
-STATIC_INLINE(kern_hashitem *)
-KERN_HASH_NEXT_ITEM(kern_data_store *kds, kern_hashitem *khitem)
-{
-    size_t      offset;
-
-    if (!khitem || khitem->next == 0)
-        return NULL;
-    offset = __kds_unpack(khitem->next);
-    Assert(offset < kds->length);
-    return (kern_hashitem *)((char *)kds + offset);
-}
-
-/* access macro for tuple-slot format */
-STATIC_INLINE(size_t)
-KERN_DATA_STORE_SLOT_LENGTH(kern_data_store *kds, cl_uint nitems)
-{
-    size_t  headsz = KERN_DATA_STORE_HEAD_LENGTH(kds);
-    size_t  unitsz = LONGALIGN((sizeof(Datum) + sizeof(char)) * kds->ncols);
-
-    return headsz + unitsz * nitems;
-}
-
-STATIC_INLINE(Datum *)
-KERN_DATA_STORE_VALUES(kern_data_store *kds, cl_uint row_index)
-{
-    size_t  offset = KERN_DATA_STORE_SLOT_LENGTH(kds, row_index);
-
-    return (Datum *)((char *)kds + offset);
-}
-
-STATIC_INLINE(cl_char *)
-KERN_DATA_STORE_DCLASS(kern_data_store *kds, cl_uint row_index)
-{
-    Datum  *values = KERN_DATA_STORE_VALUES(kds, row_index);
-
-    return (cl_char *)(values + kds->ncols);
-}
-
-/* access macro for block format */
-#define KERN_DATA_STORE_PARTSZ(kds)             \
-    Min(((kds)->nrows_per_block +               \
-         warpSize - 1) & ~(warpSize - 1),       \
-        get_local_size())
-#define KERN_DATA_STORE_BLOCK_BLCKNR(kds,kds_index)         \
-    (((BlockNumber *)KERN_DATA_STORE_BODY(kds))[kds_index])
-#define KERN_DATA_STORE_BLOCK_PGPAGE(kds,kds_index)         \
-    ((struct PageHeaderData *)                              \
-     (KERN_DATA_STORE_BODY(kds) +                           \
-      STROMALIGN(sizeof(BlockNumber) * (kds)->nrooms) +     \
-      BLCKSZ * kds_index))
-
-/*
- * KDS_BLOCK_REF_HTUP
- *
- * It pulls a HeapTupleHeader by a pair of KDS and lp_offset;
- */
-STATIC_INLINE(HeapTupleHeaderData *)
-KDS_BLOCK_REF_HTUP(kern_data_store *kds,
-                   cl_uint lp_offset,
-                   ItemPointerData *p_self,
-                   cl_uint *p_len)
-{
-    /*
-     * NOTE: lp_offset is not packed offset!
-     * KDS_FORMAT_BLOCK will be never larger than 4GB.
-     */
-    ItemIdData     *lpp = (ItemIdData *)((char *)kds + lp_offset);
-    cl_uint         head_size;
-    cl_uint         block_id;
-    BlockNumber     block_nr;
-    PageHeaderData *pg_page;
-
-    Assert(kds->format == KDS_FORMAT_BLOCK);
-    if (lp_offset == 0)
-        return NULL;
-    head_size = (KERN_DATA_STORE_HEAD_LENGTH(kds) +
-                 STROMALIGN(sizeof(BlockNumber) * kds->nrooms));
-    Assert(lp_offset >= head_size &&
-           lp_offset < head_size + BLCKSZ * kds->nitems);
-    block_id = (lp_offset - head_size) / BLCKSZ;
-    block_nr = KERN_DATA_STORE_BLOCK_BLCKNR(kds, block_id);
-    pg_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds, block_id);
-
-    Assert(lpp >= pg_page->pd_linp &&
-           lpp - pg_page->pd_linp < PageGetMaxOffsetNumber(pg_page));
-    if (p_self)
-    {
-        p_self->ip_blkid.bi_hi = block_nr >> 16;
-        p_self->ip_blkid.bi_lo = block_nr & 0xffff;
-        p_self->ip_posid = lpp - pg_page->pd_linp;
-    }
-    if (p_len)
-        *p_len = ItemIdGetLength(lpp);
-    return (HeapTupleHeaderData *)PageGetItem(pg_page, lpp);
-}
-
-/* access functions for apache arrow format */
-STATIC_INLINE(void *)
-kern_fetch_simple_datum_arrow(kern_colmeta *cmeta,
-                              char *base,
-                              cl_uint index,
-                              cl_uint unitsz)
-{
-    cl_char    *nullmap = NULL;
-    cl_char    *values;
-
-    if (cmeta->nullmap_offset)
-    {
-        nullmap = base + __kds_unpack(cmeta->nullmap_offset);
-        if (att_isnull(index, nullmap))
-            return NULL;
-    }
-    Assert(cmeta->values_offset > 0);
-    Assert(cmeta->extra_offset == 0);
-    Assert(cmeta->extra_length == 0);
-    Assert(unitsz * (index+1) <= __kds_unpack(cmeta->values_length));
-    values = base + __kds_unpack(cmeta->values_offset);
-    return values + unitsz * index;
-}
-
-STATIC_INLINE(void *)
-kern_fetch_varlena_datum_arrow(kern_colmeta *cmeta,
-                               char *base,
-                               cl_uint index,
-                               cl_uint *p_length)
-{
-    cl_char    *nullmap;
-    cl_uint    *offset;
-    cl_char    *extra;
-
-    if (cmeta->nullmap_offset)
-    {
-        nullmap = base + __kds_unpack(cmeta->nullmap_offset);
-        if (att_isnull(index, nullmap))
-            return NULL;
-    }
-    Assert(cmeta->values_offset > 0 &&
-           cmeta->extra_offset > 0 &&
-           sizeof(cl_uint) * (index+1) <= __kds_unpack(cmeta->values_length));
-    offset = (cl_uint *)(base + __kds_unpack(cmeta->values_offset));
-    extra = base + __kds_unpack(cmeta->extra_offset);
-
-    Assert(offset[index] <= offset[index+1] &&
-           offset[index+1] <= __kds_unpack(cmeta->extra_length));
-    *p_length = offset[index+1] - offset[index];
-    return (extra + offset[index]);
-}
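`kern_fetch_varlena_datum_arrow()` above follows the standard Apache Arrow convention: the values buffer holds nitems+1 monotonic int32 offsets into the extra buffer, and item i spans [offset[i], offset[i+1]). A minimal host-side sketch with made-up sample data:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const char  extra[] = "heterodb";           /* "hetero" + "db" */
    uint32_t    offset[3] = { 0, 6, 8 };        /* 2 items need 3 offsets */

    for (int i = 0; i < 2; i++)
    {
        uint32_t len = offset[i+1] - offset[i];

        /* the same invariants the device code Assert()s */
        assert(offset[i] <= offset[i+1] &&
               offset[i+1] <= sizeof(extra) - 1);
        printf("item %d: %.*s (%u bytes)\n", i,
               (int)len, extra + offset[i], len);
    }
    return 0;
}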
-
-/*
- * kern_parambuf
- *
- * Const and Parameter buffer. It stores constant values during a particular
- * scan, so it may make sense if it is obvious length of kern_parambuf is
- * less than constant memory (NOTE: not implemented yet).
- */
-typedef struct kern_parambuf
-{
-    /*
-     * Fields of system information on execution
-     */
-    cl_long     xactStartTimestamp; /* timestamp when transaction start */
-    cl_uint     xactIdVector;       /* offset to xidvector */
-
-    /* variable length parameters / constants */
-    cl_uint     length;     /* total length of parambuf */
-    cl_uint     nparams;    /* number of parameters */
-    cl_uint     poffset[FLEXIBLE_ARRAY_MEMBER];     /* offset of params */
-} kern_parambuf;
-
-STATIC_INLINE(void *)
-kparam_get_value(kern_parambuf *kparams, cl_uint pindex)
-{
-    if (pindex >= kparams->nparams)
-        return NULL;
-    if (kparams->poffset[pindex] == 0)
-        return NULL;
-    return (char *)kparams + kparams->poffset[pindex];
-}
-
-STATIC_INLINE(cl_bool)
-pointer_on_kparams(void *ptr, kern_parambuf *kparams)
-{
-    return kparams && ((char *)ptr >= (char *)kparams &&
-                       (char *)ptr < (char *)kparams + kparams->length);
-}
-
-/*
- * PostgreSQL varlena related definitions
+/* ----------------------------------------------------------------
  *
- * Unlike host code, device code cannot touch external and/or compressed
- * toast datum. All the format device code can understand is usual
- * in-memory form; 4-bytes length is put on the head and contents follows.
- * So, it is a responsibility of host code to decompress the toast values
- * if device code may access compressed varlena.
- * In case when device code touches unsupported format, calculation result
- * shall be postponed to calculate on the host side.
+ * Definitions related to per-warp context
  *
- * Note that it is harmless to have external and/or compressed toast datam
- * unless it is NOT referenced in the device code. It can understand the
- * length of these values, unlike contents.
- */
-typedef struct varlena varlena;
-#ifndef POSTGRES_H
-struct varlena {
-    cl_char     vl_len_[4];     /* Do not touch this field directly! */
-    cl_char     vl_dat[1];
-};
-
-#define VARHDRSZ            ((int) sizeof(cl_int))
-#define VARDATA(PTR)        VARDATA_4B(PTR)
-#define VARSIZE(PTR)        VARSIZE_4B(PTR)
-#define VARSIZE_EXHDR(PTR)  (VARSIZE(PTR) - VARHDRSZ)
-
-#define VARSIZE_SHORT(PTR)  VARSIZE_1B(PTR)
-#define VARDATA_SHORT(PTR)  VARDATA_1B(PTR)
-
-typedef union
-{
-    struct      /* Normal varlena (4-byte length) */
-    {
-        cl_uint     va_header;
-        cl_char     va_data[1];
-    } va_4byte;
-    struct      /* Compressed-in-line format */
-    {
-        cl_uint     va_header;
-        cl_uint     va_rawsize; /* Original data size (excludes header) */
-        cl_char     va_data[1]; /* Compressed data */
-    } va_compressed;
-} varattrib_4b;
-
-typedef struct
-{
-    cl_uchar    va_header;
-    cl_char     va_data[1];     /* Data begins here */
-} varattrib_1b;
-
-/* inline portion of a short varlena pointing to an external resource */
-typedef struct
-{
-    cl_uchar    va_header;      /* Always 0x80 or 0x01 */
-    cl_uchar    va_tag;         /* Type of datum */
-    cl_char     va_data[1];     /* Data (of the type indicated by va_tag) */
-} varattrib_1b_e;
-
-typedef enum vartag_external
-{
-    VARTAG_INDIRECT = 1,
-    VARTAG_ONDISK = 18
-} vartag_external;
-
-#define VARHDRSZ_SHORT      offsetof(varattrib_1b, va_data)
-#define VARATT_SHORT_MAX    0x7F
-
-typedef struct varatt_external
-{
-    cl_int      va_rawsize;     /* Original data size (includes header) */
-    cl_int      va_extsize;     /* External saved size (doesn't) */
-    cl_int      va_valueid;     /* Unique ID of value within TOAST table */
-    cl_int      va_toastrelid;  /* RelID of TOAST table containing it */
-} varatt_external;
-
-typedef struct varatt_indirect
-{
-    hostptr_t   pointer;        /* Host pointer to in-memory varlena */
-} varatt_indirect;
-
-#define VARTAG_SIZE(tag)                                    \
-    ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) :   \
-     (tag) == VARTAG_ONDISK ? sizeof(varatt_external) :     \
-     0 /* should not happen */)
-
-#define VARHDRSZ_EXTERNAL       offsetof(varattrib_1b_e, va_data)
-#define VARTAG_EXTERNAL(PTR)    VARTAG_1B_E(PTR)
-#define VARSIZE_EXTERNAL(PTR)   \
-    (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR)))
-
-/*
- * compressed varlena format
- */
-typedef struct toast_compress_header
-{
-    cl_int      vl_len_;    /* varlena header (do not touch directly!) */
-    cl_int      rawsize;
-} toast_compress_header;
-
-#define TOAST_COMPRESS_HDRSZ        ((cl_int)sizeof(toast_compress_header))
-#define TOAST_COMPRESS_RAWSIZE(ptr)             \
-    (((toast_compress_header *) (ptr))->rawsize)
-#define TOAST_COMPRESS_RAWDATA(ptr)             \
-    (((char *) (ptr)) + TOAST_COMPRESS_HDRSZ)
-#define TOAST_COMPRESS_SET_RAWSIZE(ptr, len)    \
-    (((toast_compress_header *) (ptr))->rawsize = (len))
-
-/* basic varlena macros */
-#define VARATT_IS_4B(PTR) \
-    ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x00)
-#define VARATT_IS_4B_U(PTR) \
-    ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x00)
-#define VARATT_IS_4B_C(PTR) \
-    ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x02)
-#define VARATT_IS_1B(PTR) \
-    ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x01)
-#define VARATT_IS_1B_E(PTR) \
-    ((((varattrib_1b *) (PTR))->va_header) == 0x01)
-#define VARATT_IS_COMPRESSED(PTR)       VARATT_IS_4B_C(PTR)
-#define VARATT_IS_EXTERNAL(PTR)         VARATT_IS_1B_E(PTR)
-#define VARATT_IS_EXTERNAL_ONDISK(PTR)      \
-    (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK)
-#define VARATT_IS_EXTERNAL_INDIRECT(PTR)    \
-    (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT)
-#define VARATT_IS_SHORT(PTR)        VARATT_IS_1B(PTR)
-#define VARATT_IS_EXTENDED(PTR)     (!VARATT_IS_4B_U(PTR))
-#define VARATT_NOT_PAD_BYTE(PTR)    (*((cl_uchar *) (PTR)) != 0)
-
-#define VARSIZE_4B(PTR) \
-    ((__Fetch(&((varattrib_4b *)(PTR))->va_4byte.va_header)>>2) & 0x3FFFFFFF)
-#define VARSIZE_1B(PTR) \
-    ((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F)
-#define VARTAG_1B_E(PTR) \
-    (((varattrib_1b_e *) (PTR))->va_tag)
-
-#define VARRAWSIZE_4B_C(PTR)    \
-    __Fetch(&((varattrib_4b *) (PTR))->va_compressed.va_rawsize)
-
-#define VARSIZE_ANY_EXHDR(PTR) \
-    (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR)-VARHDRSZ_EXTERNAL : \
-     (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR)-VARHDRSZ_SHORT : \
-      VARSIZE_4B(PTR)-VARHDRSZ))
-
-#define VARSIZE_ANY(PTR)                            \
-    (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) :  \
-     (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) :         \
-      VARSIZE_4B(PTR)))
-
-#define VARDATA_4B(PTR)     (((varattrib_4b *) (PTR))->va_4byte.va_data)
-#define VARDATA_1B(PTR)     (((varattrib_1b *) (PTR))->va_data)
-#define VARDATA_ANY(PTR) \
-    (VARATT_IS_1B(PTR) ? VARDATA_1B(PTR) : VARDATA_4B(PTR))
-
-#define SET_VARSIZE(PTR, len) \
-    (((varattrib_4b *)(PTR))->va_4byte.va_header = (((cl_uint) (len)) << 2))
-#endif /* POSTGRES_H */
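As a companion to the macros above: on little-endian targets, a 4-byte varlena header stores the total length in bits 2..31 (low two bits 00, or 10 when compressed), while a short 1-byte header stores it in bits 1..7 with the low bit set. A hedged sketch of exactly that bit arithmetic; the sample payload lengths are illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t hdr4 = (uint32_t)(4 + 5) << 2;     /* SET_VARSIZE: 4B hdr + "hello" */
    uint8_t  hdr1 = ((1 + 5) << 1) | 1;         /* short form: 1B hdr + "hello" */

    assert((hdr4 & 0x01) == 0x00);              /* VARATT_IS_4B */
    assert((hdr1 & 0x01) == 0x01);              /* VARATT_IS_1B */
    printf("4B: total=%u payload=%u\n",
           (hdr4 >> 2) & 0x3FFFFFFF, ((hdr4 >> 2) & 0x3FFFFFFF) - 4);
    printf("1B: total=%u payload=%u\n",
           (hdr1 >> 1) & 0x7F, ((hdr1 >> 1) & 0x7F) - 1);
    return 0;
}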
-
-#ifndef ARRAY_H
-/*
- * Definitions of array internal of PostgreSQL
+ * ----------------------------------------------------------------
  */
+#define UNIT_TUPLES_PER_DEPTH       (2 * WARPSIZE)
 typedef struct
 {
+    uint32_t    smx_row_count;  /* just for suspend/resume */
+    uint32_t    __nrels__deprecated; /* number of inner relations, if JOIN */
+    int         depth;          /* 'depth' when suspended */
+    int         scan_done;      /* smallest depth that may produce more tuples */
+    /* only KDS_FORMAT_BLOCK */
+    uint32_t    block_id;       /* BLOCK format needs to keep htuples on the */
+    uint32_t    lp_count;       /* lp_items array once, to pull maximum GPU */
+    uint32_t    lp_wr_pos;      /* utilization by simultaneous execution of */
+    uint32_t    lp_rd_pos;      /* the kern_scan_quals. */
+    uint32_t    lp_items[UNIT_TUPLES_PER_DEPTH];
+    /* read/write_pos of the combination buffer for each depth */
+    struct {
+        uint32_t    read;       /* read_pos of depth=X */
+        uint32_t    write;      /* write_pos of depth=X */
+    } pos[1];       /* variable length */
     /*
-     * NOTE: We assume 4bytes varlena header for array type. It allows
-     * aligned references to the array elements. Unlike CPU side, we
-     * cannot have extra malloc to ensure 4bytes varlena header. It is
-     * the reason why our ScalarArrayOp implementation does not support
-     * array data type referenced by Var node; which is potentially has
-     * short format.
+     * <----- __KERN_WARP_CONTEXT_BASESZ ----->
+     * Above fields are always kept in the device shared memory.
+     *
+     * +-------------------------------------------------------------+------
+     * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-0) |
+     * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-1) |
+     * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-2) | depth=0
+     * |      :                     :                     :          |
+     * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-63)|
+     * +-------------------------------------------------------------+------
+     *        :                     :                     :
+     * +-------------------------------------------------------------+------
+     * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-0) |
+     * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-1) |
+     * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-2) | depth=nrels
+     * |      :                     :                     :          |
+     * | kvars_slot[nslots] + kvars_class[nslots] + extra_sz (pos-63)|
+     * +-------------------------------------------------------------+------
      */
-    cl_uint     vl_len_;        /* don't touch this field */
-    cl_int      ndim;           /* # of dimensions */
-    cl_int      dataoffset;     /* offset to data, or 0 if no bitmap */
-    cl_uint     elemtype;       /* element type OID */
-} ArrayType;
-
-typedef struct
-{
-    cl_int      ndim;           /* # of dimensions */
-    cl_int      dataoffset;     /* offset to data, or 0 if no bitmap */
-    cl_uint     elemtype;       /* element type OID */
-} ArrayTypeData;
-
-#define ARR_SIZE(a)         VARSIZE_ANY(a)
-#define ARR_BODY(a)         ((ArrayTypeData *)VARDATA_ANY(a))
-#define ARR_NDIM(a)         __Fetch(&ARR_BODY(a)->ndim)
-#define ARR_DATAOFFSET(a)   __Fetch(&ARR_BODY(a)->dataoffset)
-#define ARR_HASNULL(a)      (ARR_DATAOFFSET(a) != 0)
-#define ARR_ELEMTYPE(a)     __Fetch(&ARR_BODY(a)->elemtype)
-#define ARR_DIMS(a)                                     \
-    ((int *)((char *)VARDATA_ANY(a) + sizeof(ArrayTypeData)))
-#define ARR_LBOUND(a)       (ARR_DIMS(a) + ARR_NDIM(a))
-#define ARR_NULLBITMAP(a)                               \
-    (ARR_HASNULL(a) ? (char *)(ARR_DIMS(a) + 2 * ARR_NDIM(a)) : (char *)NULL)
-#define ARR_DATA_PTR(a)         \
-    ((char *)VARDATA_ANY(a) +   \
-     (ARR_HASNULL(a) ? (ARR_DATAOFFSET(a) - VARHDRSZ)   \
-      : (sizeof(ArrayTypeData) + 2 * sizeof(int) * ARR_NDIM(a))))
-
-/*
- * The total array header size (in bytes) for an array with the specified
- * number of dimensions and total number of items.
- * NOTE: This macro assume 4-bytes varlena header
- */
-#define ARR_OVERHEAD_NONULLS(ndims)                 \
-    MAXALIGN(sizeof(ArrayType) + 2 * sizeof(int) * (ndims))
-#define ARR_OVERHEAD_WITHNULLS(ndims, nitems)       \
-    MAXALIGN(sizeof(ArrayType) + 2 * sizeof(int) * (ndims) +    \
-             ((nitems) + 7) / 8)
-
-#endif /* ARRAY_H */
-
-/* ----------------------------------------------------------------
- *
- * About GPU Projection Support
- *
- * A typical projection code path is below:
- *
- * 1. Extract values from heap-tuple or column-store onto tup_dclass[] and
- *    tup_values[] array, and calculate length of the new heap-tuple.
- * 2. Allocation of the destination buffer, per threads-group
- * 3. Write out the heap-tuple
- *
- * Step-1 is usually handled by auto-generated code. In some case, it is not
- * reasonable to extract values to in-storage format prior to allocation of
- * the destination buffer, like a long text value that references a source
- * buffer in Apache Arrow.
- * Right now, we pay attention on simple varlena (Binary of Arrow that is
- * bytes in PG, and Utf8 of Arrow that is text in PG), and array of fixed-
- * length values (List of Arrow).
- * If tup_values[] hold a pointer to pg_varlena_t or pg_array_t, not raw-
- * varlena image, tup_dclass[] will have special flag to inform indirect
- * reference to the value.
- *
- * pg_XXXX_datum_ref() routine of types are responsible to transform disk
- * format to internal representation.
- * pg_XXXX_datum_store() routine of types are also responsible to transform
- * internal representation to disk format. We need to pay attention on
- * projection stage. If and when GPU code tries to store expressions which
- * are not simple Var, Const or Param, these internal representation must
- * be written to extra-buffer first.
- *
- * Also note that KDS_FORMAT_SLOT is designed to have compatible layout to
- * pair of tup_dclass[] / tup_values[] array if all the items have NULL or
- * NORMAL state. Other state should be normalized prior to CPU writeback.
- *
- * ----------------------------------------------------------------
- */
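The projection comment above boils down to a simple hand-off protocol: step-1 fills a per-column tup_dclass[]/tup_values[] pair, and the tuple writer consumes it. A minimal host-side sketch of that protocol using only the NORMAL/NULL classes (the column count and values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define DATUM_CLASS__NORMAL 0
#define DATUM_CLASS__NULL   1

int main(void)
{
    int8_t   tup_dclass[3];
    uint64_t tup_values[3];

    /* step-1: extract source values into the dclass/values pair */
    tup_dclass[0] = DATUM_CLASS__NORMAL; tup_values[0] = 42;
    tup_dclass[1] = DATUM_CLASS__NULL;   tup_values[1] = 0;   /* ignored */
    tup_dclass[2] = DATUM_CLASS__NORMAL; tup_values[2] = 1234;

    /* step-3: write out the tuple, skipping NULL columns */
    for (int j = 0; j < 3; j++)
    {
        if (tup_dclass[j] == DATUM_CLASS__NULL)
            printf("col %d: NULL\n", j);
        else
            printf("col %d: %llu\n", j, (unsigned long long)tup_values[j]);
    }
    return 0;
}

The other classes (VARLENA, ARRAY, COMPOSITE, GEOMETRY in the defines below) mark indirect references that must be flattened into the extra buffer before CPU writeback.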
-#define DATUM_CLASS__NORMAL     0   /* datum is normal value */
-#define DATUM_CLASS__NULL       1   /* datum is NULL */
-#define DATUM_CLASS__VARLENA    2   /* datum is pg_varlena_t reference */
-#define DATUM_CLASS__ARRAY      3   /* datum is pg_array_t reference */
-#define DATUM_CLASS__COMPOSITE  4   /* datum is pg_composite_t reference */
-#define DATUM_CLASS__GEOMETRY   5   /* datum is pg_geometry_t reference */
-
-/*
- * device functions in cuda_common.fatbin
- */
-#ifdef __CUDACC__
-/* lightweight hash */
-DEVICE_FUNCTION(cl_uint)
-pg_hash_any(const cl_uchar *k, cl_int keylen);
-/* little bit heavy, but more randomized SipHash */
-DEVICE_FUNCTION(cl_ulong)
-pg_siphash_any(const unsigned char *buf, const size_t len);
-#endif /* __CUDACC__ */
-
-/*
- * Macro to extract a heap-tuple
- *
- * usage:
- * char *addr;
- *
- * EXTRACT_HEAP_TUPLE_BEGIN(kds, htup, addr)
- *  -> addr shall point the device pointer of the first field, or NULL
- * EXTRACT_HEAP_TUPLE_NEXT(addr)
- *  -> addr shall point the device pointer of the second field, or NULL
- *     :
- * EXTRACT_HEAP_TUPLE_END()
- *
- * EXTRACT_HEAP_READ_XXXX()
- *  -> load raw values to dclass[]/values[], and update extras[]
- */
-#define EXTRACT_HEAP_TUPLE_BEGIN(KDS,HTUP,NATTRS)                   \
-    do {                                                            \
-        kern_colmeta   *__cmeta;                                    \
-        cl_int          __colidx;                                   \
-        cl_int          __ncols;                                    \
-        cl_uchar       *__nullmap = NULL;                           \
-        char           *__pos;                                      \
-        void           *addr;                                       \
-                                                                    \
-        if (!(HTUP))                                                \
-            __ncols = 0;    /* to be considered as NULL */          \
-        else                                                        \
-        {                                                           \
-            if (((HTUP)->t_infomask & HEAP_HASNULL) != 0)           \
-                __nullmap = (HTUP)->t_bits;                         \
-            __ncols = ((HTUP)->t_infomask2 & HEAP_NATTS_MASK);      \
-            __ncols = Min((KDS)->ncols, __ncols);                   \
-            __pos = (char *)(HTUP) + (HTUP)->t_hoff;                \
-            assert(__pos == (char *)MAXALIGN(__pos));               \
-        }                                                           \
-                                                                    \
-        for (__colidx=0; __colidx < (NATTRS); __colidx++)           \
-        {                                                           \
-            if (__colidx < __ncols &&                               \
-                (!__nullmap || !att_isnull(__colidx, __nullmap)))   \
-            {                                                       \
-                __cmeta = &((KDS)->colmeta[__colidx]);              \
-                if (__cmeta->attlen > 0)                            \
-                    __pos = (char *)TYPEALIGN(__cmeta->attalign, __pos); \
-                else if (!VARATT_NOT_PAD_BYTE(__pos))               \
-                    __pos = (char *)TYPEALIGN(__cmeta->attalign, __pos); \
-                addr = __pos;                                       \
-                __pos += (__cmeta->attlen > 0 ?                     \
-                          __cmeta->attlen :                         \
-                          VARSIZE_ANY(__pos));                      \
-            }                                                       \
-            else                                                    \
-                addr = NULL
-
-#define EXTRACT_HEAP_TUPLE_END()                                    \
-        }                                                           \
-    } while(0)
-
-#define EXTRACT_HEAP_READ_8BIT(ADDR,ATT_DCLASS,ATT_VALUES)      \
-    do {                                                        \
-        if (!(ADDR))                                            \
-            (ATT_DCLASS) = DATUM_CLASS__NULL;                   \
-        else                                                    \
-        {                                                       \
-            (ATT_DCLASS) = DATUM_CLASS__NORMAL;                 \
-            (ATT_VALUES) = *((cl_uchar *)(ADDR));               \
-        }                                                       \
-    } while(0)
-
-#define EXTRACT_HEAP_READ_16BIT(ADDR,ATT_DCLASS,ATT_VALUES)     \
-    do {                                                        \
-        if (!(ADDR))                                            \
-            (ATT_DCLASS) = DATUM_CLASS__NULL;                   \
-        else                                                    \
-        {                                                       \
-            (ATT_DCLASS) = DATUM_CLASS__NORMAL;                 \
-            (ATT_VALUES) = *((cl_ushort *)(ADDR));              \
-        }                                                       \
-    } while(0)
-
-#define EXTRACT_HEAP_READ_32BIT(ADDR,ATT_DCLASS,ATT_VALUES)     \
-    do {                                                        \
-        if (!(ADDR))                                            \
-            (ATT_DCLASS) = DATUM_CLASS__NULL;                   \
-        else                                                    \
-        {                                                       \
-            (ATT_DCLASS) = DATUM_CLASS__NORMAL;                 \
-            (ATT_VALUES) = *((cl_uint *)(ADDR));                \
-        }                                                       \
-    } while(0)
-
-#define EXTRACT_HEAP_READ_64BIT(ADDR,ATT_DCLASS,ATT_VALUES)     \
-    do {                                                        \
-        if (!(ADDR))                                            \
-            (ATT_DCLASS) = DATUM_CLASS__NULL;                   \
-        else                                                    \
-        {                                                       \
-            (ATT_DCLASS) = DATUM_CLASS__NORMAL;                 \
-            (ATT_VALUES) = *((cl_ulong *)(ADDR));               \
-        }                                                       \
-    } while(0)
-
-#define EXTRACT_HEAP_READ_POINTER(ADDR,ATT_DCLASS,ATT_VALUES)   \
-    do {                                                        \
-        if (!(ADDR))                                            \
-            (ATT_DCLASS) = DATUM_CLASS__NULL;                   \
-        else                                                    \
-        {                                                       \
-            (ATT_DCLASS) = DATUM_CLASS__NORMAL;                 \
-            (ATT_VALUES) = PointerGetDatum(ADDR);               \
-        }                                                       \
-    } while(0)
-
-/*
- * Similar macro to extract IndexTuple
- */
-#define EXTRACT_INDEX_TUPLE_BEGIN(ADDR,KDS,itup)                    \
-    do {                                                            \
-        const kern_colmeta *__cmeta = (KDS)->colmeta;               \
-        cl_uint     __ncols = (KDS)->ncols;                         \
-        cl_uint     __colidx = 0;                                   \
-        cl_uchar   *__nullmap = NULL;                               \
-        char       *__pos;                                          \
-                                                                    \
-        if (!(itup))                                                \
-            __ncols = 0;                                            \
-        else if (((itup)->t_info & INDEX_NULL_MASK) == 0)           \
-            __pos = itup->data;                                     \
-        else                                                        \
-        {                                                           \
-            __nullmap = (cl_uchar *)(itup)->data;                   \
-            __pos = (itup)->data + MAXALIGN(BITMAPLEN(__ncols));    \
-        }                                                           \
-        if (__colidx < __ncols &&                                   \
-            (!__nullmap || !att_isnull(__colidx, __nullmap)))       \
-        {                                                           \
-            (ADDR) = __pos;                                         \
-            __pos += (__cmeta->attlen > 0 ?                         \
-                      __cmeta->attlen :                             \
-                      VARSIZE_ANY(__pos));                          \
-        }                                                           \
-        else                                                        \
-            (ADDR) = NULL
-
-#define EXTRACT_INDEX_TUPLE_NEXT(ADDR,KDS)                          \
-        __colidx++;                                                 \
-        if (__colidx < __ncols &&                                   \
-            (!__nullmap || !att_isnull(__colidx, __nullmap)))       \
-        {                                                           \
-            __cmeta = &(KDS)->colmeta[__colidx];                    \
-                                                                    \
-            if (__cmeta->attlen > 0)                                \
-                __pos = (char *)TYPEALIGN(__cmeta->attalign, __pos);\
-            else if (!VARATT_NOT_PAD_BYTE(__pos))                   \
-                __pos = (char *)TYPEALIGN(__cmeta->attalign, __pos);\
-            (ADDR) = __pos;                                         \
-            __pos += (__cmeta->attlen > 0 ?                         \
-                      __cmeta->attlen :                             \
-                      VARSIZE_ANY(__pos));                          \
-        }                                                           \
-        else                                                        \
-            (ADDR) = NULL
-
-#define EXTRACT_INDEX_TUPLE_END()                                   \
-    } while(0)
-
-#ifdef __CUDACC__
-/*
- * device functions to decompress a toast datum
- */
-DEVICE_FUNCTION(size_t)
-toast_raw_datum_size(kern_context *kcxt, varlena *attr);
-DEVICE_FUNCTION(cl_int)
-pglz_decompress(const char *source, cl_int slen,
-                char *dest, cl_int rawsize);
-DEVICE_FUNCTION(cl_bool)
-toast_decompress_datum(char *buffer, cl_uint buflen,
-                       const varlena *datum);
-/*
- * device functions to reference a particular datum in a tuple
- */
-DEVICE_FUNCTION(void *)
-kern_get_datum_tuple(kern_colmeta *colmeta,
-                     HeapTupleHeaderData *htup,
-                     cl_uint colidx);
-DEVICE_FUNCTION(void *)
-kern_get_datum_column(kern_data_store *kds,
-                      kern_data_extra *extra,
-                      cl_uint colidx, cl_uint rowidx);
-DEVICE_FUNCTION(cl_bool)
-kern_check_visibility_column(kern_context *kcxt,
-                             kern_data_store *kds,
-                             cl_uint rowidx);
-/*
- * device functions to form/deform HeapTuple
- */
-DEVICE_FUNCTION(cl_uint)
-__compute_heaptuple_size(kern_context *kcxt,
-                         kern_colmeta *__cmeta,
-                         cl_bool heap_hasoid,
-                         cl_uint ncols,
-                         cl_char *tup_dclass,
-                         Datum *tup_values);
-DEVICE_FUNCTION(void)
-deform_kern_heaptuple(cl_int nattrs,
-                      kern_colmeta *tup_attrs,
-                      HeapTupleHeaderData *htup,
-                      cl_char *tup_dclass,
-                      Datum *tup_values);
-DEVICE_FUNCTION(cl_uint)
-__form_kern_heaptuple(kern_context *kcxt,
-                      void *buffer,         /* out */
-                      cl_int ncols,         /* in */
-                      kern_colmeta *colmeta,/* in */
-                      cl_uint comp_typeid,  /* in */
-                      cl_int comp_typmod,   /* in */
-                      ItemPointerData *tup_self,/* in */
-                      cl_char *tup_dclass,  /* in */
-                      Datum *tup_values);   /* in */
-/*
- * support function for KDS_FORMAT_SLOT
- */
-DEVICE_FUNCTION(cl_uint)
-kds_slot_compute_extra(kern_context *kcxt,
-                       kern_data_store *kds,
-                       cl_char *tup_dclass,
-                       Datum *tup_values);
-DEVICE_FUNCTION(void)
-kds_slot_store_values(kern_context *kcxt,
+} kern_warp_context;
+
+#define __KERN_WARP_CONTEXT_BASESZ(n_rels)          \
+    MAXALIGN(offsetof(kern_warp_context, pos[(n_rels)+1]))
+#define KERN_WARP_CONTEXT_UNITSZ(n_rels,nbytes)     \
+    (__KERN_WARP_CONTEXT_BASESZ(n_rels) +           \
+     (nbytes) * UNIT_TUPLES_PER_DEPTH * ((n_rels)+1))
+#define WARP_READ_POS(warp,depth)   ((warp)->pos[(depth)].read)
+#define WARP_WRITE_POS(warp,depth)  ((warp)->pos[(depth)].write)
+
+/*
+ * definitions related to generic device executor routines
+ */
+EXTERN_FUNCTION(int)
+execGpuScanLoadSource(kern_context *kcxt,
+                      kern_warp_context *wp,
+                      kern_data_store *kds_src,
+                      kern_data_extra *kds_extra,
+                      kern_expression *kexp_load_vars,
+                      kern_expression *kexp_scan_quals,
+                      char *kvars_addr_wp,
+                      uint32_t *p_smx_row_count);
+EXTERN_FUNCTION(int)
+execGpuJoinProjection(kern_context *kcxt,
+                      kern_warp_context *wp,
+                      int n_rels,
                       kern_data_store *kds_dst,
-                      cl_uint dst_index,
-                      char *dst_extra,
-                      cl_char *tup_dclass,
-                      Datum *tup_values);
-/*
- * Reduction Operations
+                      kern_expression *kexp_projection,
+                      char *kvars_addr_wp,
+                      bool *p_try_suspend);
+EXTERN_FUNCTION(int)
+execGpuPreAggGroupBy(kern_context *kcxt,
+                     kern_warp_context *wp,
+                     int n_rels,
+                     kern_data_store *kds_final,
+                     char *kvars_addr_wp,
+                     bool *p_try_suspend);
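To make the new `KERN_WARP_CONTEXT_UNITSZ()` arithmetic concrete: one warp context is the fixed shared-memory header (with `pos[]` sized for n_rels+1 depths) plus one kvars buffer of UNIT_TUPLES_PER_DEPTH entries per depth. A worked host-side sketch with a trimmed-down struct; WARPSIZE=32, n_rels=2 and nbytes=64 are assumed values for illustration, not taken from the patch (the real macro writes `offsetof(..., pos[(n_rels)+1])` directly, which GCC and Clang accept; the portable form is used here):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define WARPSIZE                32
#define UNIT_TUPLES_PER_DEPTH   (2 * WARPSIZE)
#define MAXALIGN(x)             (((uintptr_t)(x) + 7) & ~(uintptr_t)7)

typedef struct {
    uint32_t    smx_row_count;
    uint32_t    __nrels__deprecated;
    int         depth;
    int         scan_done;
    uint32_t    block_id;
    uint32_t    lp_count;
    uint32_t    lp_wr_pos;
    uint32_t    lp_rd_pos;
    uint32_t    lp_items[UNIT_TUPLES_PER_DEPTH];
    struct {
        uint32_t    read;
        uint32_t    write;
    } pos[1];       /* variable length: really pos[n_rels+1] */
} kern_warp_context;

int main(void)
{
    int    n_rels = 2;      /* two inner relations */
    size_t nbytes = 64;     /* kvars slot + class + extra, per tuple */
    size_t basesz = MAXALIGN(offsetof(kern_warp_context, pos) +
                             sizeof(((kern_warp_context *)0)->pos[0]) *
                             (n_rels + 1));
    /* header + one kvars buffer per depth (0..n_rels) */
    size_t unitsz = basesz + nbytes * UNIT_TUPLES_PER_DEPTH * (n_rels + 1);

    printf("basesz=%zu unitsz=%zu\n", basesz, unitsz);
    return 0;
}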
+/*
+ * Definitions related to GpuScan/GpuJoin/GpuPreAgg
  */
-DEVICE_FUNCTION(cl_uint)
-pgstromStairlikeSum(cl_uint my_value, cl_uint *total_sum);
-DEVICE_FUNCTION(cl_uint)
-pgstromStairlikeBinaryCount(int predicate, cl_uint *total_count);
-#endif /* __CUDACC__ */
-
-/* base type definitions and templates */
-#include "cuda_basetype.h"
-/* numeric functions support (must be here) */
-#include "cuda_numeric.h"
-/* text functions support (must be here) */
-#include "cuda_textlib.h"
-/* time functions support (must be here) */
-#include "cuda_timelib.h"
-/* static inline and c++ template functions */
-#include "cuda_utils.h"
+typedef struct {
+    kern_errorbuf   kerror;
+    uint32_t        grid_sz;
+    uint32_t        block_sz;
+    uint32_t        extra_sz;
+    uint32_t        kvars_nslots;   /* width of the kvars slot */
+    uint32_t        kvars_nbytes;   /* extra buffer size of kvars-slot */
+    uint32_t        n_rels;         /* >0, if JOIN is involved */
+    /* suspend/resume support */
+    bool            resume_context;
+    uint32_t        suspend_count;
+    /* kernel statistics */
+    uint32_t        nitems_raw;     /* nitems in the raw data chunk */
+    uint32_t        nitems_in;      /* nitems after the scan_quals */
+    uint32_t        nitems_out;     /* nitems of final results */
+    struct {
+        uint32_t    nitems_gist;    /* nitems picked up by GiST index */
+        uint32_t    nitems_out;     /* nitems after this depth */
+    } stats[1];
+    /*
+     * variable length fields
+     * +-----------------------------------+
+     * | kern_warp_context[0] for warp-0   |
+     * | kern_warp_context[1] for warp-1   |
+     * |     :         :          :        |
+     * | kern_warp_context[nwarps-1]       |
+     * +-----------------------------------+ -----
+     * | l_state[num_rels] for each thread | only if JOIN is involved
+     * +-----------------------------------+ (n_rels > 0)
+     * | matched[num_rels] for each thread |
+     * +-----------------------------------+ -----
+     */
+} kern_gputask;
+
+#define __KERN_GPUTASK_WARP_OFFSET(n_rels,nbytes,gid)           \
+    (MAXALIGN(offsetof(kern_gputask,stats[(n_rels)])) +         \
+     KERN_WARP_CONTEXT_UNITSZ(n_rels,nbytes) * ((gid)/WARPSIZE))
+
+#define KERN_GPUTASK_WARP_CONTEXT(kgtask)                       \
+    ((kern_warp_context *)                                      \
+     ((char *)(kgtask) +                                        \
+      __KERN_GPUTASK_WARP_OFFSET((kgtask)->n_rels,              \
+                                 (kgtask)->kvars_nbytes,        \
+                                 get_global_id())))
+#define KERN_GPUTASK_LSTATE_ARRAY(kgtask)                       \
+    ((kgtask)->n_rels == 0 ? NULL : (uint32_t *)                \
+     ((char *)(kgtask) +                                        \
+      __KERN_GPUTASK_WARP_OFFSET((kgtask)->n_rels,              \
+                                 (kgtask)->kvars_nbytes,        \
+                                 get_global_size()) +           \
+      sizeof(uint32_t) * (kgtask)->n_rels * get_global_id()))
+#define KERN_GPUTASK_MATCHED_ARRAY(kgtask)                      \
+    ((kgtask)->n_rels == 0 ? NULL : (bool *)                    \
+     ((char *)(kgtask) +                                        \
+      __KERN_GPUTASK_WARP_OFFSET((kgtask)->n_rels,              \
+                                 (kgtask)->kvars_nbytes,        \
+                                 get_global_size()) +           \
+      sizeof(uint32_t) * (kgtask)->n_rels * get_global_size() + \
+      sizeof(bool) * (kgtask)->n_rels * get_global_id()))
+
+#define KERN_GPUTASK_LENGTH(n_rels,nbytes,n_threads)            \
+    (__KERN_GPUTASK_WARP_OFFSET((n_rels),(nbytes),(n_threads)) + \
+     sizeof(uint32_t) * (n_rels) * (n_threads) +                \
+     sizeof(bool) * (n_rels) * (n_threads))
+
+/*
+ * GPU Kernel Entrypoint
+ */
+KERNEL_FUNCTION(void)
+kern_gpuscan_main(kern_session_info *session,
+                  kern_gputask *kgtask,
+                  kern_multirels *__kmrels,     /* always null */
+                  kern_data_store *kds_src,
+                  kern_data_extra *kds_extra,
+                  kern_data_store *kds_dst);
+KERNEL_FUNCTION(void)
+kern_gpujoin_main(kern_session_info *session,
+                  kern_gputask *kgtask,
+                  kern_multirels *kmrels,
+                  kern_data_store *kds_src,
+                  kern_data_extra *kds_extra,
+                  kern_data_store *kds_dst);
 
 #endif /* CUDA_COMMON_H */
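Similarly, `KERN_GPUTASK_LENGTH()` above lays out one warp context per warp, followed by per-thread l_state[] and matched[] arrays used by the join loop. A hedged back-of-the-envelope sketch; the concrete sizes below are assumed for illustration only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned warpsz = 32;
    unsigned n_threads = 1024;          /* grid_sz * block_sz */
    unsigned n_rels    = 2;
    size_t   head      = 128;           /* MAXALIGN(offsetof(kern_gputask, stats[n_rels])) */
    size_t   wcxt_unit = 4480;          /* KERN_WARP_CONTEXT_UNITSZ(n_rels, nbytes) */
    size_t   length;

    length = head
           + wcxt_unit * (n_threads / warpsz)        /* one context per warp */
           + sizeof(uint32_t) * n_rels * n_threads   /* l_state[], per thread */
           + sizeof(char)     * n_rels * n_threads;  /* matched[], per thread */
    printf("kern_gputask length = %zu bytes\n", length);
    return 0;
}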
diff --git a/src/cuda_gpujoin.cu b/src/cuda_gpujoin.cu
index deeb40446..4e1a5f08f 100644
--- a/src/cuda_gpujoin.cu
+++ b/src/cuda_gpujoin.cu
@@ -4,1924 +4,599 @@
  * GPU accelerated parallel relations join based on hash-join or
  * nested-loop logic.
  * --
- * Copyright 2011-2021 (C) KaiGai Kohei
- * Copyright 2014-2021 (C) PG-Strom Developers Team
+ * Copyright 2011-2023 (C) KaiGai Kohei
+ * Copyright 2014-2023 (C) PG-Strom Developers Team
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the PostgreSQL License.
  */
 #include "cuda_common.h"
-#include "cuda_gpujoin.h"
 
 /*
- * static shared variables
+ * GPU Nested-Loop
  */
-static __shared__ cl_bool   scan_done;
-static __shared__ cl_int    base_depth;
-static __shared__ cl_uint   src_read_pos;
-static __shared__ cl_uint   dst_base_index;
-static __shared__ size_t    dst_base_usage;
-extern __shared__ cl_uint   wip_count[0];   /* [GPUJOIN_MAX_DEPTH+1] items */
-extern __shared__ cl_uint   read_pos[0];    /* [GPUJOIN_MAX_DEPTH+1] items */
-extern __shared__ cl_uint   write_pos[0];   /* [GPUJOIN_MAX_DEPTH+1] items */
-extern __shared__ cl_uint   temp_pos[0];    /* [GPUJOIN_MAX_DEPTH+1] items */
-extern __shared__ cl_uint   gist_pos[0];    /* [(GPUJOIN_MAX_DEPTH+1)*32] items */
-static __shared__ cl_uint   stat_source_nitems;
-extern __shared__ cl_uint   stat_nitems[0]; /* [GPUJOIN_MAX_DEPTH+1] items */
-extern __shared__ cl_uint   stat_nitems2[0];/* [GPUJOIN_MAX_DEPTH+1] items */
-
-/*
- * gpujoin_suspend_context
- */
-STATIC_FUNCTION(void)
-gpujoin_suspend_context(kern_gpujoin *kgjoin,
-                        cl_int depth, cl_uint *l_state, cl_bool *matched)
-{
-    gpujoinSuspendContext *sb;
-    cl_int      i, max_depth = kgjoin->num_rels;
-
-    sb = KERN_GPUJOIN_SUSPEND_CONTEXT(kgjoin, get_group_id());
-    if (get_local_id() == 0)
-    {
-        sb->depth = depth;
-        sb->scan_done = scan_done;
-        sb->src_read_pos = src_read_pos;
-        sb->stat_source_nitems = stat_source_nitems;
-    }
-
-    for (i=get_local_id(); i <= max_depth; i+=get_local_size())
-    {
-        sb->pd[i].wip_count = wip_count[i];
-        sb->pd[i].read_pos = read_pos[i];
-        sb->pd[i].write_pos = write_pos[i];
-        sb->pd[i].temp_pos = temp_pos[i];
-        memcpy(sb->pd[i].gist_pos, gist_pos + i * MAXWARPS_PER_BLOCK,
-               sizeof(cl_uint) * MAXWARPS_PER_BLOCK);
-        sb->pd[i].stat_nitems = stat_nitems[i];
-        sb->pd[i].stat_nitems2 = stat_nitems2[i];
-    }
-
-    for (i=0; i <= max_depth; i++)
-    {
-        sb->pd[i].l_state[get_local_id()] = l_state[i];
-        sb->pd[i].matched[get_local_id()] = matched[i];
-    }
-    /* tells host-code GPU kernel needs to be resumed */
-    if (get_local_id() == 0)
-        atomicAdd(&kgjoin->suspend_count, 1);
-    __syncthreads();
-}
-
-/*
- * gpujoin_resume_context
- */
-STATIC_FUNCTION(cl_int)
-gpujoin_resume_context(kern_gpujoin *kgjoin,
-                       cl_uint *l_state, cl_bool *matched)
-{
-    gpujoinSuspendContext *sb;
-    cl_int      i, max_depth = kgjoin->num_rels;
-
-    sb = KERN_GPUJOIN_SUSPEND_CONTEXT(kgjoin, get_group_id());
-    if (get_local_id() == 0)
-    {
-        scan_done = sb->scan_done;
-        src_read_pos = sb->src_read_pos;
-        stat_source_nitems = sb->stat_source_nitems;
-    }
-
-    for (i=get_local_id(); i <= max_depth; i+=get_local_size())
-    {
-        wip_count[i] = sb->pd[i].wip_count;
-        read_pos[i] = sb->pd[i].read_pos;
-        write_pos[i] = sb->pd[i].write_pos;
-        temp_pos[i] = sb->pd[i].temp_pos;
-        memcpy(gist_pos + i * MAXWARPS_PER_BLOCK, sb->pd[i].gist_pos,
-               sizeof(cl_uint) * MAXWARPS_PER_BLOCK);
-        stat_nitems[i] = sb->pd[i].stat_nitems;
-        stat_nitems2[i] = sb->pd[i].stat_nitems2;
-    }
-
-    for (i=0; i <= max_depth; i++)
-    {
-        l_state[i] = sb->pd[i].l_state[get_local_id()];
-        matched[i] = sb->pd[i].matched[get_local_id()];
-    }
-    return sb->depth;
-}
-
-/*
- * gpujoin_rewind_stack
- */
-STATIC_INLINE(cl_int)
-gpujoin_rewind_stack(kern_gpujoin *kgjoin, cl_int depth,
-                     cl_uint *l_state, cl_bool *matched)
-{
-    cl_int      max_depth = kgjoin->num_rels;
-    static __shared__ cl_int    __depth;
-
-    assert(depth >= base_depth && depth <= max_depth);
-    __syncthreads();
-    if (get_local_id() == 0)
-    {
-        __depth = depth;
-        for (;;)
-        {
-            /*
-             * At the time of rewind, all the upper tuples (outer combinations
-             * from the standpoint of deeper depth) are already processed.
-             * So, we can safely rewind the read/write index of this depth.
-             */
-            read_pos[__depth] = 0;
-            write_pos[__depth] = 0;
-
-            /*
-             * If any of outer combinations are in progress to find out
-             * matching inner tuple, we have to resume the task, prior
-             * to the increment of read pointer.
-             */
-            if (wip_count[__depth] > 0)
-                break;
-            if (__depth == base_depth ||
-                read_pos[__depth-1] < write_pos[__depth-1])
-                break;
-            __depth--;
-        }
-    }
-    __syncthreads();
-    depth = __depth;
-    if (depth < max_depth)
-    {
-        memset(l_state + depth + 1, 0,
-               sizeof(cl_uint) * (max_depth - depth));
-        memset(matched + depth + 1, 0,
-               sizeof(cl_bool) * (max_depth - depth));
-    }
-    if (scan_done && depth == base_depth)
-        return -1;
-    return depth;
-}
-
-/*
- * gpujoin_load_source
- */
-STATIC_FUNCTION(cl_int)
-gpujoin_load_source(kern_context *kcxt,
-                    kern_gpujoin *kgjoin,
-                    kern_data_store *kds_src,
-                    kern_data_extra *kds_extra,
-                    cl_uint *wr_stack,
-                    cl_uint *l_state)
+STATIC_FUNCTION(int)
+execGpuJoinNestLoop(kern_context *kcxt,
+                    kern_warp_context *wp,
+                    kern_multirels *kmrels,
+                    int depth,
+                    char *src_kvars_addr_wp,
+                    char *dst_kvars_addr_wp,
+                    uint32_t &l_state,
+                    bool &matched)
 {
-    cl_uint     t_offset = UINT_MAX;
-    cl_bool     visible = false;
-    cl_uint     count;
-    cl_uint     wr_index;
-
-    /* extract a HeapTupleHeader */
-    if (kds_src->format == KDS_FORMAT_ROW)
-    {
-        kern_tupitem   *tupitem;
-        cl_uint         row_index;
-
-        /* fetch next window */
-        if (get_local_id() == 0)
-            src_read_pos = atomicAdd(&kgjoin->src_read_pos,
-                                     get_local_size());
-        __syncthreads();
-        row_index = src_read_pos + get_local_id();
-
-        if (row_index < __ldg(&kds_src->nitems))
-        {
-            tupitem = KERN_DATA_STORE_TUPITEM(kds_src, row_index);
-            t_offset = __kds_packed((char *)&tupitem->htup -
-                                    (char *)kds_src);
-            visible = gpujoin_quals_eval(kcxt,
-                                         kds_src,
-                                         &tupitem->htup.t_ctid,
-                                         &tupitem->htup);
-        }
-        assert(wip_count[0] == 0);
-    }
-    else if (kds_src->format == KDS_FORMAT_BLOCK)
-    {
-        cl_uint     part_sz = KERN_DATA_STORE_PARTSZ(kds_src);
-        cl_uint     n_parts = get_local_size() / part_sz;
-        cl_uint     part_id;
-        cl_uint     line_no;
-        cl_uint     n_lines;
-        cl_uint     loops = l_state[0]++;
-
-        /* fetch next window, if needed */
-        if (loops == 0 && get_local_id() == 0)
-            src_read_pos = atomicAdd(&kgjoin->src_read_pos, n_parts);
-        __syncthreads();
-        part_id = src_read_pos + get_local_id() / part_sz;
-        line_no = get_local_id() % part_sz + loops * part_sz + 1;
-
-        if (part_id < __ldg(&kds_src->nitems) &&
-            get_local_id() < part_sz * n_parts)
-        {
-            PageHeaderData *pg_page;
-            BlockNumber     block_nr;
-            ItemPointerData t_self;
-            HeapTupleHeaderData *htup;
-
-            pg_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_src, part_id);
-            n_lines = PageGetMaxOffsetNumber(pg_page);
-            block_nr = KERN_DATA_STORE_BLOCK_BLCKNR(kds_src, part_id);
-
-            if (line_no <= n_lines)
-            {
-                ItemIdData *lpp = PageGetItemId(pg_page, line_no);
-                if (ItemIdIsNormal(lpp))
-                {
-                    t_offset = (cl_uint)((char *)lpp - (char *)kds_src);
-                    t_self.ip_blkid.bi_hi = block_nr >> 16;
-                    t_self.ip_blkid.bi_lo = block_nr & 0xffff;
-                    t_self.ip_posid = line_no;
-
-                    htup = PageGetItem(pg_page, lpp);
-
-                    visible = gpujoin_quals_eval(kcxt,
-                                                 kds_src,
-                                                 &t_self,
-                                                 htup);
-                }
-            }
-        }
-    }
-    else if (kds_src->format == KDS_FORMAT_ARROW)
-    {
-        cl_uint     row_index;
-
-        /* fetch next window */
-        if (get_local_id() == 0)
-            src_read_pos = atomicAdd(&kgjoin->src_read_pos,
-                                     get_local_size());
-        __syncthreads();
-        row_index = src_read_pos + get_local_id();
-
-        if (row_index < __ldg(&kds_src->nitems))
-        {
-            t_offset = row_index + 1;
-            visible = gpujoin_quals_eval_arrow(kcxt,
-                                               kds_src,
-                                               row_index);
-        }
-        assert(wip_count[0] == 0);
-    }
-    else if (kds_src->format == KDS_FORMAT_COLUMN)
-    {
-        cl_uint     row_index;
-
-        /* fetch next window */
-        if (get_local_id() == 0)
-            src_read_pos = atomicAdd(&kgjoin->src_read_pos,
-                                     get_local_size());
-        __syncthreads();
-
-        row_index = src_read_pos + get_local_id();
-        if (row_index < kds_src->nitems &&
-            kern_check_visibility_column(kcxt, kds_src, row_index))
-        {
-            t_offset = row_index + 1;
-            visible = gpujoin_quals_eval_column(kcxt,
-                                                kds_src,
-                                                kds_extra,
-                                                row_index);
-        }
-        assert(wip_count[0] == 0);
-    }
-    else
-    {
-        STROM_ELOG(kcxt, "unsupported KDS format");
-    }
-    /* error checks */
-    if (__syncthreads_count(kcxt->errcode) > 0)
-        return -1;
-    /* statistics */
-    count = __syncthreads_count(t_offset != UINT_MAX);
-    if (get_local_id() == 0)
-    {
-        if (__ldg(&kds_src->format) == KDS_FORMAT_BLOCK)
-            wip_count[0] = count;
-        stat_source_nitems += count;
-    }
+    kern_data_store *kds_heap = KERN_MULTIRELS_INNER_KDS(kmrels, depth-1);
+    bool       *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth-1);
+    kern_expression *kexp;
+    uint32_t    read_pos;
+    uint32_t    write_pos;
+    uint32_t    mask;
+    bool        tuple_is_valid = false;
 
-    /* store the source tuple if visible */
-    wr_index = pgstromStairlikeBinaryCount(visible, &count);
-    if (count > 0)
+    if (WARP_WRITE_POS(wp,depth) >= WARP_READ_POS(wp,depth) + warpSize)
     {
-        wr_index += write_pos[0];
-        __syncthreads();
-        if (get_local_id() == 0)
-        {
-            write_pos[0] += count;
-            stat_nitems[0] += count;
-        }
-        if (visible)
-            wr_stack[wr_index] = t_offset;
-        __syncthreads();
        /*
-         * An iteration can fetch up to get_local_size() tuples
-         * at once, thus, we try to dive into deeper depth prior
-         * to the next outer tuples.
+         * The destination depth already keeps warpSize or more pending
+         * tuples. So, flush out these tuples first.
         */
-        if (write_pos[0] + get_local_size() > GPUJOIN_PSEUDO_STACK_NROOMS)
-            return 1;
-        __syncthreads();
-    }
-    else
-    {
-        /* no tuples we could fetch */
-        assert(write_pos[0] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS);
-        l_state[0] = 0;
-        __syncthreads();
+        return depth+1;
     }
-    /* End of the outer relation? */
-    if (src_read_pos >= kds_src->nitems)
+    if (__all_sync(__activemask(), l_state >= kds_heap->nitems))
     {
-        /* don't rewind the stack any more */
-        if (get_local_id() == 0)
-            scan_done = true;
-        __syncthreads();
        /*
-         * We may have to dive into the deeper depth if we still have
-         * pending join combinations.
+         * OK, all the threads in this warp reached the end of the
+         * hash-slot chain. Due to the above checks, the next depth
+         * has enough space to store the result in this depth.
         */
+        if (LaneId() == 0)
+            WARP_READ_POS(wp,depth-1) = Min(WARP_READ_POS(wp,depth-1) + warpSize,
+                                            WARP_WRITE_POS(wp,depth-1));
+        __syncwarp();
+        l_state = 0;
+        matched = false;
+        if (wp->scan_done >= depth)
+        {
+            assert(wp->scan_done == depth);
+            if (WARP_READ_POS(wp,depth-1) >= WARP_WRITE_POS(wp,depth-1))
             {
-                if (temp_pos[depth] > 0)
-                    return depth;
-                if (read_pos[depth] < write_pos[depth])
-                    return depth+1;
+                if (LaneId() == 0)
+                    wp->scan_done = Max(wp->scan_done, depth+1);
+                return depth+1;
             }
-            return -1;
+            /*
+             * Elsewhere, remaining tuples in the combination buffer
+             * shall be wiped out first; then we update 'scan_done'
+             * to mark that this depth will never generate results
+             * any more.
+             */
         }
-        return 1;
-    }
-    return 0;
-}
-
-/*
- * gpujoin_load_outer
- */
-STATIC_FUNCTION(cl_int)
-gpujoin_load_outer(kern_context *kcxt,
-                   kern_gpujoin *kgjoin,
-                   kern_multirels *kmrels,
-                   cl_int outer_depth,
-                   cl_uint *wr_stack,
-                   cl_uint *l_state)
-{
-    kern_data_store *kds_in = KERN_MULTIRELS_INNER_KDS(kmrels, outer_depth);
-    cl_bool    *ojmap = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, outer_depth);
-    HeapTupleHeaderData *htup = NULL;
-    kern_tupitem *tupitem;
-    cl_uint     t_offset;
-    cl_uint     row_index;
-    cl_uint     wr_index;
-    cl_uint     count;
-
-    assert(ojmap != NULL);
-
-    if (get_local_id() == 0)
-        src_read_pos = atomicAdd(&kgjoin->src_read_pos,
-                                 get_local_size());
-    __syncthreads();
-    row_index = src_read_pos + get_local_id();
-
-    /* pickup inner rows, if unreferenced */
-    if (row_index < kds_in->nitems && !ojmap[row_index])
-    {
-        tupitem = KERN_DATA_STORE_TUPITEM(kds_in, row_index);
-        t_offset = __kds_packed((char *)&tupitem->htup -
-                                (char *)kds_in);
-        htup = &tupitem->htup;
-    }
-    wr_index = write_pos[outer_depth];
-    wr_index += pgstromStairlikeBinaryCount(htup != NULL, &count);
-    __syncthreads();
-    if (count > 0)
-    {
-        if (get_local_id() == 0)
-        {
-            write_pos[outer_depth] += count;
-            stat_nitems[outer_depth] += count;
-        }
-        if (htup)
+        else
         {
-            wr_stack += wr_index * (outer_depth + 1);
-            memset(wr_stack, 0, sizeof(cl_uint) * outer_depth);
-            wr_stack[outer_depth] = t_offset;
+            /* back to the previous depth to generate the source tuples. */
+            if (WARP_READ_POS(wp,depth-1) + warpSize > WARP_WRITE_POS(wp,depth-1))
+                return depth-1;
         }
-        __syncthreads();
     }
-
-    /* end of the inner relation? */
-    if (src_read_pos >= kds_in->nitems)
+    read_pos = WARP_READ_POS(wp,depth-1) + LaneId();
+    if (read_pos < WARP_WRITE_POS(wp,depth-1))
     {
-        /* don't rewind the stack any more */
-        if (get_local_id() == 0)
-            scan_done = true;
-        __syncthreads();
+        uint32_t    index = l_state++;
 
-        /*
-         * We may have to dive into the deeper depth if we still have
-         * pending join combinations.
-         */
-        if (write_pos[outer_depth] == 0)
+        read_pos = (read_pos % UNIT_TUPLES_PER_DEPTH);
+        kcxt->kvars_slot = (kern_variable *)
+            (src_kvars_addr_wp + read_pos * kcxt->kvars_nbytes);
+        kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots);
+        if (index < kds_heap->nitems)
         {
-            cl_int      max_depth = kgjoin->num_rels;
+            kern_tupitem   *tupitem;
+            uint32_t        offset = KDS_GET_ROWINDEX(kds_heap)[index];
+            xpu_int4_t      status;
 
-            for (cl_int depth=outer_depth + 1; depth <= max_depth; depth++)
+            tupitem = (kern_tupitem *)((char *)kds_heap +
+                                       kds_heap->length -
+                                       __kds_unpack(offset));
+            kexp = SESSION_KEXP_JOIN_LOAD_VARS(kcxt->session, depth-1);
+            ExecLoadVarsHeapTuple(kcxt, kexp, depth, kds_heap, &tupitem->htup);
+            kexp = SESSION_KEXP_JOIN_QUALS(kcxt->session, depth-1);
+            if (EXEC_KERN_EXPRESSION(kcxt, kexp, &status))
             {
-                if (read_pos[depth] < write_pos[depth])
-                    return depth+1;
+                assert(!XPU_DATUM_ISNULL(&status));
+                if (status.value > 0)
+                    tuple_is_valid = true;
+                if (status.value != 0)
+                    matched = true;
             }
-            return -1;
-        }
-        return outer_depth+1;
-    }
-    return outer_depth;
-}
-
-/*
- * gpujoin_projection_row
- */
-STATIC_FUNCTION(cl_int)
-gpujoin_projection_row(kern_context *kcxt,
-                       kern_gpujoin *kgjoin,
-                       kern_multirels *kmrels,
-                       kern_data_store *kds_src,
-                       kern_data_extra *kds_extra,
-                       kern_data_store *kds_dst,
-                       cl_uint *rd_stack,
-                       cl_uint *l_state,
-                       cl_bool *matched)
-{
-    cl_uint     nrels = kgjoin->num_rels;
-    cl_uint     read_index;
-    cl_uint     dest_index;
-    size_t      dest_offset;
-    cl_uint     count;
-    cl_uint     nvalids;
-    cl_uint     required;
-    cl_char    *tup_dclass;
-    Datum      *tup_values;
-    cl_int      needs_suspend = 0;
-
-    /* sanity checks */
-    assert(rd_stack != NULL);
-
-    /* Any more result rows to be written? */
-    if (read_pos[nrels] >= write_pos[nrels])
-        return gpujoin_rewind_stack(kgjoin, nrels, l_state, matched);
-
-    /* Allocation of tup_dclass/values */
-    tup_dclass = (cl_char *)
-        kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols);
-    tup_values = (Datum *)
-        kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols);
-    if (!tup_dclass || !tup_values)
-        STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory");
-    if (__syncthreads_count(kcxt->errcode) > 0)
-        return -1;      /* bailout GpuJoin */
-
-    /* pick up combinations from the pseudo-stack */
-    nvalids = Min(write_pos[nrels] - read_pos[nrels],
-                  get_local_size());
-    read_index = read_pos[nrels] + get_local_id();
-    __syncthreads();
-
-    /* step.1 - compute length of the result tuple to be written */
-    if (read_index < write_pos[nrels])
-    {
-        rd_stack += read_index * (nrels + 1);
-
-        gpujoin_projection(kcxt,
-                           kds_src,
-                           kds_extra,
-                           kmrels,
-                           rd_stack,
-                           kds_dst,
-                           tup_dclass,
-                           tup_values,
-                           NULL);
-        required = MAXALIGN(offsetof(kern_tupitem, htup) +
-                            compute_heaptuple_size(kcxt,
-                                                   kds_dst,
-                                                   tup_dclass,
-                                                   tup_values));
-    }
-    else
-        required = 0;
-
-    if (__syncthreads_count(kcxt->errcode) > 0)
-        return -1;      /* bailout */
-
-    /* step.2 - increments nitems/usage of the kds_dst */
-    dest_offset = pgstromStairlikeSum(required, &count);
-    assert(count > 0);
-    if (get_local_id() == 0)
-    {
-        union {
-            struct {
-                cl_uint nitems;
-                cl_uint usage;
-            } i;
-            cl_ulong v64;
-        } oldval, curval, newval;
-
-        needs_suspend = 0;
-        curval.i.nitems = kds_dst->nitems;
-        curval.i.usage  = kds_dst->usage;
-        do {
-            newval = oldval = curval;
-            newval.i.nitems += nvalids;
-            newval.i.usage  += __kds_packed(count);
-
-            if (KERN_DATA_STORE_HEAD_LENGTH(kds_dst) +
-                STROMALIGN(sizeof(cl_uint) * newval.i.nitems) +
-                __kds_unpack(newval.i.usage) > kds_dst->length)
+            if (oj_map &&
matched) { - needs_suspend = 1; - break; + assert(tupitem->rowid < kds_heap->nitems); + oj_map[tupitem->rowid] = true; } - } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - dst_base_index = oldval.i.nitems; - dst_base_usage = __kds_unpack(oldval.i.usage); + } + else if (kmrels->chunks[depth-1].left_outer && + index >= kds_heap->nitems && !matched) + { + /* fill up NULL fields, if FULL/LEFT OUTER JOIN */ + kexp = SESSION_KEXP_JOIN_LOAD_VARS(kcxt->session, depth-1); + ExecLoadVarsHeapTuple(kcxt, kexp, depth, kds_heap, NULL); + tuple_is_valid = true; + } } - if (__syncthreads_count(needs_suspend) > 0) + /* error checks */ + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return -1; + /* save the result */ + mask = __ballot_sync(__activemask(), tuple_is_valid); + if (LaneId() == 0) { - /* No space left on the kds_dst, suspend the GPU kernel and bailout */ - gpujoin_suspend_context(kgjoin, nrels+1, l_state, matched); - return -2; /* <-- not to update statistics */ + write_pos = WARP_WRITE_POS(wp,depth); + WARP_WRITE_POS(wp,depth) += __popc(mask); } - dest_index = dst_base_index + get_local_id(); - dest_offset += dst_base_usage + required; + write_pos = __shfl_sync(__activemask(), write_pos, 0); + mask &= ((1U << LaneId()) - 1); + write_pos += __popc(mask); - /* step.3 - write out HeapTuple on the destination buffer */ - if (required > 0) + if (tuple_is_valid) { - cl_uint *row_index = KERN_DATA_STORE_ROWINDEX(kds_dst); - kern_tupitem *tupitem = (kern_tupitem *) - ((char *)kds_dst + kds_dst->length - dest_offset); - form_kern_heaptuple(kcxt, - tupitem, - kds_dst, - NULL, /* ItemPointerData */ - tup_dclass, - tup_values); - tupitem->rowid = dest_index; - row_index[dest_index] = __kds_packed(kds_dst->length - dest_offset); + write_pos = (write_pos % UNIT_TUPLES_PER_DEPTH); + memcpy(dst_kvars_addr_wp + write_pos * kcxt->kvars_nbytes, + kcxt->kvars_slot, + kcxt->kvars_nbytes); } - if (__syncthreads_count(kcxt->errcode) > 0) - return -1; /* bailout */ - - /* step.4 - make advance the read position */ - if (get_local_id() == 0) - read_pos[nrels] += nvalids; - return nrels + 1; + __syncwarp(); + if (WARP_WRITE_POS(wp,depth) >= WARP_READ_POS(wp,depth) + warpSize) + return depth+1; + return depth; } -/* to be defined by gpupreagg.c */ -DEVICE_FUNCTION(void) -gpupreagg_projection_slot(kern_context *kcxt_gpreagg, - cl_char *src_dclass, - Datum *src_values, - cl_char *dst_dclass, - Datum *dst_values); - /* - * gpujoin_projection_slot + * GPU Hash-Join */ -STATIC_FUNCTION(cl_int) -gpujoin_projection_slot(kern_context *kcxt, - kern_parambuf *kparams_gpreagg, - kern_gpujoin *kgjoin, - kern_multirels *kmrels, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_data_store *kds_dst, - cl_uint *rd_stack, - cl_uint *l_state, - cl_bool *matched) +STATIC_FUNCTION(int) +execGpuJoinHashJoin(kern_context *kcxt, + kern_warp_context *wp, + kern_multirels *kmrels, + int depth, + char *src_kvars_addr_wp, + char *dst_kvars_addr_wp, + uint32_t &l_state, + bool &matched) { - kern_parambuf *kparams_saved = kcxt->kparams; - cl_uint nrels = kgjoin->num_rels; - cl_uint read_index; - cl_uint dest_index; - size_t dest_offset; - cl_uint count; - cl_uint nvalids; - cl_bool tup_is_valid = false; - cl_char *tup_dclass = NULL; - Datum *tup_values = NULL; - cl_uint *tup_extras = NULL; - cl_uint extra_sz = 0; - cl_int needs_suspend = 0; - - /* sanity checks */ - assert(rd_stack != NULL); - - /* Any more result rows to be written? 
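/*
 * NOTE: a restatement of the join-quals status convention used above, as
 * this sketch reads it (illustrative only, not part of this patch):
 * status > 0 appears to mean "all quals satisfied, emit the combination";
 * status < 0 appears to mean "join keys matched but the row is filtered
 * out", which still marks the inner row in oj_map so RIGHT/FULL OUTER
 * JOIN will not NULL-fill it later; status == 0 means no match at all.
 */
INLINE_FUNCTION(void)
__apply_join_quals_status(int32_t status, bool *tuple_is_valid, bool *matched)
{
	if (status > 0)
		*tuple_is_valid = true;		/* project this combination */
	if (status != 0)
		*matched = true;			/* suppress the OUTER JOIN NULL-fill */
}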
*/ - if (read_pos[nrels] >= write_pos[nrels]) - return gpujoin_rewind_stack(kgjoin, nrels, l_state, matched); - - /* Allocation of tup_dclass/values/extra */ - tup_dclass = (cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); - tup_extras = (cl_uint *) - kern_context_alloc(kcxt, sizeof(cl_uint) * kds_dst->ncols); - if (!tup_dclass || !tup_values || !tup_extras) - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); - if (__syncthreads_count(kcxt->errcode) > 0) - return -1; /* bailout GpuJoin */ - - /* pick up combinations from the pseudo-stack */ - nvalids = Min(write_pos[nrels] - read_pos[nrels], - get_local_size()); - read_index = read_pos[nrels] + get_local_id(); - __syncthreads(); - - /* step.1 - projection by GpuJoin */ - if (read_index < write_pos[nrels]) + kern_data_store *kds_hash = KERN_MULTIRELS_INNER_KDS(kmrels, depth-1); + bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth-1); + kern_expression *kexp = NULL; + kern_hashitem *khitem = NULL; + uint32_t read_pos; + uint32_t write_pos; + uint32_t index; + uint32_t mask; + bool tuple_is_valid = false; + + if (WARP_WRITE_POS(wp,depth) >= WARP_READ_POS(wp,depth) + warpSize) { - rd_stack += read_index * (nrels + 1); - - extra_sz = gpujoin_projection(kcxt, - kds_src, - kds_extra, - kmrels, - rd_stack, - kds_dst, - tup_dclass, - tup_values, - tup_extras); - tup_is_valid = true; - } - - /* step.2 - increments nitems/usage of the kds_dst */ - dest_offset = pgstromStairlikeSum(extra_sz, &count); - if (get_local_id() == 0) - { - union { - struct { - cl_uint nitems; - cl_uint usage; - } i; - cl_ulong v64; - } oldval, curval, newval; - - needs_suspend = 0; - curval.i.nitems = kds_dst->nitems; - curval.i.usage = kds_dst->usage; - do { - newval = oldval = curval; - newval.i.nitems += nvalids; - newval.i.usage += __kds_packed(count); - - if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + - __kds_unpack(newval.i.usage) > kds_dst->length) - { - needs_suspend = 1; - break; - } - } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - dst_base_index = oldval.i.nitems; - dst_base_usage = __kds_unpack(oldval.i.usage); - } - if (__syncthreads_count(needs_suspend) > 0) - { - /* No space left on the kds_dst, suspend the GPU kernel and bailout */ - gpujoin_suspend_context(kgjoin, nrels+1, l_state, matched); - return -2; /* <-- not to update statistics */ - } - dest_index = dst_base_index + get_local_id(); - dest_offset += dst_base_usage + extra_sz; - - /* step.3 - projection by GpuPreAgg on the destination buffer */ - if (tup_is_valid) - { - cl_char *dst_dclass = KERN_DATA_STORE_DCLASS(kds_dst, dest_index); - Datum *dst_values = KERN_DATA_STORE_VALUES(kds_dst, dest_index); - - /* - * Fixup pointers, if it points out of kds_src/kmrels because these - * variables must be visible to the next GpuPreAgg kernel. 
- */ - if (extra_sz > 0) - { - char *dpos = (char *)kds_dst + kds_dst->length - dest_offset; - char *addr; - cl_int extra_sum = 0; - cl_int len; - - for (int j=0; j < kds_dst->ncols; j++) - { - len = tup_extras[j]; - if (len == 0) - continue; - addr = DatumGetPointer(tup_values[j]); - memcpy(dpos, addr, len); - tup_values[j] = PointerGetDatum(dpos); - dpos += MAXALIGN(len); - extra_sum += MAXALIGN(len); - } - assert(extra_sz == extra_sum); - } /* - * Initial projection by GpuPreAgg - * - * This code block is generated by gpupreagg.c; that may reference - * const/parameters of GpuPreAgg, not GpuJoin. So, we temporarily - * switch kparams of the current context. + * Next depth already keeps warpSize or more pending tuples, + * so wipe out these tuples first. */ - kcxt->kparams = kparams_gpreagg; - gpupreagg_projection_slot(kcxt, - tup_dclass, - tup_values, - dst_dclass, - dst_values); - kcxt->kparams = kparams_saved; + return depth+1; } - if (__syncthreads_count(kcxt->errcode) > 0) - return -1; /* bailout */ - /* step.4 - make advance the read position */ - if (get_local_id() == 0) - read_pos[nrels] += nvalids; //get_local_size(); - return nrels + 1; -} - -/* - * gpujoin_exec_nestloop - */ -STATIC_FUNCTION(cl_int) -gpujoin_exec_nestloop(kern_context *kcxt, - kern_gpujoin *kgjoin, - kern_multirels *kmrels, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - cl_int depth, - cl_uint *rd_stack, - cl_uint *wr_stack, - cl_uint *l_state, - cl_bool *matched) -{ - kern_data_store *kds_in = KERN_MULTIRELS_INNER_KDS(kmrels, depth); - cl_bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth); - kern_tupitem *tupitem = NULL; - cl_int max_depth = kgjoin->num_rels; - cl_uint x_unitsz; - cl_uint y_unitsz; - cl_uint x_index; /* outer index */ - cl_uint y_index; /* inner index */ - cl_uint wr_index; - cl_uint count; - cl_bool result = false; - __shared__ cl_bool matched_sync[MAXTHREADS_PER_BLOCK]; - - assert(kds_in->format == KDS_FORMAT_ROW); - assert(depth >= 1 && depth <= max_depth); - if (read_pos[depth-1] >= write_pos[depth-1]) + if (__all_sync(__activemask(), l_state == UINT_MAX)) { /* - * When this depth has enough room (even if all the threads generate - * join combinations on the next try), upper depth may be able to - * generate more outer tuples; which shall be used to input for the - * next depth. - * It is mostly valuable to run many combinations on the next depth. + * OK, all the threads in this warp reached to the end of hash-slot + * chain. Due to the above checks, the next depth has enough space + * to store the result in this depth. + * So, we process this depth again (if we have enough pending tuples), + * back to the previsou depth (if we don't have enough pending tuples + * in this depth), or move to the next depth if previous depth already + * reached to end of the chunk. */ - assert(wip_count[depth] == 0); - if (write_pos[depth] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) + if (LaneId() == 0) + WARP_READ_POS(wp,depth-1) = Min(WARP_READ_POS(wp,depth-1) + warpSize, + WARP_WRITE_POS(wp,depth-1)); + __syncwarp(); + l_state = 0; + matched = false; + if (wp->scan_done < depth) { - cl_int __depth = gpujoin_rewind_stack(kgjoin, depth-1, - l_state, matched); - if (__depth >= base_depth) - return __depth; + /* + * The previous depth still may generate the source tuple. 
+ */
+ if (WARP_WRITE_POS(wp,depth-1) < WARP_READ_POS(wp,depth-1) + warpSize)
+ return depth-1;
}
- /* elsewhere, dive into the deeper depth or projection */
- return depth + 1;
- }
- __syncthreads();
- x_unitsz = Min(write_pos[depth-1], get_local_size());
- y_unitsz = get_local_size() / x_unitsz;
-
- x_index = get_local_id() % x_unitsz;
- y_index = get_local_id() / x_unitsz;
-
- if (y_unitsz * l_state[depth] >= kds_in->nitems)
- {
- /*
- * In case of LEFT OUTER JOIN, we need to check whether the outer
- * combination had any matched inner tuples, or not.
- */
- if (KERN_MULTIRELS_LEFT_OUTER_JOIN(kmrels, depth))
+ else
{
- if (get_local_id() < x_unitsz)
- matched_sync[get_local_id()] = false;
- __syncthreads();
- if (matched[depth])
- matched_sync[x_index] = true;
- if (__syncthreads_count(!matched_sync[x_index]) > 0)
+ assert(wp->scan_done == depth);
+ if (WARP_READ_POS(wp,depth-1) >= WARP_WRITE_POS(wp,depth-1))
{
- if (y_index == 0 && y_index < y_unitsz)
- result = !matched_sync[x_index];
- else
- result = false;
- /* adjust x_index and rd_stack as usual */
- x_index += read_pos[depth-1];
- assert(x_index < write_pos[depth-1]);
- rd_stack += (x_index * depth);
- /* don't generate LEFT OUTER tuple any more */
- matched[depth] = true;
- goto left_outer;
+ if (LaneId() == 0)
+ wp->scan_done = depth+1;
+ return depth+1;
}
+ /*
+ * Otherwise, the remaining tuples in the combination buffer
+ * shall be wiped out first; then we update 'scan_done'
+ * to mark that this depth will never generate results any more.
+ */
}
- l_state[depth] = 0;
- matched[depth] = false;
- if (get_local_id() == 0)
- {
- wip_count[depth] = 0;
- read_pos[depth-1] += x_unitsz;
- }
- return depth;
}
- x_index += read_pos[depth-1];
- rd_stack += (x_index * depth);
- if (x_index < write_pos[depth-1] && y_index < y_unitsz)
+ write_pos = WARP_WRITE_POS(wp,depth-1);
+ read_pos = WARP_READ_POS(wp,depth-1) + LaneId();
+ index = (read_pos % UNIT_TUPLES_PER_DEPTH);
+ kcxt->kvars_slot = (kern_variable *) + (src_kvars_addr_wp + index * kcxt->kvars_nbytes);
+ kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots);
+
+ if (l_state == 0)
{
- y_index += y_unitsz * l_state[depth];
- if (y_index < kds_in->nitems)
+ /* pick up the first item from the hash-slot */
+ if (read_pos < write_pos)
{
- tupitem = KERN_DATA_STORE_TUPITEM(kds_in, y_index);
+ xpu_int4_t hash;
+ uint32_t *hslot;
- result = gpujoin_join_quals(kcxt, - kds_src, - kds_extra, - kmrels, - depth, - rd_stack, - &tupitem->htup, - NULL);
- if (result)
+ kexp = SESSION_KEXP_HASH_VALUE(kcxt->session, depth-1);
+ if (EXEC_KERN_EXPRESSION(kcxt, kexp, &hash))
{
- matched[depth] = true;
- if (oj_map && !oj_map[y_index])
- oj_map[y_index] = true;
+ assert(!XPU_DATUM_ISNULL(&hash));
+ hslot = KDS_GET_HASHSLOT(kds_hash, hash.value);
+ for (khitem = KDS_HASH_FIRST_ITEM(kds_hash, hslot, NULL); + khitem != NULL && khitem->hash != hash.value; + khitem = KDS_HASH_NEXT_ITEM(kds_hash, khitem));
}
}
- }
- l_state[depth]++;
-
-left_outer:
- wr_index = write_pos[depth];
- wr_index += pgstromStairlikeBinaryCount(result, &count);
- if (get_local_id() == 0)
- {
- wip_count[depth] = get_local_size();
- write_pos[depth] += count;
- stat_nitems[depth] += count;
- }
- wr_stack += wr_index * (depth + 1);
- if (result)
- {
- memcpy(wr_stack, rd_stack, sizeof(cl_uint) * depth);
- wr_stack[depth] = (!tupitem ? 0 : __kds_packed((char *)&tupitem->htup - - (char *)kds_in));
- }
- __syncthreads();
- /*
- * If we have enough room to store the combinations more, execute this
- * depth one more.
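/*
 * NOTE: minimal sketch of the hash-slot probing pattern used above
 * (illustrative only; the macros and their signatures are the ones this
 * patch introduces). KDS_GET_HASHSLOT() maps a hash value to its bucket
 * head and KDS_HASH_NEXT_ITEM() follows the chain; several hash values
 * can share one bucket, hence items whose khitem->hash differs from the
 * probe value are skipped.
 */
INLINE_FUNCTION(kern_hashitem *)
__hash_chain_first_match(kern_data_store *kds_hash, uint32_t hash_value)
{
	uint32_t *hslot = KDS_GET_HASHSLOT(kds_hash, hash_value);
	kern_hashitem *khitem;

	for (khitem = KDS_HASH_FIRST_ITEM(kds_hash, hslot, NULL);
		 khitem != NULL && khitem->hash != hash_value;
		 khitem = KDS_HASH_NEXT_ITEM(kds_hash, khitem))
		;	/* skip bucket-mates with different hash values */
	return khitem;
}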
Elsewhere, dive into a deeper level to flush results. - */ - if (write_pos[depth] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) - return depth; - return depth + 1; -} - -/* - * gpujoin_exec_hashjoin - */ -STATIC_FUNCTION(cl_int) -gpujoin_exec_hashjoin(kern_context *kcxt, - kern_gpujoin *kgjoin, - kern_multirels *kmrels, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - cl_int depth, - cl_uint *rd_stack, - cl_uint *wr_stack, - cl_uint *l_state, - cl_bool *matched) -{ - kern_data_store *kds_hash = KERN_MULTIRELS_INNER_KDS(kmrels, depth); - cl_bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth); - kern_hashitem *khitem = NULL; - cl_int max_depth = kgjoin->num_rels; - cl_uint t_offset = UINT_MAX; - cl_uint hash_value; - cl_uint rd_index; - cl_uint wr_index; - cl_uint count; - cl_bool result; - - assert(kds_hash->format == KDS_FORMAT_HASH); - assert(depth >= 1 && depth <= max_depth); - - if (__syncthreads_count(l_state[depth] != UINT_MAX) == 0) - { - /* - * OK, all the threads reached to the end of hash-slot chain - * Move to the next outer window. - */ - if (get_local_id() == 0) - read_pos[depth-1] += get_local_size(); - l_state[depth] = 0; - matched[depth] = false; - return depth; - } - else if (read_pos[depth-1] >= write_pos[depth-1]) - { - /* - * When this depth has enough room (even if all the threads generate - * join combinations on the next try), upper depth may be able to - * generate more outer tuples; which shall be used to input for the - * next depth. - * It is mostly valuable to run many combinations on the next depth. - */ - assert(wip_count[depth] == 0); - if (write_pos[depth] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) - { - cl_int __depth = gpujoin_rewind_stack(kgjoin, depth-1, - l_state, matched); - if (__depth >= base_depth) - return __depth; - } - /* elsewhere, dive into the deeper depth or projection */ - return depth + 1; - } - rd_index = read_pos[depth-1] + get_local_id(); - rd_stack += (rd_index * depth); - - if (l_state[depth] == 0) - { - /* first touch to the hash-slot */ - if (rd_index < write_pos[depth-1]) - { - cl_bool is_null_keys; - - hash_value = gpujoin_hash_value(kcxt, - kds_src, - kds_extra, - kmrels, - depth, - rd_stack, - &is_null_keys); - /* MEMO: NULL-keys will never match to inner-join */ - if (!is_null_keys) - khitem = KERN_HASH_FIRST_ITEM(kds_hash, hash_value); - /* rewind the varlena buffer */ - kcxt->vlpos = kcxt->vlbuf; - } else { - /* - * MEMO: We must ensure the threads without outer tuple don't - * generate any LEFT OUTER results. 
- */ - l_state[depth] = UINT_MAX; + l_state = UINT_MAX; } } - else if (l_state[depth] != UINT_MAX) + else if (l_state != UINT_MAX) { - /* walks on the hash-slot chain */ - khitem = (kern_hashitem *)((char *)kds_hash - + __kds_unpack(l_state[depth]) - - offsetof(kern_hashitem, t.htup)); - hash_value = khitem->hash; + /* pick up the next one if any */ + uint32_t hash_value; - /* pick up next one if any */ - khitem = KERN_HASH_NEXT_ITEM(kds_hash, khitem); + khitem = (kern_hashitem *)((char *)kds_hash + __kds_unpack(l_state)); + hash_value = khitem->hash; + for (khitem = KDS_HASH_NEXT_ITEM(kds_hash, khitem); + khitem != NULL && khitem->hash != hash_value; + khitem = KDS_HASH_NEXT_ITEM(kds_hash, khitem)); } - - while (khitem && khitem->hash != hash_value) - khitem = KERN_HASH_NEXT_ITEM(kds_hash, khitem); + /* error checks */ + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return -1; if (khitem) { - cl_bool joinquals_matched; - - assert(khitem->hash == hash_value); + xpu_int4_t status; - result = gpujoin_join_quals(kcxt, - kds_src, - kds_extra, - kmrels, - depth, - rd_stack, - &khitem->t.htup, - &joinquals_matched); - assert(result == joinquals_matched); - if (joinquals_matched) + kexp = SESSION_KEXP_JOIN_LOAD_VARS(kcxt->session, depth-1); + ExecLoadVarsHeapTuple(kcxt, kexp, depth, kds_hash, &khitem->t.htup); + kexp = SESSION_KEXP_JOIN_QUALS(kcxt->session, depth-1); + if (EXEC_KERN_EXPRESSION(kcxt, kexp, &status)) + { + assert(!XPU_DATUM_ISNULL(&status)); + if (status.value > 0) + tuple_is_valid = true; + if (status.value != 0) + matched = true; + } + if (oj_map && matched) { - /* No LEFT/FULL JOIN are needed */ - matched[depth] = true; - /* No RIGHT/FULL JOIN are needed */ assert(khitem->t.rowid < kds_hash->nitems); - if (oj_map && !oj_map[khitem->t.rowid]) - oj_map[khitem->t.rowid] = true; + oj_map[khitem->t.rowid] = true; } - t_offset = __kds_packed((char *)&khitem->t.htup - - (char *)kds_hash); - } - else if (KERN_MULTIRELS_LEFT_OUTER_JOIN(kmrels, depth) && - l_state[depth] != UINT_MAX && - !matched[depth]) - { - /* No matched outer rows, but LEFT/FULL OUTER */ - result = true; + l_state = __kds_packed((char *)khitem - (char *)kds_hash); } else - result = false; - - /* save the current hash item */ - l_state[depth] = t_offset; - wr_index = write_pos[depth]; - wr_index += pgstromStairlikeBinaryCount(result, &count); - if (get_local_id() == 0) - { - write_pos[depth] += count; - stat_nitems[depth] += count; - } - wr_stack += wr_index * (depth + 1); - if (result) - { - memcpy(wr_stack, rd_stack, sizeof(cl_uint) * depth); - wr_stack[depth] = (!khitem ? 0U : t_offset); - } - /* count number of threads still in-progress */ - count = __syncthreads_count(khitem != NULL); - if (get_local_id() == 0) - wip_count[depth] = count; - /* - * (2019/05/25) We saw a strange behavior on Tesla T4 (CUDA 10.1 with - * driver 418.67), but never seen at Pascal/Volta devices. - * Even though "write_pos[depth]" is updated by the leader thread above, - * then __syncthreads_count() shall synchronize all the local threads, - * a part of threads read different value from this variable. - * I doubt compiler may have some optimization problem here, therefore, - * the code below avoid to reference "write_pos[depth]" directly. - * It loads this value to local variable once, then injects a barrier - * synchronization explicitly. - * - * We should check whether the future version of CUDA can fix the problem. 
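/*
 * NOTE: the "save the result" block above is a common warp-cooperative
 * reservation idiom: ballot the valid lanes, let lane 0 advance the write
 * position once, broadcast the old position, then derive each lane's
 * private slot by counting the valid lanes below it. A generic sketch
 * (illustrative only): the original updates a warp-private counter and
 * needs no atomics; the atomicAdd below makes the sketch safe for
 * genuinely shared counters as well.
 */
INLINE_FUNCTION(uint32_t)
__warp_reserve_slots(uint32_t *write_pos, bool is_valid)
{
	uint32_t mask = __ballot_sync(__activemask(), is_valid);
	uint32_t base = 0;

	if (LaneId() == 0)
		base = atomicAdd(write_pos, __popc(mask));
	base = __shfl_sync(__activemask(), base, 0);
	/* rank of this lane among the valid lanes below it */
	return base + __popc(mask & ((1U << LaneId()) - 1));
}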
- */ - wr_index = write_pos[depth]; - __syncthreads(); - if (wr_index + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) - return depth; - return depth+1; -} - -//#include "cuda_postgis.h" - -/* - * gpujoin_prep_gistindex - * - * MEMO: We must load the entire GiST-index, but part of the leaf items indicate - * invalid items because a part of inner rows can be filtered out already. - * So, this kernel function preliminary invalidates these items on the inner - * preload timing. - */ -KERNEL_FUNCTION(void) -gpujoin_prep_gistindex(kern_multirels *kmrels, int depth) -{ - kern_data_store *kds_hash = KERN_MULTIRELS_INNER_KDS(kmrels, depth); - kern_data_store *kds_gist = KERN_MULTIRELS_GIST_INDEX(kmrels, depth); - BlockNumber block_nr; - OffsetNumber i, maxoff; - - assert(kds_hash->format == KDS_FORMAT_HASH && - kds_gist->format == KDS_FORMAT_BLOCK); - assert(depth >= 1 && depth <= kmrels->nrels); - - for (block_nr = get_group_id(); - block_nr < kds_gist->nrooms; - block_nr += get_num_groups()) { - PageHeaderData *gist_page; - ItemIdData *lpp; - IndexTupleData *itup; - kern_hashitem *khitem; - cl_uint hash, t_off; - - gist_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, block_nr); - if (!GistPageIsLeaf(gist_page)) - continue; - maxoff = PageGetMaxOffsetNumber(gist_page); - for (i = get_local_id(); i < maxoff; i += get_local_size()) + if (kmrels->chunks[depth-1].left_outer && + l_state != UINT_MAX && !matched) { - lpp = PageGetItemId(gist_page, i+1); - if (ItemIdIsDead(lpp)) - continue; - itup = (IndexTupleData *)PageGetItem(gist_page, lpp); - - /* lookup kds_hash */ - hash = pg_hash_any((cl_uchar *)&itup->t_tid, - sizeof(ItemPointerData)); - for (khitem = KERN_HASH_FIRST_ITEM(kds_hash, hash); - khitem != NULL; - khitem = KERN_HASH_NEXT_ITEM(kds_hash, khitem)) - { - if (ItemPointerEquals(&khitem->t.htup.t_ctid, &itup->t_tid)) - { - t_off = __kds_packed((char *)&khitem->t.htup - - (char *)kds_hash); - itup->t_tid.ip_blkid.bi_hi = (t_off >> 16); - itup->t_tid.ip_blkid.bi_lo = (t_off & 0x0000ffffU); - itup->t_tid.ip_posid = USHRT_MAX; - break; - } - } - /* invalidate this leaf item, if not exist on kds_hash */ - if (!khitem) - lpp->lp_flags = LP_DEAD; + /* load NULL values on the inner portion */ + kexp = SESSION_KEXP_JOIN_LOAD_VARS(kcxt->session, depth-1); + ExecLoadVarsHeapTuple(kcxt, kexp, depth, kds_hash, NULL); + tuple_is_valid = true; } + l_state = UINT_MAX; } + /* error checks */ + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return -1; + /* save the result on the destination buffer */ + mask = __ballot_sync(__activemask(), tuple_is_valid); + if (LaneId() == 0) + { + write_pos = WARP_WRITE_POS(wp,depth); + WARP_WRITE_POS(wp,depth) += __popc(mask); + } + write_pos = __shfl_sync(__activemask(), write_pos, 0); + mask &= ((1U << LaneId()) - 1); + write_pos += __popc(mask); + if (tuple_is_valid) + { + index = write_pos % UNIT_TUPLES_PER_DEPTH; + memcpy(dst_kvars_addr_wp + index * kcxt->kvars_nbytes, + kcxt->kvars_slot, + kcxt->kvars_nbytes); + } + __syncwarp(); + if (WARP_WRITE_POS(wp,depth) >= WARP_READ_POS(wp,depth) + warpSize) + return depth+1; + return depth; } /* - * gpujoin_gist_getnext + * GPU Projection */ -STATIC_INLINE(ItemPointerData *) -gpujoin_gist_getnext(kern_context *kcxt, - kern_gpujoin *kgjoin, - cl_int depth, - kern_data_store *kds_gist, - void *gist_keys, - cl_uint *p_item_offset) +PUBLIC_FUNCTION(int) +execGpuJoinProjection(kern_context *kcxt, + kern_warp_context *wp, + int n_rels, /* index of read/write-pos */ + kern_data_store *kds_dst, + 
kern_expression *kexp_projection, + char *kvars_addr_wp, + bool *p_try_suspend) { - PageHeaderData *gist_base = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, 0); - PageHeaderData *gist_page; - cl_char *vlpos_saved = kcxt->vlpos; - OffsetNumber start; - OffsetNumber index; - OffsetNumber maxoff; - ItemIdData *lpp = NULL; - IndexTupleData *itup = NULL; - cl_bool rv = false; - - assert(kds_gist->format == KDS_FORMAT_BLOCK); + uint32_t write_pos = WARP_WRITE_POS(wp,n_rels); + uint32_t read_pos = WARP_READ_POS(wp,n_rels); + uint32_t count; + uint32_t mask; + uint32_t row_id; + uint32_t offset; + int tupsz = 0; + int total_sz = 0; + bool try_suspend = false; + union { + struct { + uint32_t nitems; + uint32_t usage; + } i; + uint64_t v64; + } oldval, curval, newval; /* - * Setup starting point of GiST-index lookup + * The previous depth still may produce new tuples, and number of + * the current result tuples is not sufficient to run projection. */ - if (*p_item_offset == UINT_MAX) - { - /* this warp already reached to the end */ - return NULL; - } - else if (*p_item_offset == 0) - { - /* walk on GiST index from the root page */ - start = FirstOffsetNumber + LaneId(); - gist_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, GIST_ROOT_BLKNO); - assert(gist_page->pd_parent_blkno == InvalidBlockNumber && - gist_page->pd_parent_item == InvalidOffsetNumber); - } - else - { - /* walk on GiST index from the next item */ - PageHeaderData *gist_base = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, 0); - size_t off; - - assert(*p_item_offset < kds_gist->length); - lpp = (ItemIdData *)((char *)kds_gist + *p_item_offset); - off = (((char *)lpp - (char *)gist_base) & (BLCKSZ - 1)); - gist_page = (PageHeaderData *)((char *)lpp - off); - start = (lpp - gist_page->pd_linp) + 1 + warpSize; - } -restart: - assert((((char *)gist_page - (char *)gist_base) & (BLCKSZ - 1)) == 0); + if (wp->scan_done <= n_rels && read_pos + warpSize > write_pos) + return n_rels; - if (GistPageIsDeleted(gist_page)) - maxoff = InvalidOffsetNumber; /* skip any entries */ - else - maxoff = PageGetMaxOffsetNumber(gist_page); - - rv = false; - for (index=start; index <= maxoff; index += warpSize) + read_pos += LaneId(); + if (read_pos < write_pos) { - lpp = PageGetItemId(gist_page, index); - if (ItemIdIsDead(lpp)) - continue; - itup = (IndexTupleData *) PageGetItem(gist_page, lpp); + int index = (read_pos % UNIT_TUPLES_PER_DEPTH); - kcxt->vlpos = vlpos_saved; /* rewind */ - rv = gpujoin_gist_index_quals(kcxt, depth, - kds_gist, gist_page, - itup, gist_keys); - if (rv) - break; + kcxt->kvars_slot = (kern_variable *) + (kvars_addr_wp + index * kcxt->kvars_nbytes); + kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); + tupsz = kern_estimate_heaptuple(kcxt, + kexp_projection, + kds_dst); + if (tupsz < 0) + STROM_ELOG(kcxt, "unable to compute tuple size"); } - kcxt->vlpos = vlpos_saved; /* rewind */ + /* error checks */ + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return -1; + /* allocation of the destination buffer */ + assert(kds_dst->format == KDS_FORMAT_ROW); + mask = __ballot_sync(__activemask(), tupsz > 0); + count = __popc(mask); + mask &= ((1U << LaneId()) - 1); + row_id = __popc(mask); + assert(tupsz == 0 || row_id < count); - assert(__activemask() == ~0U); - if (__any_sync(__activemask(), rv)) + offset = __reduce_stair_add_sync(tupsz, &total_sz); + if (LaneId() == 0) { - /* By here, one or more threads meet the matched entry */ - if (!GistPageIsLeaf(gist_page)) - { - /* dive into deeper tree node */ - 
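/*
 * NOTE: __reduce_stair_add_sync() above is assumed to compute, for each
 * lane, the inclusive prefix sum of 'tupsz' across the warp (matching the
 * tail-allocation use, where each tuple's offset is measured back from
 * the end of the buffer), storing the warp total through the second
 * argument. A minimal shuffle-based equivalent, assuming all 32 lanes
 * are active (illustrative only, not part of this patch):
 */
INLINE_FUNCTION(uint32_t)
__warp_prefix_sum_sketch(uint32_t value, uint32_t *p_total)
{
	uint32_t sum = value;

	for (int d = 1; d < warpSize; d *= 2)
	{
		uint32_t temp = __shfl_up_sync(0xffffffffU, sum, d);

		if (LaneId() >= d)
			sum += temp;		/* inclusive scan step */
	}
	/* warp total lives in the last lane after the scan */
	*p_total = __shfl_sync(0xffffffffU, sum, warpSize - 1);
	return sum;
}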
BlockNumber blkno_curr; - BlockNumber blkno_next; - PageHeaderData *gist_next; - OffsetNumber least_index = (rv ? index : UINT_MAX); - OffsetNumber buddy_index; + curval.i.nitems = kds_dst->nitems; + curval.i.usage = kds_dst->usage; + do { + newval = oldval = curval; + newval.i.nitems += count; + newval.i.usage += __kds_packed(total_sz); - for (int mask=1; mask <= 16; mask *= 2) + if (KDS_HEAD_LENGTH(kds_dst) + + MAXALIGN(sizeof(uint32_t) * newval.i.nitems) + + __kds_unpack(newval.i.usage) > kds_dst->length) { - buddy_index = __shfl_xor_sync(__activemask(), least_index, mask); - least_index = Min(least_index, buddy_index); + try_suspend = true; + break; } - __syncwarp(~0U); - assert(least_index <= maxoff); - - lpp = PageGetItemId(gist_page, least_index); - itup = (IndexTupleData *) PageGetItem(gist_page, lpp); - blkno_curr = ((char *)gist_page - (char *)gist_base) / BLCKSZ; - blkno_next = ((BlockNumber)itup->t_tid.ip_blkid.bi_hi << 16 | - (BlockNumber)itup->t_tid.ip_blkid.bi_lo); - assert(blkno_next < kds_gist->nrooms); - gist_next = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, blkno_next); - assert(gist_next->pd_parent_blkno == blkno_curr && - gist_next->pd_parent_item == least_index); - gist_page = gist_next; - start = FirstOffsetNumber + LaneId(); - goto restart; - } - - /* this is matched */ - if (rv) - { - assert((char *)lpp >= (char *)gist_page && - (char *)lpp < (char *)gist_page + BLCKSZ); - *p_item_offset = (cl_uint)((char *)lpp - (char *)kds_gist); - - return &itup->t_tid; - } - - /* - * this is not matched - ensure the next call skips the main loop - * above, we set next offset of the 'maxoff' onto the p_item_offset. - */ - lpp = PageGetItemId(gist_page, maxoff+1); - *p_item_offset = (cl_uint)((char *)lpp - (char *)kds_gist); - - return NULL; + } while ((curval.v64 = atomicCAS((unsigned long long *)&kds_dst->nitems, + oldval.v64, + newval.v64)) != oldval.v64); } - - /* - * By here, nobody meet any entries in this page - */ - if (gist_page != gist_base) + oldval.v64 = __shfl_sync(__activemask(), oldval.v64, 0); + row_id += oldval.i.nitems; + /* data store has no space? 
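/*
 * NOTE: the allocator below packs the two 32-bit counters (nitems, usage)
 * of kern_data_store into one 64-bit word, so a single atomicCAS updates
 * both consistently; when the would-be allocation exceeds kds->length it
 * bails out and lets the host suspend/resume the kernel. A generic sketch
 * of the idiom (illustrative only; it assumes 'usage' directly follows
 * 'nitems' in the structure, and 'add_usage' is already __kds_packed):
 */
INLINE_FUNCTION(bool)
__kds_alloc_sketch(kern_data_store *kds,
				   uint32_t add_nitems, uint32_t add_usage,
				   uint32_t *p_base_nitems, uint32_t *p_base_usage)
{
	union {
		struct { uint32_t nitems; uint32_t usage; } i;
		unsigned long long v64;
	} oldval, curval, newval;

	curval.i.nitems = kds->nitems;
	curval.i.usage  = kds->usage;
	do {
		newval = oldval = curval;
		newval.i.nitems += add_nitems;
		newval.i.usage  += add_usage;
		if (KDS_HEAD_LENGTH(kds) +
			MAXALIGN(sizeof(uint32_t) * newval.i.nitems) +
			__kds_unpack(newval.i.usage) > kds->length)
			return false;	/* no space left; caller suspends the kernel */
	} while ((curval.v64 = atomicCAS((unsigned long long *)&kds->nitems,
									 oldval.v64, newval.v64)) != oldval.v64);
	*p_base_nitems = oldval.i.nitems;	/* base of this allocation */
	*p_base_usage  = oldval.i.usage;
	return true;
}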
*/ + if (__any_sync(__activemask(), try_suspend)) { - /* pop up to the parent */ - BlockNumber blkno_next = gist_page->pd_parent_blkno; - - assert(blkno_next < kds_gist->nrooms); - start = gist_page->pd_parent_item + 1 + LaneId(); - gist_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_gist, blkno_next); - goto restart; + *p_try_suspend = true; + return -1; } - /* cannot pop up from the root page */ - assert(gist_page->pd_parent_blkno == InvalidBlockNumber && - gist_page->pd_parent_item == InvalidOffsetNumber); - *p_item_offset = UINT_MAX; - - return NULL; -} - -/* - * gpujoin_exec_gistindex - */ -STATIC_FUNCTION(cl_int) -gpujoin_exec_gistindex(kern_context *kcxt, - kern_gpujoin *kgjoin, - kern_multirels *kmrels, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - cl_int depth, - cl_uint *__rd_stack_base, - cl_uint *__wr_stack_base, - cl_uint *l_state, - cl_bool *matched) -{ - kern_data_store *kds_hash = KERN_MULTIRELS_INNER_KDS(kmrels, depth); - kern_data_store *kds_gist = KERN_MULTIRELS_GIST_INDEX(kmrels, depth); - cl_bool *oj_map = KERN_MULTIRELS_OUTER_JOIN_MAP(kmrels, depth); - cl_uint *wr_stack; - cl_uint *temp_stack; - cl_uint rd_index; - cl_uint wr_index; - cl_uint temp_index; - cl_uint count; - void *gist_keys; - cl_char *vlpos_saved_1 = kcxt->vlpos; - - assert(kds_hash->format == KDS_FORMAT_HASH); - assert(depth >= 1 && depth <= kgjoin->num_rels); - - if (__syncthreads_count(l_state[depth] != UINT_MAX && - l_state[depth] != 0) == 0 && - read_pos[depth-1] >= write_pos[depth-1]) + /* write out the tuple */ + if (tupsz > 0) { - if (write_pos[depth] + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) - { - cl_int __depth = gpujoin_rewind_stack(kgjoin, depth-1, - l_state, matched); - if (__depth >= base_depth) - return __depth; - } - /* flush if temporary index search results still remain */ - if (scan_done && temp_pos[depth] > 0) - goto bailout; - /* elsewhere, dive into the deeper depth or projection */ - return depth + 1; - } - __syncthreads(); + kern_tupitem *tupitem; -reload: - kcxt->vlpos = vlpos_saved_1; /* rewind */ - assert(__activemask() == ~0U); - if (__all_sync(__activemask(), l_state[depth] == UINT_MAX) || - __all_sync(__activemask(), l_state[depth] == 0)) - { - /* - * all the threads in warp reached in the tail of GiST-index tree, so move to - * the next index key. 
- */ - if (LaneId() == 0) - { - rd_index = atomicAdd(&read_pos[depth-1], 1); - gist_pos[depth * MAXWARPS_PER_BLOCK + get_local_id() / warpSize] = rd_index; - } - __syncwarp(~0U); - rd_index = __shfl_sync(__activemask(), rd_index, 0); - l_state[depth] = 0; + offset += __kds_unpack(oldval.i.usage); + KDS_GET_ROWINDEX(kds_dst)[row_id] = __kds_packed(offset); + tupitem = (kern_tupitem *) + ((char *)kds_dst + kds_dst->length - offset); + tupitem->rowid = row_id; + tupitem->t_len = kern_form_heaptuple(kcxt, + kexp_projection, + kds_dst, + &tupitem->htup); } - else + /* update the read position */ + if (LaneId() == 0) { - /* resume the index-key */ - rd_index = gist_pos[depth * MAXWARPS_PER_BLOCK + get_local_id() / warpSize]; + WARP_READ_POS(wp,n_rels) += count; + assert(WARP_WRITE_POS(wp,n_rels) >= WARP_READ_POS(wp,n_rels)); } - /* threads in a warp must load exactly same index-key */ - assert(rd_index == __shfl_sync(__activemask(), rd_index, 0)); - - if (rd_index < write_pos[depth-1]) + __syncwarp(); + if (wp->scan_done <= n_rels) { - cl_uint *rd_stack = __rd_stack_base + (rd_index * depth); - cl_char *vlpos_saved_2; - - gist_keys = gpujoin_gist_load_keys(kcxt, - kmrels, - kds_src, - kds_extra, - depth, - rd_stack); - assert(__activemask() == ~0U); - if (__any_sync(__activemask(), kcxt->errcode != 0)) - goto bailout; /* error */ - assert(gist_keys != NULL); - - /* - * MEMO: Cost to run gpujoin_gist_getnext highly depends on the key value. - * If key never matches any bounding-box, gpujoin_gist_getnext() returns - * immediately. If key matches some entries, thus walks down into the leaf - * of R-tree, it takes longer time than the above misshit cases. - * In case when individual warps have various execution time, in general, - * we should not put __syncthreads() because the warps that returned - * immediately from the gpujoin_gist_getnext() are blocked until completion - * of someone's R-tree index search. - * So, we don't put any __syncthreads() in the loop below. If a warp finished - * gpujoin_gist_getnext() very early, it can reload another index-key for - * the next search during the GiST-index search by the other warps/threads. - * If usage of temp_stack[] exceeds get_local_size(), all the warps move to - * the second phase to run gpujoin_join_quals(), because it means we can - * utilize all the core to evaluate Join quals in parallel; that is the most - * efficient way to run. 
- */ - vlpos_saved_2 = kcxt->vlpos; - do { - ItemPointerData *t_ctid; - cl_uint mask; - cl_uint t_off; - cl_uint l_next = l_state[depth]; - - t_ctid = gpujoin_gist_getnext(kcxt, - kgjoin, - depth, - kds_gist, - gist_keys, - &l_next); - assert(__activemask() == ~0U); - if (__any_sync(__activemask(), kcxt->errcode != 0)) - goto bailout; /* error */ - - mask = __ballot_sync(__activemask(), t_ctid != NULL); - count = __popc(mask); - if (LaneId() == 0) - temp_index = atomicAdd(&temp_pos[depth], count); - __syncwarp(~0U); - temp_index = __shfl_sync(__activemask(), temp_index, 0); - - if (temp_index + count > GPUJOIN_PSEUDO_STACK_NROOMS) - goto bailout; /* urgent flush; cannot write out all the results */ - temp_index += __popc(mask & ((1U << LaneId()) - 1)); - - if (t_ctid) - { - assert(t_ctid->ip_posid == USHRT_MAX); - t_off = (((cl_uint)t_ctid->ip_blkid.bi_hi << 16) | - ((cl_uint)t_ctid->ip_blkid.bi_lo)); - assert(temp_index < GPUJOIN_PSEUDO_STACK_NROOMS); - temp_stack = __wr_stack_base + - (depth+1) * (GPUJOIN_PSEUDO_STACK_NROOMS + temp_index); - memcpy(temp_stack, rd_stack, sizeof(cl_uint) * depth); - temp_stack[depth] = t_off; - assert(__kds_unpack(t_off) < kds_hash->length); - } - - if (LaneId() == 0) - atomicAdd(&stat_nitems2[depth], count); - __syncwarp(~0U); - l_state[depth] = l_next; - kcxt->vlpos = vlpos_saved_2; /* rewind */ - assert(__activemask() == ~0U); - } while (__any_sync(__activemask(), l_state[depth] != UINT_MAX)); - /* try to reload the next index-key, if temp_stack[] still has space. */ - assert(__activemask() == ~0U); - if (__shfl_sync(__activemask(), temp_pos[depth], 0) < get_local_size()) - goto reload; + if (WARP_WRITE_POS(wp,n_rels) < WARP_READ_POS(wp,n_rels) + warpSize) + return n_rels; /* back to the previous depth */ } else { - l_state[depth] = UINT_MAX; - } -bailout: - /* error checks */ - if (__syncthreads_count(kcxt->errcode != 0) > 0) - return -1; - - if (temp_pos[depth] >= (scan_done ? 
1 : get_local_size())) - { - temp_stack = NULL; - if (get_local_id() < temp_pos[depth]) - { - kern_tupitem *tupitem; - cl_bool joinquals_matched = false; - - temp_stack = __wr_stack_base + - (depth+1) * (GPUJOIN_PSEUDO_STACK_NROOMS + get_local_id()); - tupitem = (kern_tupitem *)((char *)kds_hash - + __kds_unpack(temp_stack[depth]) - - offsetof(kern_tupitem, htup)); - assert((char *)tupitem < (char *)kds_hash + kds_hash->length); - /* check join quals */ - if (gpujoin_join_quals(kcxt, - kds_src, - kds_extra, - kmrels, - depth, - temp_stack, - &tupitem->htup, - &joinquals_matched)) - { - assert(joinquals_matched); - /* No RIGHT JOIN are needed */ - assert(tupitem->rowid < kds_hash->nitems); - if (oj_map && !oj_map[tupitem->rowid]) - oj_map[tupitem->rowid] = true; - } - else - { - temp_stack = NULL; - } - } - - /* write out the result */ - wr_index = write_pos[depth]; - wr_index += pgstromStairlikeBinaryCount(temp_stack != NULL, &count); - if (get_local_id() == 0) - { - write_pos[depth] += count; - stat_nitems[depth] += count; - } - wr_stack = __wr_stack_base + (depth+1) * wr_index; - if (temp_stack) - memcpy(wr_stack, temp_stack, sizeof(cl_uint) * (depth+1)); - __syncthreads(); - - /* rewind the temp stack */ - if (get_local_id() == 0) - { - if (get_local_size() < temp_pos[depth]) - { - cl_uint remain = temp_pos[depth] - get_local_size(); - - temp_stack = __wr_stack_base + (depth+1) * GPUJOIN_PSEUDO_STACK_NROOMS; - memcpy(temp_stack, - temp_stack + (depth+1) * get_local_size(), - sizeof(cl_uint) * (depth+1) * remain); - temp_pos[depth] -= get_local_size(); - } - else - { - temp_pos[depth] = 0; - } - } + if (WARP_READ_POS(wp,n_rels) >= WARP_WRITE_POS(wp,n_rels)) + return -1; /* ok, end of GpuJoin */ } - /* count number of threads still in-progress */ - count = __syncthreads_count(l_state[depth] != UINT_MAX && - l_state[depth] != 0); - if (get_local_id() == 0) - wip_count[depth] = count; - - /* see comment in gpujoin_exec_hashjoin */ - wr_index = write_pos[depth]; - __syncthreads(); - if (wr_index + get_local_size() <= GPUJOIN_PSEUDO_STACK_NROOMS) - return depth; - return depth+1; + return n_rels + 1; /* elsewhere, try again? */ } -#define PSTACK_DEPTH(d) \ - ((d) >= 0 && (d) <= kgjoin->num_rels \ - ? 
(cl_uint *)((char *)pstack + pstack->ps_headsz + \ - get_group_id() * pstack->ps_unitsz + \ - pstack->ps_offset[(d)]) \ - : NULL) - /* - * gpujoin_main + * kern_gpujoin_main */ -DEVICE_FUNCTION(void) -gpujoin_main(kern_context *kcxt, - kern_gpujoin *kgjoin, - kern_multirels *kmrels, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_data_store *kds_dst, - kern_parambuf *kparams_gpreagg, /* only if combined GpuJoin */ - cl_uint *l_state, - cl_bool *matched) +KERNEL_FUNCTION(void) +kern_gpujoin_main(kern_session_info *session, + kern_gputask *kgtask, + kern_multirels *kmrels, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + kern_data_store *kds_dst) { - gpujoinPseudoStack *pstack = kgjoin->pstack; - cl_int max_depth = kgjoin->num_rels; - cl_int depth; - __shared__ cl_int depth_thread0 __attribute__((unused)); - - assert(kds_src->format == KDS_FORMAT_ROW || - kds_src->format == KDS_FORMAT_BLOCK || - kds_src->format == KDS_FORMAT_ARROW || - kds_src->format == KDS_FORMAT_COLUMN); - assert((kds_dst->format == KDS_FORMAT_ROW && kparams_gpreagg == NULL) || - (kds_dst->format == KDS_FORMAT_SLOT && kparams_gpreagg != NULL)); - - /* init per-depth context */ - if (get_local_id() == 0) - { - src_read_pos = UINT_MAX; - stat_source_nitems = 0; - memset(stat_nitems, 0, sizeof(cl_uint) * (max_depth+1)); - memset(stat_nitems2, 0, sizeof(cl_uint) * (max_depth+1)); - memset(wip_count, 0, sizeof(cl_uint) * (max_depth+1)); - memset(read_pos, 0, sizeof(cl_uint) * (max_depth+1)); - memset(write_pos, 0, sizeof(cl_uint) * (max_depth+1)); - memset(temp_pos, 0, sizeof(cl_uint) * (max_depth+1)); - memset(gist_pos, 0, sizeof(cl_uint) * (max_depth+1) * MAXWARPS_PER_BLOCK); - scan_done = false; - base_depth = 0; + kern_context *kcxt; + kern_warp_context *wp, *wp_saved; + char *kvars_addr_wp; + uint32_t kvars_chunksz; + uint32_t *l_state; + bool *matched; + uint32_t wp_base_sz; + uint32_t n_rels = (kmrels ? 
kmrels->num_rels : 0); + int depth; + __shared__ uint32_t smx_row_count; + + assert(kgtask->kvars_nslots == session->kcxt_kvars_nslots && + kgtask->kvars_nbytes == session->kcxt_kvars_nbytes && + kgtask->n_rels == n_rels); + /* setup execution context */ + INIT_KERNEL_CONTEXT(kcxt, session); + wp_base_sz = __KERN_WARP_CONTEXT_BASESZ(n_rels); + wp = (kern_warp_context *)SHARED_WORKMEM(wp_base_sz, get_local_id() / warpSize); + wp_saved = KERN_GPUTASK_WARP_CONTEXT(kgtask); + l_state = KERN_GPUTASK_LSTATE_ARRAY(kgtask); + matched = KERN_GPUTASK_MATCHED_ARRAY(kgtask); + kvars_chunksz = kcxt->kvars_nbytes * UNIT_TUPLES_PER_DEPTH; + kvars_addr_wp = (char *)wp_saved + wp_base_sz; + + if (kgtask->resume_context) + { + /* resume the warp-context from the previous execution */ + if (LaneId() == 0) + memcpy(wp, wp_saved, wp_base_sz); + if (get_local_id() == 0) + smx_row_count = wp->smx_row_count; + depth = __shfl_sync(__activemask(), wp->depth, 0); } - /* resume the per-depth context, if any */ - if (kgjoin->resume_context) - depth = gpujoin_resume_context(kgjoin, l_state, matched); else + { + /* zero clear the wp */ + if (LaneId() == 0) + memset(wp, 0, wp_base_sz); + if (get_local_id() == 0) + smx_row_count = 0; depth = 0; + if (l_state) + memset(l_state, 0, sizeof(void *) * kcxt->kvars_nslots); + if (matched) + memset(matched, 0, sizeof(bool) * kcxt->kvars_nslots); + } __syncthreads(); - + /* main logic of GpuJoin */ while (depth >= 0) { - /* rewind the varlena buffer */ - kcxt->vlpos = kcxt->vlbuf; + kcxt_reset(kcxt); if (depth == 0) { - /* LOAD FROM KDS_SRC (ROW/BLOCK/ARROW) */ - depth = gpujoin_load_source(kcxt, - kgjoin, - kds_src, - kds_extra, - PSTACK_DEPTH(depth), - l_state); - } - else if (depth > max_depth) - { - assert(depth == kmrels->nrels + 1); - if (kds_dst->format == KDS_FORMAT_ROW) - { - /* PROJECTION (ROW) */ - depth = gpujoin_projection_row(kcxt, - kgjoin, - kmrels, - kds_src, - kds_extra, - kds_dst, - PSTACK_DEPTH(kgjoin->num_rels), - l_state, - matched); - } - else - { - /* PROJECTION (SLOT) */ - depth = gpujoin_projection_slot(kcxt, - kparams_gpreagg, - kgjoin, - kmrels, - kds_src, - kds_extra, - kds_dst, - PSTACK_DEPTH(kgjoin->num_rels), - l_state, - matched); - } - } - else if (kmrels->chunks[depth-1].is_nestloop) - { - /* NEST-LOOP */ - depth = gpujoin_exec_nestloop(kcxt, - kgjoin, - kmrels, + /* LOAD FROM THE SOURCE */ + depth = execGpuScanLoadSource(kcxt, wp, kds_src, kds_extra, - depth, - PSTACK_DEPTH(depth-1), - PSTACK_DEPTH(depth), - l_state, - matched); + SESSION_KEXP_SCAN_LOAD_VARS(session), + SESSION_KEXP_SCAN_QUALS(session), + kvars_addr_wp, /* depth=0 */ + &smx_row_count); } - else if (kmrels->chunks[depth-1].gist_offset != 0) + else if (depth > n_rels) { - /* GiST-INDEX */ - depth = gpujoin_exec_gistindex(kcxt, - kgjoin, - kmrels, - kds_src, - kds_extra, - depth, - PSTACK_DEPTH(depth-1), - PSTACK_DEPTH(depth), - l_state, - matched); - } - else - { - /* HASH-JOIN */ - depth = gpujoin_exec_hashjoin(kcxt, - kgjoin, - kmrels, - kds_src, - kds_extra, - depth, - PSTACK_DEPTH(depth-1), - PSTACK_DEPTH(depth), - l_state, - matched); - } - if (get_local_id() == 0) - depth_thread0 = depth; - if (__syncthreads_count(kcxt->errcode) > 0) - return; - assert(depth_thread0 == depth); - } - - /* update statistics only if normal exit */ - if (depth == -1 && get_local_id() == 0) - { - gpujoinSuspendContext *sb - = KERN_GPUJOIN_SUSPEND_CONTEXT(kgjoin, get_group_id()); - sb->depth = -1; /* no more suspend/resume! 
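/*
 * NOTE: sketch of the suspend/resume protocol for the per-warp context
 * (illustrative only; the field names follow this patch). On suspend,
 * lane 0 copies the live shared-memory context back to its global-memory
 * image; on resume, the image is restored and the restart depth is
 * re-broadcast from lane 0 so every lane agrees on it.
 */
INLINE_FUNCTION(int)
__warp_context_resume(kern_warp_context *wp,			/* shared memory */
					  const kern_warp_context *wp_saved,	/* global memory */
					  uint32_t wp_base_sz)
{
	if (LaneId() == 0)
		memcpy(wp, wp_saved, wp_base_sz);
	__syncwarp();
	return __shfl_sync(__activemask(), wp->depth, 0);
}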
*/ - - atomicAdd(&kgjoin->source_nitems, stat_source_nitems); - atomicAdd(&kgjoin->outer_nitems, stat_nitems[0]); - for (int i=0; i <= max_depth; i++) - { - atomicAdd(&kgjoin->stat[i].nitems, stat_nitems[i+1]); - atomicAdd(&kgjoin->stat[i].nitems2, stat_nitems2[i+1]); - } - } -} + bool try_suspend = false; -/* - * gpujoin_collocate_outer_join_map - * - * it merges the result of other GPU devices and CPU fallback - */ -KERNEL_FUNCTION(void) -gpujoin_colocate_outer_join_map(kern_multirels *kmrels, - cl_uint num_devices) -{ - size_t nrooms = kmrels->ojmaps_length / sizeof(cl_uint); - cl_uint *ojmaps = (cl_uint *)((char *)kmrels + kmrels->kmrels_length); - cl_uint *destmap = ojmaps + kmrels->cuda_dindex * nrooms; - cl_uint i, j, map; - - for (i = get_global_id(); - i < nrooms; - i += get_global_size()) - { - map = 0; - for (j = 0; j <= num_devices; j++) - { - map |= ojmaps[i]; - ojmaps += nrooms; - } - destmap[i] = map; - } -} - -/* - * gpujoin_right_outer - */ -DEVICE_FUNCTION(void) -gpujoin_right_outer(kern_context *kcxt, - kern_gpujoin *kgjoin, - kern_multirels *kmrels, - cl_int outer_depth, - kern_data_store *kds_dst, - kern_parambuf *kparams_gpreagg, - cl_uint *l_state, - cl_bool *matched) -{ - gpujoinPseudoStack *pstack = kgjoin->pstack; - cl_int max_depth = kgjoin->num_rels; - cl_int depth; - __shared__ cl_int depth_thread0 __attribute__((unused)); - - assert(KERN_MULTIRELS_RIGHT_OUTER_JOIN(kmrels, outer_depth)); - assert((kds_dst->format == KDS_FORMAT_ROW && kparams_gpreagg == NULL) || - (kds_dst->format == KDS_FORMAT_SLOT && kparams_gpreagg != NULL)); - - /* setup per-depth context */ - memset(l_state, 0, sizeof(l_state)); - memset(matched, 0, sizeof(matched)); - if (get_local_id() == 0) - { - src_read_pos = UINT_MAX; - stat_source_nitems = 0; - memset(stat_nitems, 0, sizeof(cl_uint) * (max_depth+1)); - memset(stat_nitems2, 0, sizeof(cl_uint) * (max_depth+1)); - memset(wip_count, 0, sizeof(cl_uint) * (max_depth+1)); - memset(read_pos, 0, sizeof(cl_uint) * (max_depth+1)); - memset(write_pos, 0, sizeof(cl_uint) * (max_depth+1)); - memset(temp_pos, 0, sizeof(cl_uint) * (max_depth+1)); - memset(gist_pos, 0, sizeof(cl_uint) * (max_depth+1) * MAXWARPS_PER_BLOCK); - scan_done = false; - base_depth = outer_depth; - } - /* resume the per-depth context, if any */ - if (kgjoin->resume_context) - depth = gpujoin_resume_context(kgjoin, l_state, matched); - else - depth = outer_depth; - __syncthreads(); - - /* main logic of GpuJoin */ - while (depth >= outer_depth) - { - /* rewind the varlena buffer */ - kcxt->vlpos = kcxt->vlbuf; - if (depth == outer_depth) - { - /* makes RIGHT OUTER combinations using OUTER JOIN map */ - depth = gpujoin_load_outer(kcxt, - kgjoin, - kmrels, - outer_depth, - PSTACK_DEPTH(outer_depth), - l_state); - } - else if (depth > max_depth) - { - assert(depth == kmrels->nrels + 1); - if (kds_dst->format == KDS_FORMAT_ROW) + assert(depth == n_rels+1); + if (session->xpucode_projection) { - /* PROJECTION (ROW) */ - depth = gpujoin_projection_row(kcxt, - kgjoin, - kmrels, - NULL, - NULL, - kds_dst, - PSTACK_DEPTH(kgjoin->num_rels), - l_state, - matched); + /* PROJECTION */ + depth = execGpuJoinProjection(kcxt, wp, + n_rels, + kds_dst, + SESSION_KEXP_PROJECTION(session), + kvars_addr_wp + kvars_chunksz * n_rels, + &try_suspend); } else { - /* PROJECTION (SLOT) */ - depth = gpujoin_projection_slot(kcxt, - kparams_gpreagg, - kgjoin, - kmrels, - NULL, - NULL, - kds_dst, - PSTACK_DEPTH(kgjoin->num_rels), - l_state, - matched); + /* PRE-AGG */ + depth = execGpuPreAggGroupBy(kcxt, wp, 
+ n_rels,
+ kds_dst,
+ kvars_addr_wp + kvars_chunksz * n_rels,
+ &try_suspend);
+ }
+ if (__any_sync(__activemask(), try_suspend))
+ {
+ if (LaneId() == 0)
+ atomicAdd(&kgtask->suspend_count, 1);
+ assert(depth < 0);
+ }
}
else if (kmrels->chunks[depth-1].is_nestloop)
{
/* NEST-LOOP */
- depth = gpujoin_exec_nestloop(kcxt, - kgjoin, - kmrels, - NULL, - NULL, - depth, - PSTACK_DEPTH(depth-1), - PSTACK_DEPTH(depth), - l_state, - matched);
+ depth = execGpuJoinNestLoop(kcxt, wp, + kmrels, + depth, + kvars_addr_wp + kvars_chunksz * (depth-1), + kvars_addr_wp + kvars_chunksz * depth, + l_state[depth-1], /* call by reference */ + matched[depth-1]); /* call by reference */
}
- else if (kmrels->chunks[depth-1].gist_offset)
+#if 0
+ else if (kmrels->chunks[depth-1].gist_offset != 0)
{
- /* GiST-INDEX */
- depth = gpujoin_exec_gistindex(kcxt, - kgjoin, - kmrels, - NULL, - NULL, - depth, - PSTACK_DEPTH(depth-1), - PSTACK_DEPTH(depth), - l_state, - matched);
+ /* GiST-INDEX-JOIN */
+ depth = execGpuJoinGiSTJoin(kcxt, wp, ...);
}
+#endif
else
{
/* HASH-JOIN */
- depth = gpujoin_exec_hashjoin(kcxt, - kgjoin, - kmrels, - NULL, - NULL, - depth, - PSTACK_DEPTH(depth-1), - PSTACK_DEPTH(depth), - l_state, - matched);
- }
- if (get_local_id() == 0)
- depth_thread0 = depth;
- if (__syncthreads_count(kcxt->errcode) > 0)
- return;
- assert(depth == depth_thread0);
+ depth = execGpuJoinHashJoin(kcxt, wp, + kmrels, + depth, + kvars_addr_wp + kvars_chunksz * (depth-1), + kvars_addr_wp + kvars_chunksz * depth, + l_state[depth-1], /* call by reference */ + matched[depth-1]); /* call by reference */
+ }
+ assert(__shfl_sync(__activemask(), depth, 0) == depth);
+ /* bailout if any error status */
+ if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS))
+ break;
}
-
- /* write out statistics */
- if (get_local_id() == 0)
+ __syncthreads();
+ /* suspend the execution context */
+ if (LaneId() == 0)
{
- gpujoinSuspendContext *sb - = KERN_GPUJOIN_SUSPEND_CONTEXT(kgjoin, get_group_id());
- sb->depth = -1; /* no more suspend/resume! */
-
- assert(stat_source_nitems == 0);
- assert(stat_nitems[0] == 0);
- for (int i=outer_depth; i <= max_depth; i++)
- {
- atomicAdd(&kgjoin->stat[i-1].nitems, stat_nitems[i]);
- atomicAdd(&kgjoin->stat[i-1].nitems2, stat_nitems2[i]);
- }
+ wp->depth = depth;
+ wp->smx_row_count = smx_row_count;
+ memcpy(wp_saved, wp, wp_base_sz);
}
- __syncthreads();
+ STROM_WRITEBACK_ERROR_STATUS(&kgtask->kerror, kcxt);
}
diff --git a/src/cuda_gpupreagg.cu b/src/cuda_gpupreagg.cu
index f18dc2666..d0a33814c 100644
--- a/src/cuda_gpupreagg.cu
+++ b/src/cuda_gpupreagg.cu
@@ -1,1773 +1,1750 @@
/*
- * cuda_gpupreagg.h
+ * cuda_gpupreagg.cu
*
- * Preprocess of aggregate using GPU acceleration, to reduce number of
- * rows to be processed by CPU; including the Sort reduction.
- * --
- * Copyright 2011-2021 (C) KaiGai Kohei
- * Copyright 2014-2021 (C) PG-Strom Developers Team
+ * Device implementation of GpuPreAgg
+ * ----
+ * Copyright 2011-2023 (C) KaiGai Kohei
+ * Copyright 2014-2023 (C) PG-Strom Developers Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the PostgreSQL License.
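/*
 * NOTE: CUDA provides no native atomicMin/atomicMax for double, so the
 * helpers defined below emulate them with an atomicCAS retry loop over
 * the 64-bit image of the value. A usage sketch (illustrative only;
 * float8_t is this codebase's typedef for double, and get_global_id()
 * is assumed to be the usual global-thread-index helper):
 */
KERNEL_FUNCTION(void)
__fold_min_example(const float8_t *values, uint32_t nitems, float8_t *result)
{
	uint32_t	index = get_global_id();

	if (index < nitems)
		__atomic_min_fp64(result, values[index]);	/* retries until stable */
}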
*/ #include "cuda_common.h" -#include "cuda_gpupreagg.h" -#include "cuda_postgis.h" +#include "float2.h" /* - * common portion for gpupreagg_setup_* + * Atomic operations */ -STATIC_FUNCTION(bool) -gpupreagg_setup_common(kern_context *kcxt, - kern_gpupreagg *kgpreagg, - kern_data_store *kds_src, - kern_data_store *kds_slot, - cl_uint nvalids, - cl_uint slot_index, - cl_char *tup_dclass, - Datum *tup_values, - cl_int *tup_extra) +INLINE_FUNCTION(uint32_t) +__atomic_write_uint32(uint32_t *ptr, uint32_t ival) { - cl_uint offset; - cl_uint required; - cl_uint extra_sz = 0; - cl_bool suspend_kernel = false; - __shared__ cl_uint nitems_base; - __shared__ cl_uint extra_base; + return atomicExch((unsigned int *)ptr, ival); +} - /* - * calculation of the required extra buffer - */ - if (slot_index != UINT_MAX) +INLINE_FUNCTION(uint64_t) +__atomic_write_uint64(uint64_t *ptr, uint64_t ival) +{ + return atomicExch((unsigned long long int *)ptr, ival); +} + +INLINE_FUNCTION(uint32_t) +__atomic_add_uint32(uint32_t *ptr, uint32_t ival) +{ + return atomicAdd((unsigned int *)ptr, (unsigned int)ival); +} + +INLINE_FUNCTION(uint64_t) +__atomic_add_uint64(uint64_t *ptr, uint64_t ival) +{ + return atomicAdd((unsigned long long *)ptr, (unsigned long long)ival); +} + +INLINE_FUNCTION(int64_t) +__atomic_add_int64(int64_t *ptr, int64_t ival) +{ + return atomicAdd((unsigned long long int *)ptr, (unsigned long long int)ival); +} + +INLINE_FUNCTION(float8_t) +__atomic_add_fp64(float8_t *ptr, float8_t fval) +{ + return atomicAdd((double *)ptr, (double)fval); +} + +INLINE_FUNCTION(int64_t) +__atomic_min_int64(int64_t *ptr, int64_t ival) +{ + return atomicMin((long long int *)ptr, (long long int)ival); +} + +INLINE_FUNCTION(int64_t) +__atomic_max_int64(int64_t *ptr, int64_t ival) +{ + return atomicMax((long long int *)ptr, (long long int)ival); +} + +INLINE_FUNCTION(float8_t) +__atomic_min_fp64(float8_t *ptr, float8_t fval) +{ + union { + unsigned long long ival; + float8_t fval; + } oldval, curval, newval; + + newval.fval = fval; + curval.fval = __volatileRead(ptr); + while (newval.fval < curval.fval) { - if (kds_slot->ncols > 0) - memset(tup_extra, 0, sizeof(cl_int) * kds_slot->ncols); + oldval = curval; + curval.ival = atomicCAS((unsigned long long *)ptr, + oldval.ival, + newval.ival); + if (curval.ival == oldval.ival) + break; + } + return curval.fval; +} - for (int j=0; j < kds_slot->ncols; j++) - { - kern_colmeta *cmeta = &kds_slot->colmeta[j]; - cl_char dclass = tup_dclass[j]; - cl_char *addr; +INLINE_FUNCTION(float8_t) +__atomic_max_fp64(float8_t *ptr, float8_t fval) +{ + union { + unsigned long long ival; + float8_t fval; + } oldval, curval, newval; + + newval.fval = fval; + curval.fval = __volatileRead(ptr); + while (newval.fval > curval.fval) + { + oldval = curval; + curval.ival = atomicCAS((unsigned long long *)ptr, + oldval.ival, + newval.ival); + if (curval.ival == oldval.ival) + break; + } + return curval.fval; +} - if (dclass == DATUM_CLASS__NULL) - continue; - if (cmeta->attbyval) +INLINE_FUNCTION(uint32_t) +__atomic_cas_uint32(uint32_t *ptr, uint32_t comp, uint32_t newval) +{ + return atomicCAS((unsigned int *)ptr, + (unsigned int)comp, + (unsigned int)newval); +} + +INLINE_FUNCTION(uint64_t) +__atomic_cas_uint64(uint64_t *ptr, uint64_t comp, uint64_t newval) +{ + return atomicCAS((unsigned long long int *)ptr, + (unsigned long long int)comp, + (unsigned long long int)newval); +} + +/* + * __writeOutOneTuplePreAgg + */ +STATIC_FUNCTION(int32_t) +__writeOutOneTupleGroupKey(kern_context *kcxt, + 
kern_colmeta *cmeta, + kern_aggregate_desc *desc, + char *buffer) +{ + kern_variable *kvar; + int vclass; + int32_t nbytes; + + assert(desc->action == KAGG_ACTION__VREF && + desc->arg0_slot_id >= 0 && + desc->arg0_slot_id < kcxt->kvars_nslots); + vclass = kcxt->kvars_class[desc->arg0_slot_id]; + kvar = &kcxt->kvars_slot[desc->arg0_slot_id]; + switch (vclass) + { + case KVAR_CLASS__NULL: + return 0; + + case KVAR_CLASS__INLINE: + assert(cmeta->attlen >= 0 && + cmeta->attlen <= sizeof(kern_variable)); + if (buffer) + memcpy(buffer, kvar, cmeta->attlen); + return cmeta->attlen; + + case KVAR_CLASS__VARLENA: + assert(cmeta->attlen == -1); + nbytes = VARSIZE_ANY(kvar->ptr); + if (buffer) + memcpy(buffer, kvar->ptr, nbytes); + return nbytes; + + case KVAR_CLASS__XPU_DATUM: { - assert(dclass == DATUM_CLASS__NORMAL); - continue; + xpu_datum_t *xdatum = (xpu_datum_t *) + ((char *)kcxt->kvars_slot + kvar->xpu.offset); + const xpu_datum_operators *expr_ops = xdatum->expr_ops; + + if (XPU_DATUM_ISNULL(xdatum)) + return 0; + assert(expr_ops->xpu_type_code == kvar->xpu.type_code); + return expr_ops->xpu_datum_write(kcxt, buffer, xdatum); } - if (cmeta->attlen > 0) + + default: + if (vclass < 0) + return -1; + if (cmeta->attlen >= 0) { - assert(dclass == DATUM_CLASS__NORMAL); - addr = DatumGetPointer(tup_values[j]); - if (addr < (char *)kds_src || - addr >= (char *)kds_src + kds_src->length) + if (buffer) + { + nbytes = Min(vclass, cmeta->attlen); + memcpy(buffer, kvar->ptr, nbytes); + if (nbytes < cmeta->attlen) + memset(buffer + nbytes, 0, cmeta->attlen - nbytes); + } + return cmeta->attlen; + } + else if (cmeta->attlen == -1) + { + nbytes = VARHDRSZ + vclass; + if (buffer) { - tup_extra[j] = cmeta->attlen; - extra_sz += MAXALIGN(cmeta->attlen); + memcpy(buffer+VARHDRSZ, kvar->ptr, vclass); + SET_VARSIZE(buffer, nbytes); } + return nbytes; } - else - { - /* - * NOTE: DATUM_CLASS__* that is not NORMAL only happen when - * Var-node references the kds_src buffer which is not - * a normal heap-tuple (Apache Arrow). So, it is sufficient - * to copy only pg_varlena_t or pg_array_t according to the - * datum class. Unlike gpupreagg_final_data_move(), kds_src - * buffer shall be valid until reduction steps. 
- */ - assert(cmeta->attlen == -1); - switch (dclass) + } + return -1; +} + +STATIC_FUNCTION(int32_t) +__writeOutOneTuplePreAgg(kern_context *kcxt, + kern_data_store *kds_final, + HeapTupleHeaderData *htup, + kern_expression *kexp_actions) +{ + int nattrs = Min(kds_final->ncols, kexp_actions->u.pagg.nattrs); + uint32_t t_hoff, t_next; + uint16_t t_infomask = HEAP_HASNULL; + char *buffer = NULL; + + t_hoff = MAXALIGN(offsetof(HeapTupleHeaderData, + t_bits) + BITMAPLEN(nattrs)); + if (htup) + { + memset(htup, 0, t_hoff); + htup->t_choice.t_datum.datum_typmod = kds_final->tdtypmod; + htup->t_choice.t_datum.datum_typeid = kds_final->tdtypeid; + htup->t_ctid.ip_blkid.bi_hi = 0xffff; /* InvalidBlockNumber */ + htup->t_ctid.ip_blkid.bi_lo = 0xffff; + htup->t_ctid.ip_posid = 0; /* InvalidOffsetNumber */ + htup->t_infomask2 = (nattrs & HEAP_NATTS_MASK); + htup->t_hoff = t_hoff; + } + /* walk on the columns */ + for (int j=0; j < nattrs; j++) + { + kern_aggregate_desc *desc = &kexp_actions->u.pagg.desc[j]; + kern_colmeta *cmeta = &kds_final->colmeta[j]; + int nbytes; + + assert((char *)cmeta > (char *)kds_final && + (char *)cmeta < (char *)kds_final + kds_final->length); + assert(cmeta->attalign > 0 && cmeta->attalign <= 8); + t_next = TYPEALIGN(cmeta->attalign, t_hoff); + if (htup) + { + if (t_next > t_hoff) + memset((char *)htup + t_hoff, 0, t_next - t_hoff); + buffer = (char *)htup + t_next; + } + + switch (desc->action) + { + case KAGG_ACTION__VREF: + nbytes = __writeOutOneTupleGroupKey(kcxt, cmeta, desc, buffer); + if (nbytes < 0) + return -1; + break; + + case KAGG_ACTION__NROWS_ANY: + case KAGG_ACTION__NROWS_COND: + case KAGG_ACTION__PSUM_INT: + nbytes = sizeof(int64_t); + if (buffer) + *((int64_t *)buffer) = 0; + break; + + case KAGG_ACTION__PSUM_FP: + nbytes = sizeof(float8_t); + if (buffer) + *((float8_t *)buffer) = 0.0; + break; + + case KAGG_ACTION__PMIN_INT: + nbytes = sizeof(kagg_state__pminmax_int64_packed); + if (buffer) { - case DATUM_CLASS__VARLENA: - tup_extra[j] = sizeof(pg_varlena_t); - extra_sz += MAXALIGN(sizeof(pg_varlena_t)); - break; - case DATUM_CLASS__ARRAY: - tup_extra[j] = sizeof(pg_array_t); - extra_sz += MAXALIGN(sizeof(pg_array_t)); - break; - case DATUM_CLASS__COMPOSITE: - tup_extra[j] = sizeof(pg_composite_t); - extra_sz += MAXALIGN(sizeof(pg_composite_t)); - break; - case DATUM_CLASS__GEOMETRY: - tup_extra[j] = sizeof(pg_geometry_t); - extra_sz += MAXALIGN(sizeof(pg_geometry_t)); - break; - default: - assert(dclass == DATUM_CLASS__NORMAL); - addr = DatumGetPointer(tup_values[j]); - if (addr < (char *)kds_src || - addr >= (char *)kds_src + kds_src->length) - { - tup_extra[j] = VARSIZE_ANY(addr); - extra_sz += MAXALIGN(VARSIZE_ANY(addr)); - } - break; + kagg_state__pminmax_int64_packed *r = + (kagg_state__pminmax_int64_packed *)buffer; + r->nitems = 0; + r->value = LONG_MAX; + SET_VARSIZE(r, sizeof(kagg_state__pminmax_int64_packed)); + } + t_infomask |= HEAP_HASVARWIDTH; + break; + + case KAGG_ACTION__PMAX_INT: + nbytes = sizeof(kagg_state__pminmax_int64_packed); + if (buffer) + { + kagg_state__pminmax_int64_packed *r = + (kagg_state__pminmax_int64_packed *)buffer; + r->nitems = 0; + r->value = LONG_MIN; + SET_VARSIZE(r, sizeof(kagg_state__pminmax_int64_packed)); + } + t_infomask |= HEAP_HASVARWIDTH; + break; + + case KAGG_ACTION__PMIN_FP: + nbytes = sizeof(kagg_state__pminmax_fp64_packed); + if (buffer) + { + kagg_state__pminmax_fp64_packed *r = + (kagg_state__pminmax_fp64_packed *)buffer; + r->nitems = 0; + r->value = DBL_MAX; + SET_VARSIZE(r, 
sizeof(kagg_state__pminmax_fp64_packed)); } - } + t_infomask |= HEAP_HASVARWIDTH; + break; + + case KAGG_ACTION__PMAX_FP: + nbytes = sizeof(kagg_state__pminmax_fp64_packed); + if (buffer) + { + kagg_state__pminmax_fp64_packed *r = + (kagg_state__pminmax_fp64_packed *)buffer; + r->nitems = 0; + r->value = -DBL_MAX; + SET_VARSIZE(r, sizeof(kagg_state__pminmax_fp64_packed)); + } + t_infomask |= HEAP_HASVARWIDTH; + break; + + case KAGG_ACTION__PAVG_INT: + nbytes = sizeof(kagg_state__pavg_int_packed); + if (buffer) + { + memset(buffer, 0, sizeof(kagg_state__pavg_int_packed)); + SET_VARSIZE(buffer, sizeof(kagg_state__pavg_int_packed)); + } + t_infomask |= HEAP_HASVARWIDTH; + break; + + case KAGG_ACTION__PAVG_FP: + nbytes = sizeof(kagg_state__pavg_fp_packed); + if (buffer) + { + memset(buffer, 0, sizeof(kagg_state__pavg_fp_packed)); + SET_VARSIZE(buffer, sizeof(kagg_state__pavg_fp_packed)); + } + t_infomask |= HEAP_HASVARWIDTH; + break; + + case KAGG_ACTION__STDDEV: + nbytes = sizeof(kagg_state__stddev_packed); + if (buffer) + { + memset(buffer, 0, sizeof(kagg_state__stddev_packed)); + SET_VARSIZE(buffer, sizeof(kagg_state__stddev_packed)); + } + t_infomask |= HEAP_HASVARWIDTH; + break; + + case KAGG_ACTION__COVAR: + nbytes = sizeof(kagg_state__covar_packed); + if (buffer) + { + memset(buffer, 0, sizeof(kagg_state__covar_packed)); + SET_VARSIZE(buffer, sizeof(kagg_state__covar_packed)); + } + t_infomask |= HEAP_HASVARWIDTH; + break; + + default: + STROM_ELOG(kcxt, "unknown xpuPreAgg action"); + return -1; } + if (htup && nbytes > 0) + htup->t_bits[j>>3] |= (1<<(j&7)); + t_hoff = t_next + nbytes; } - /* - * allocation of extra buffer for indirect/varlena values - */ - offset = pgstromStairlikeSum(extra_sz, &required); - if (get_local_id() == 0) + if (htup) + htup->t_infomask = t_infomask; + return t_hoff; +} + +/* + * __update_nogroups__nrows_any + */ +INLINE_FUNCTION(void) +__update_nogroups__nrows_any(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) +{ + uint32_t mask; + + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (LaneId() == 0) + __atomic_add_uint64((uint64_t *)buffer, __popc(mask)); +} + +/* + * __update_nogroups__nrows_cond + */ +INLINE_FUNCTION(void) +__update_nogroups__nrows_cond(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) +{ + uint32_t mask; + + if (kvars_is_valid) { - union { - struct { - cl_uint nitems; - cl_uint usage; - } i; - cl_ulong v64; - } oldval, curval, newval; - - curval.i.nitems = kds_slot->nitems; - curval.i.usage = kds_slot->usage; - do { - newval = oldval = curval; - newval.i.nitems += nvalids; - newval.i.usage += __kds_packed(required); - if (KERN_DATA_STORE_SLOT_LENGTH(kds_slot, newval.i.nitems) + - __kds_unpack(newval.i.usage) > kds_slot->length) - { - suspend_kernel = true; - atomicAdd(&kgpreagg->suspend_count, 1); - break; - } - } while((curval.v64 = atomicCAS((cl_ulong *)&kds_slot->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - nitems_base = oldval.i.nitems; - extra_base = __kds_unpack(oldval.i.usage); + if (kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__NULL) + kvars_is_valid = false; } - if (__syncthreads_count(suspend_kernel) > 0) - return false; + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (LaneId() == 0) + __atomic_add_uint64((uint64_t *)buffer, __popc(mask)); +} - if (slot_index != UINT_MAX)
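The __update_nogroups__ helpers that follow all share one idiom: each lane loads its input (or the identity value of the aggregate when the source is NULL), __ballot_sync() records which lanes actually contributed, a five-step __shfl_xor_sync() butterfly folds the 32 per-lane values together, and lane 0 alone touches the shared transition state with a single atomic. The standalone kernel below is a minimal sketch of that idiom, not part of the patch; the names atomic_min_fp64 and warp_min_kernel are illustrative, and it assumes blockDim.x is a multiple of 32. It also spells out the compare-and-swap loop that __atomic_min_fp64() above relies on, since CUDA offers no native atomicMin() for double.

#include <cstdint>
#include <cfloat>

/* emulate atomicMin(double *) with compare-and-swap on the bit pattern */
__device__ double atomic_min_fp64(double *ptr, double fval)
{
    unsigned long long cur = __double_as_longlong(*ptr);

    while (fval < __longlong_as_double(cur))
    {
        unsigned long long old = cur;

        cur = atomicCAS((unsigned long long *)ptr, old,
                        __double_as_longlong(fval));
        if (cur == old)
            break;      /* nobody raced us; our value is now stored */
    }
    return __longlong_as_double(cur);
}

/* each warp folds its 32 candidates into lane 0; lane 0 issues one atomic */
__global__ void warp_min_kernel(const double *src, int nitems, double *result)
{
    int      id    = blockIdx.x * blockDim.x + threadIdx.x;
    bool     valid = (id < nitems);
    uint32_t mask  = __ballot_sync(0xffffffffU, valid);
    double   fval  = valid ? src[id] : DBL_MAX;   /* identity for MIN */

    /* XOR butterfly: after 5 steps every lane holds the warp-wide minimum */
    fval = fmin(fval, __shfl_xor_sync(0xffffffffU, fval, 0x0001));
    fval = fmin(fval, __shfl_xor_sync(0xffffffffU, fval, 0x0002));
    fval = fmin(fval, __shfl_xor_sync(0xffffffffU, fval, 0x0004));
    fval = fmin(fval, __shfl_xor_sync(0xffffffffU, fval, 0x0008));
    fval = fmin(fval, __shfl_xor_sync(0xffffffffU, fval, 0x0010));

    if ((threadIdx.x & 31) == 0 && mask != 0)
        atomic_min_fp64(result, fval);
}

The identity value (DBL_MAX for MIN, -DBL_MAX for MAX, 0 for SUM) is what lets NULL lanes join the butterfly without affecting the result.

+/* + * __update_nogroups__pmin_int + */ +INLINE_FUNCTION(void)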
+__update_nogroups__pmin_int(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) +{ + int64_t ival = LONG_MAX; + uint32_t mask; + + if (kvars_is_valid) { - assert(slot_index < nvalids); - slot_index += nitems_base; - /* - * Fixup pointers if needed. Please note that any variables on - * kcxt->vlbuf is not visible to other threads. - */ - if (extra_sz > 0) + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; + + if (vclass == KVAR_CLASS__INLINE) + ival = kcxt->kvars_slot[slot_id].i64; + else { - char *extra_pos - = (char *)kds_slot + kds_slot->length - - (extra_base + required) + offset; + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; + } + } + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (mask != 0) + { + kagg_state__pminmax_int64_packed *r = + (kagg_state__pminmax_int64_packed *)buffer; - for (int j=0; j < kds_slot->ncols; j++) - { - if (tup_extra[j] == 0) - continue; - memcpy(extra_pos, - DatumGetPointer(tup_values[j]), - tup_extra[j]); - tup_values[j] = PointerGetDatum(extra_pos); - extra_pos += MAXALIGN(tup_extra[j]); - } + ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0001)); + ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0002)); + ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0004)); + ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0008)); + ival = Min(ival, __shfl_xor_sync(__activemask(), ival, 0x0010)); + + if (LaneId() == 0) + { + __atomic_add_uint32(&r->nitems, __popc(mask)); + __atomic_min_int64(&r->value, ival); } - memcpy(KERN_DATA_STORE_VALUES(kds_slot, slot_index), - tup_values, sizeof(Datum) * kds_slot->ncols); - memcpy(KERN_DATA_STORE_DCLASS(kds_slot, slot_index), - tup_dclass, sizeof(cl_char) * kds_slot->ncols); } - return true; } /* - * gpupreagg_setup_row + * __update_nogroups__pmax_int */ -DEVICE_FUNCTION(void) -gpupreagg_setup_row(kern_context *kcxt, - kern_gpupreagg *kgpreagg, - kern_data_store *kds_src, /* in: KDS_FORMAT_ROW */ - kern_data_store *kds_slot) /* out: KDS_FORMAT_SLOT */ +INLINE_FUNCTION(void) +__update_nogroups__pmax_int(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) { - cl_uint src_nitems = __ldg(&kds_src->nitems); - cl_uint src_base; - cl_uint src_index; - cl_uint slot_index; - cl_uint count; - cl_uint nvalids; - cl_char *vlbuf_base; - cl_char *tup_dclass; - Datum *tup_values; - cl_int *tup_extra; - kern_tupitem *tupitem; - gpupreaggSuspendContext *my_suspend; - cl_bool rc; + int64_t ival = LONG_MIN; + uint32_t mask; - assert(kds_src->format == KDS_FORMAT_ROW && - kds_slot->format == KDS_FORMAT_SLOT); + if (kvars_is_valid) + { + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; - /* resume kernel from the point where suspended, if any */ - my_suspend = KERN_GPUPREAGG_SUSPEND_CONTEXT(kgpreagg, get_group_id()); - if (kgpreagg->resume_context) - src_base = my_suspend->r.src_base; - else - src_base = get_global_base(); - __syncthreads(); - - tup_dclass = (cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_slot->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_slot->ncols); - tup_extra = (cl_int *) - kern_context_alloc(kcxt, sizeof(cl_int) * kds_slot->ncols); - if (!tup_dclass || !tup_values || !tup_extra) - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); - /* bailout if any errors */ - if (__syncthreads_count(kcxt->errcode) > 0) - goto skip; - vlbuf_base = 
kcxt->vlpos; - - while (src_base < src_nitems) + if (vclass == KVAR_CLASS__INLINE) + ival = kcxt->kvars_slot[slot_id].i64; + else + { + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; + } + } + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (mask != 0) { - kcxt->vlpos = vlbuf_base; /* rewind */ - src_index = src_base + get_local_id(); - if (src_index < src_nitems) + kagg_state__pminmax_int64_packed *r = + (kagg_state__pminmax_int64_packed *)buffer; + + ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0001)); + ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0002)); + ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0004)); + ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0008)); + ival = Max(ival, __shfl_xor_sync(__activemask(), ival, 0x0010)); + + if (LaneId() == 0) { - tupitem = KERN_DATA_STORE_TUPITEM(kds_src, src_index); - rc = gpupreagg_quals_eval(kcxt, kds_src, - &tupitem->htup.t_ctid, - &tupitem->htup); - kcxt->vlpos = vlbuf_base; /* rewind */ + __atomic_add_uint32(&r->nitems, __popc(mask)); + __atomic_max_int64(&r->value, ival); } + } +} + +/* + * __update_nogroups__pmin_fp + */ +INLINE_FUNCTION(void) +__update_nogroups__pmin_fp(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) +{ + float8_t fval = DBL_MAX; + uint32_t mask; + + if (kvars_is_valid) + { + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; + + if (vclass == KVAR_CLASS__INLINE) + fval = kcxt->kvars_slot[slot_id].fp64; else { - tupitem = NULL; - rc = false; + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; } - /* bailout if any errors on gpupreagg_quals_eval */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* allocation of kds_slot buffer, if any */ - slot_index = pgstromStairlikeBinaryCount(rc, &nvalids); - if (nvalids > 0) + } + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (mask != 0) + { + kagg_state__pminmax_fp64_packed *r = + (kagg_state__pminmax_fp64_packed *)buffer; + + fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0001)); + fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0002)); + fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0004)); + fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0008)); + fval = Min(fval, __shfl_xor_sync(__activemask(), fval, 0x0010)); + + if (LaneId() == 0) { - if (rc) - { - assert(tupitem != NULL); - gpupreagg_projection_row(kcxt, - kds_src, - &tupitem->htup, - tup_dclass, - tup_values); - } - /* bailout if any errors */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; + __atomic_add_uint32(&r->nitems, __popc(mask)); + __atomic_min_fp64(&r->value, fval); + } + } +} - if (!gpupreagg_setup_common(kcxt, - kgpreagg, - kds_src, - kds_slot, - nvalids, - rc ? 
slot_index : UINT_MAX, - tup_dclass, - tup_values, - tup_extra)) - break; +/* + * __update_nogroups__pmax_fp + */ +INLINE_FUNCTION(void) +__update_nogroups__pmax_fp(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) +{ + float8_t fval = -DBL_MAX; + uint32_t mask; + + if (kvars_is_valid) + { + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; + + if (vclass == KVAR_CLASS__INLINE) + fval = kcxt->kvars_slot[slot_id].fp64; + else + { + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; } - /* update statistics */ - count = __syncthreads_count(tupitem != NULL); - if (get_local_id() == 0) + } + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (mask != 0) + { + kagg_state__pminmax_fp64_packed *r = + (kagg_state__pminmax_fp64_packed *)buffer; + + fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0001)); + fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0002)); + fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0004)); + fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0008)); + fval = Max(fval, __shfl_xor_sync(__activemask(), fval, 0x0010)); + + if (LaneId() == 0) { - atomicAdd(&kgpreagg->nitems_real, count); - atomicAdd(&kgpreagg->nitems_filtered, count - nvalids); + __atomic_add_uint32(&r->nitems, __popc(mask)); + __atomic_max_fp64(&r->value, fval); } - /* move to the next window */ - src_base += get_global_size(); } -skip: - /* save the current execution context */ - if (get_local_id() == 0) - my_suspend->r.src_base = src_base; } -DEVICE_FUNCTION(void) -gpupreagg_setup_block(kern_context *kcxt, - kern_gpupreagg *kgpreagg, - kern_data_store *kds_src, - kern_data_store *kds_slot) +/* + * __update_nogroups__psum_int + */ +INLINE_FUNCTION(void) +__update_nogroups__psum_int(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) { - cl_uint window_sz; - cl_uint part_sz; - cl_uint n_parts; - cl_uint count; - cl_uint part_index = 0; - cl_uint line_index = 0; - cl_bool thread_is_valid = false; - cl_char *vlbuf_base; - cl_char *tup_dclass; - Datum *tup_values; - cl_int *tup_extra; - gpupreaggSuspendContext *my_suspend; - - assert(kds_src->format == KDS_FORMAT_BLOCK && - kds_slot->format == KDS_FORMAT_SLOT); - - part_sz = Min((kds_src->nrows_per_block + - warpSize-1) & ~(warpSize-1), get_local_size()); - n_parts = get_local_size() / part_sz; - if (get_local_id() < part_sz * n_parts) - thread_is_valid = true; - window_sz = n_parts * get_num_groups(); - - /* resume kernel from the point where suspended, if any */ - my_suspend = KERN_GPUPREAGG_SUSPEND_CONTEXT(kgpreagg, get_group_id()); - if (kgpreagg->resume_context) + int64_t ival = 0; + + if (kvars_is_valid) { - part_index = my_suspend->b.part_index; - line_index = my_suspend->b.line_index; + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; + + if (vclass == KVAR_CLASS__INLINE) + ival = kcxt->kvars_slot[slot_id].i64; + else + { + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; + } + } + if (__any_sync(__activemask(), kvars_is_valid)) + { + ival += __shfl_xor_sync(__activemask(), ival, 0x0001); + ival += __shfl_xor_sync(__activemask(), ival, 0x0002); + ival += __shfl_xor_sync(__activemask(), ival, 0x0004); + ival += __shfl_xor_sync(__activemask(), ival, 0x0008); + ival += __shfl_xor_sync(__activemask(), ival, 0x0010); + if (LaneId() == 0) + __atomic_add_int64((int64_t *)buffer, ival); } - __syncthreads(); - - tup_dclass = 
(cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_slot->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_slot->ncols); - tup_extra = (cl_int *) - kern_context_alloc(kcxt, sizeof(cl_int) * kds_slot->ncols); - if (!tup_dclass || !tup_values || !tup_extra) - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); - /* bailout if any errors */ - if (__syncthreads_count(kcxt->errcode) > 0) - goto out; - vlbuf_base = kcxt->vlpos; +} +/* + * __update_nogroups__psum_fp + */ +INLINE_FUNCTION(void) +__update_nogroups__psum_fp(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) +{ + float8_t fval = 0.0; - for (;;) + if (kvars_is_valid) { - cl_uint part_base; - cl_uint part_id; - cl_uint line_no; - cl_uint n_lines; - cl_uint nvalids; - PageHeaderData *pg_page; - ItemPointerData t_self __attribute__ ((unused)); - BlockNumber block_nr; - - part_base = part_index * window_sz + get_group_id() * n_parts; - if (part_base >= kds_src->nitems) - break; - part_id = get_local_id() / part_sz + part_base; - line_no = get_local_id() % part_sz + line_index * part_sz; + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; - do { - HeapTupleHeaderData *htup = NULL; - ItemIdData *curr_lpp = NULL; - cl_uint slot_index; - cl_bool rc = false; + if (vclass == KVAR_CLASS__INLINE) + fval = kcxt->kvars_slot[slot_id].fp64; + else + { + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; + } + } + if (__any_sync(__activemask(), kvars_is_valid)) + { + fval += __shfl_xor_sync(__activemask(), fval, 0x0001); + fval += __shfl_xor_sync(__activemask(), fval, 0x0002); + fval += __shfl_xor_sync(__activemask(), fval, 0x0004); + fval += __shfl_xor_sync(__activemask(), fval, 0x0008); + fval += __shfl_xor_sync(__activemask(), fval, 0x0010); + if (LaneId() == 0) + __atomic_add_fp64((float8_t *)buffer, fval); + } +} - kcxt->vlpos = vlbuf_base; /* rewind */ - if (thread_is_valid && part_id < kds_src->nitems) - { - pg_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_src, part_id); - n_lines = PageGetMaxOffsetNumber(pg_page); - block_nr = KERN_DATA_STORE_BLOCK_BLCKNR(kds_src, part_id); - t_self.ip_blkid.bi_hi = block_nr >> 16; - t_self.ip_blkid.bi_lo = block_nr & 0xffff; - t_self.ip_posid = line_no + 1; - - if (line_no < n_lines) - { - curr_lpp = PageGetItemId(pg_page, line_no + 1); - if (ItemIdIsNormal(curr_lpp)) - htup = PageGetItem(pg_page, curr_lpp); - } - } - else - { - pg_page = NULL; - n_lines = 0; - } +/* + * __update_nogroups__pavg_int + */ +INLINE_FUNCTION(void) +__update_nogroups__pavg_int(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) +{ + int64_t ival = 0; + uint32_t mask; - /* evaluation of the qualifier */ - if (htup) - { - rc = gpupreagg_quals_eval(kcxt, kds_src, &t_self, htup); - kcxt->vlpos = vlbuf_base; /* rewind */ - } - /* bailout if any errors on gpupreagg_quals_eval */ - if (__syncthreads_count(kcxt->errcode) > 0) - goto out; - /* allocation of the kds_slot buffer */ - slot_index = pgstromStairlikeBinaryCount(rc, &nvalids); - if (nvalids > 0) - { - if (rc) - { - gpupreagg_projection_row(kcxt, - kds_src, - htup, - tup_dclass, - tup_values); - } - /* bailout if any errors */ - if (__syncthreads_count(kcxt->errcode) > 0) - goto out; - - if (!gpupreagg_setup_common(kcxt, - kgpreagg, - kds_src, - kds_slot, - nvalids, - rc ? 
slot_index : UINT_MAX, - tup_dclass, - tup_values, - tup_extra)) - goto out; - } - /* update statistics */ - count = __syncthreads_count(htup != NULL); - if (get_local_id() == 0) - { - atomicAdd(&kgpreagg->nitems_real, count); - atomicAdd(&kgpreagg->nitems_filtered, count - nvalids); - } + if (kvars_is_valid) + { + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; - /* - * Move to the next window of the line items, if any. - * If no threads in CUDA block wants to continue, exit the loop. - */ - line_index++; - line_no += part_sz; - } while (__syncthreads_count(thread_is_valid && - line_no < n_lines) > 0); - /* move to the next window */ - part_index++; - line_index = 0; + if (vclass == KVAR_CLASS__INLINE) + ival = kcxt->kvars_slot[slot_id].i64; + else + { + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; + } } -out: - if (get_local_id() == 0) + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (mask != 0) { - my_suspend->b.part_index = part_index; - my_suspend->b.line_index = line_index; + ival += __shfl_xor_sync(__activemask(), ival, 0x0001); + ival += __shfl_xor_sync(__activemask(), ival, 0x0002); + ival += __shfl_xor_sync(__activemask(), ival, 0x0004); + ival += __shfl_xor_sync(__activemask(), ival, 0x0008); + ival += __shfl_xor_sync(__activemask(), ival, 0x0010); + if (LaneId() == 0) + { + kagg_state__pavg_int_packed *r = + (kagg_state__pavg_int_packed *)buffer; + __atomic_add_uint32(&r->nitems, __popc(mask)); + __atomic_add_int64(&r->sum, ival); + } } } /* - * gpupreagg_setup_arrow + * __update_nogroups__pavg_fp */ -DEVICE_FUNCTION(void) -gpupreagg_setup_arrow(kern_context *kcxt, - kern_gpupreagg *kgpreagg, - kern_data_store *kds_src, /* in: KDS_FORMAT_ARROW */ - kern_data_store *kds_slot) /* out: KDS_FORMAT_SLOT */ +INLINE_FUNCTION(void) +__update_nogroups__pavg_fp(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) { - cl_uint src_nitems = __ldg(&kds_src->nitems); - cl_uint src_base; - cl_uint src_index; - cl_uint slot_index; - cl_uint count; - cl_uint nvalids; - cl_char *vlbuf_base; - cl_char *tup_dclass; - Datum *tup_values; - cl_int *tup_extra; - gpupreaggSuspendContext *my_suspend; - cl_bool rc; - - assert(kds_src->format == KDS_FORMAT_ARROW && - kds_slot->format == KDS_FORMAT_SLOT); - - /* resume kernel from the point where suspended, if any */ - my_suspend = KERN_GPUPREAGG_SUSPEND_CONTEXT(kgpreagg, get_group_id()); - if (kgpreagg->resume_context) - src_base = my_suspend->c.src_base; - else - src_base = get_global_base(); - __syncthreads(); - - tup_dclass = (cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_slot->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_slot->ncols); - tup_extra = (cl_int *) - kern_context_alloc(kcxt, sizeof(cl_int) * kds_slot->ncols); - if (!tup_dclass || !tup_values || !tup_extra) - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); - /* bailout if any errors */ - if (__syncthreads_count(kcxt->errcode) > 0) - goto skip; - vlbuf_base = kcxt->vlpos; - - while (src_base < src_nitems) + float8_t fval = 0; + uint32_t mask; + + if (kvars_is_valid) { - kcxt->vlpos = vlbuf_base; /* rewind */ - src_index = src_base + get_local_id(); - if (src_index < src_nitems) + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; + + if (vclass == KVAR_CLASS__INLINE) + fval = kcxt->kvars_slot[slot_id].fp64; + else { - rc = gpupreagg_quals_eval_arrow(kcxt, kds_src, src_index); - kcxt->vlpos = 
vlbuf_base; /* rewind */ + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; } - else + } + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (mask != 0) + { + fval += __shfl_xor_sync(__activemask(), fval, 0x0001); + fval += __shfl_xor_sync(__activemask(), fval, 0x0002); + fval += __shfl_xor_sync(__activemask(), fval, 0x0004); + fval += __shfl_xor_sync(__activemask(), fval, 0x0008); + fval += __shfl_xor_sync(__activemask(), fval, 0x0010); + if (LaneId() == 0) { - rc = false; + kagg_state__pavg_fp_packed *r = + (kagg_state__pavg_fp_packed *)buffer; + __atomic_add_uint32(&r->nitems, __popc(mask)); + __atomic_add_fp64(&r->sum, fval); } - /* Bailout if any error on gpupreagg_quals_eval_arrow */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* allocation of kds_slot buffer, if any */ - slot_index = pgstromStairlikeBinaryCount(rc ? 1 : 0, &nvalids); - if (nvalids > 0) + } +} +/* + * __update_nogroups__pstddev + */ +INLINE_FUNCTION(void) +__update_nogroups__pstddev(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc, + bool kvars_is_valid) +{ + float8_t sum_x = 0.0; + uint32_t mask; + + if (kvars_is_valid) + { + int slot_id = desc->arg0_slot_id; + int vclass = kcxt->kvars_class[slot_id]; + + if (vclass == KVAR_CLASS__INLINE) + sum_x = kcxt->kvars_slot[slot_id].fp64; + else { - if (rc) - { - gpupreagg_projection_arrow(kcxt, - kds_src, - src_index, - tup_dclass, - tup_values); - } - /* Bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* common portion */ - if (!gpupreagg_setup_common(kcxt, - kgpreagg, - kds_src, - kds_slot, - nvalids, - rc ? slot_index : UINT_MAX, - tup_dclass, - tup_values, - tup_extra)) - break; + assert(vclass == KVAR_CLASS__NULL); + kvars_is_valid = false; } - /* update statistics */ - count = __syncthreads_count(src_index < src_nitems); - if (get_local_id() == 0) + } + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (mask != 0) + { + float8_t sum_x2 = sum_x * sum_x; + + /* sum_x */ + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0001); + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0002); + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0004); + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0008); + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0010); + /* sum_x2 */ + sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0001); + sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0002); + sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0004); + sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0008); + sum_x2 += __shfl_xor_sync(__activemask(), sum_x2, 0x0010); + + if (LaneId() == 0) { - atomicAdd(&kgpreagg->nitems_real, count); - atomicAdd(&kgpreagg->nitems_filtered, count - nvalids); + kagg_state__stddev_packed *r = + (kagg_state__stddev_packed *)buffer; + __atomic_add_uint32(&r->nitems, __popc(mask)); + __atomic_add_fp64(&r->sum_x, sum_x); + __atomic_add_fp64(&r->sum_x2, sum_x2); } - /* move to the next window */ - src_base += get_global_size(); } -skip: - /* save the current execution context */ - if (get_local_id() == 0) - my_suspend->c.src_base = src_base; } /* - * gpupreagg_setup_column + * __update_nogroups__pavg_covar */ -DEVICE_FUNCTION(void) -gpupreagg_setup_column(kern_context *kcxt, - kern_gpupreagg *kgpreagg, - kern_data_store *kds_src, /* in: KDS_FORMAT_COLUMN */ - kern_data_extra *kds_extra, - kern_data_store *kds_slot) +INLINE_FUNCTION(void) +__update_nogroups__pcovar(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + 
kern_aggregate_desc *desc, + bool kvars_is_valid) { - cl_uint src_base; - cl_char *tup_dclass; - Datum *tup_values; - cl_int *tup_extra; /* !!not related to extra buffer of column format!! */ - cl_char *vlbuf_base; - gpupreaggSuspendContext *my_suspend; - - assert(kds_src->format == KDS_FORMAT_COLUMN && - kds_slot->format == KDS_FORMAT_SLOT); - /* resume kernel from the point where suspended, if any */ - my_suspend = KERN_GPUPREAGG_SUSPEND_CONTEXT(kgpreagg, get_group_id()); - if (kgpreagg->resume_context) - src_base = my_suspend->c.src_base; - else - src_base = get_global_base(); - - tup_dclass = (cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_slot->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_slot->ncols); - tup_extra = (cl_int *) - kern_context_alloc(kcxt, sizeof(cl_int) * kds_slot->ncols); - if (!tup_dclass || !tup_values || !tup_extra) - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); - /* bailout if any errors */ - if (__syncthreads_count(kcxt->errcode) > 0) - goto skip; - vlbuf_base = kcxt->vlpos; - - while (src_base < kds_src->nitems) + float8_t sum_x = 0.0; + float8_t sum_y = 0.0; + uint32_t mask; + + if (kvars_is_valid) { - cl_uint src_index = src_base + get_local_id(); - cl_uint slot_index; - cl_uint nvalids; - cl_uint count; - cl_bool visible = false; - cl_bool rc = false; - - kcxt->vlpos = vlbuf_base; /* rewind */ - if (src_index < kds_src->nitems) + if (kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__INLINE && + kcxt->kvars_class[desc->arg1_slot_id] == KVAR_CLASS__INLINE) { - visible = kern_check_visibility_column(kcxt, - kds_src, - src_index); - if (visible) - { - rc = gpupreagg_quals_eval_column(kcxt, - kds_src, - kds_extra, - src_index); - } - kcxt->vlpos = vlbuf_base; /* rewind */ + sum_x = kcxt->kvars_slot[desc->arg0_slot_id].fp64; + sum_y = kcxt->kvars_slot[desc->arg1_slot_id].fp64; } - /* bailout if any errors */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* allocation of kds_slot buffer, if any */ - slot_index = pgstromStairlikeBinaryCount(rc ? 1 : 0, &nvalids); - if (nvalids > 0) + else { - if (rc) - { - gpupreagg_projection_column(kcxt, - kds_src, - kds_extra, - src_index, - tup_dclass, - tup_values); - } - /* bailout if any errors */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* common portion */ - if (!gpupreagg_setup_common(kcxt, - kgpreagg, - kds_src, - kds_slot, - nvalids, - rc ? 
slot_index : UINT_MAX, - tup_dclass, - tup_values, - tup_extra)) - break; + assert(kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__NULL || + kcxt->kvars_class[desc->arg1_slot_id] == KVAR_CLASS__NULL); + kvars_is_valid = false; } - /* update statistics */ - count = __syncthreads_count(visible); - if (get_local_id() == 0) + } + mask = __ballot_sync(__activemask(), kvars_is_valid); + if (mask != 0) + { + float8_t sum_xx = sum_x * sum_x; + float8_t sum_xy = sum_x * sum_y; + float8_t sum_yy = sum_y * sum_y; + + /* sum_x */ + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0001); + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0002); + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0004); + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0008); + sum_x += __shfl_xor_sync(__activemask(), sum_x, 0x0010); + + /* sum_y */ + sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0001); + sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0002); + sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0004); + sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0008); + sum_y += __shfl_xor_sync(__activemask(), sum_y, 0x0010); + + /* sum_xx */ + sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0001); + sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0002); + sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0004); + sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0008); + sum_xx += __shfl_xor_sync(__activemask(), sum_xx, 0x0010); + + /* sum_xy */ + sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0001); + sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0002); + sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0004); + sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0008); + sum_xy += __shfl_xor_sync(__activemask(), sum_xy, 0x0010); + + /* sum_yy */ + sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0001); + sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0002); + sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0004); + sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0008); + sum_yy += __shfl_xor_sync(__activemask(), sum_yy, 0x0010); + + if (LaneId() == 0) { - atomicAdd(&kgpreagg->nitems_real, count); - atomicAdd(&kgpreagg->nitems_filtered, count - nvalids); + kagg_state__covar_packed *r = + (kagg_state__covar_packed *)buffer; + __atomic_add_uint32(&r->nitems, __popc(mask)); + __atomic_add_fp64(&r->sum_x, sum_x); + __atomic_add_fp64(&r->sum_xx, sum_xx); + __atomic_add_fp64(&r->sum_y, sum_y); + __atomic_add_fp64(&r->sum_yy, sum_yy); + __atomic_add_fp64(&r->sum_xy, sum_xy); } - /* move to the next window */ - src_base += get_global_size(); } -skip: - /* save the current execution context */ - if (get_local_id() == 0) - my_suspend->c.src_base = src_base; } /* - * gpupreagg_nogroup_reduction + * __updateOneTupleNoGroups */ -DEVICE_FUNCTION(void) -gpupreagg_nogroup_reduction(kern_context *kcxt, - kern_gpupreagg *kgpreagg, /* in/out */ - kern_errorbuf *kgjoin_errorbuf, /* in */ - kern_data_store *kds_slot, /* in */ - kern_data_store *kds_final, /* global out */ - cl_char *p_dclass, /* __private__ */ - Datum *p_values, /* __private__ */ - char *p_extras) /* __private__ */ +STATIC_FUNCTION(void) +__updateOneTupleNoGroups(kern_context *kcxt, + kern_data_store *kds_final, + bool kvars_is_valid, + HeapTupleHeaderData *htup, + kern_expression *kexp_groupby_actions) { - cl_bool is_last_reduction = false; - cl_bool try_final_merge = true; - cl_uint lane_id = (get_local_id() & warpSize - 1); - - /* init local/private buffer */ - assert(MAXWARPS_PER_BLOCK <= get_local_size() 
&& - MAXWARPS_PER_BLOCK == warpSize); - gpupreagg_init_local_slot(p_dclass, p_values, p_extras); - - /* skip if previous stage reported an error */ - if (kgjoin_errorbuf && - __syncthreads_count(kgjoin_errorbuf->errcode) != 0) - return; - if (__syncthreads_count(kgpreagg->kerror.errcode) != 0) - return; - - assert(kgpreagg->num_group_keys == 0); - assert(kds_slot->format == KDS_FORMAT_SLOT); - assert(kds_final->format == KDS_FORMAT_SLOT); - assert(kds_slot->ncols == kds_final->ncols); - if (get_global_id() == 0) - kgpreagg->setup_slot_done = true; - - /* start private reduction */ - is_last_reduction = false; - do { - cl_uint index; + int nattrs = (htup->t_infomask2 & HEAP_NATTS_MASK); + bool heap_hasnull = ((htup->t_infomask & HEAP_HASNULL) != 0); + uint32_t t_hoff; + char *buffer = NULL; - if (lane_id == 0) - index = atomicAdd(&kgpreagg->read_slot_pos, warpSize); - index = __shfl_sync(__activemask(), index, 0); - if (index + warpSize >= kds_slot->nitems) - is_last_reduction = true; - index += lane_id; + t_hoff = offsetof(HeapTupleHeaderData, t_bits); + if (heap_hasnull) + t_hoff += BITMAPLEN(nattrs); + t_hoff = MAXALIGN(t_hoff); - /* accumulate to the private buffer */ - if (index < kds_slot->nitems) + for (int j=0; j < nattrs; j++) + { + kern_aggregate_desc *desc = &kexp_groupby_actions->u.pagg.desc[j]; + kern_colmeta *cmeta = &kds_final->colmeta[j]; + + if (heap_hasnull && att_isnull(j, htup->t_bits)) { - gpupreagg_update_normal(p_dclass, - p_values, - GPUPREAGG_ACCUM_MAP_LOCAL, - KERN_DATA_STORE_DCLASS(kds_slot, index), - KERN_DATA_STORE_VALUES(kds_slot, index), - GPUPREAGG_ACCUM_MAP_GLOBAL); + /* only grouping-key may have NULL */ + assert(desc->action == KAGG_ACTION__VREF); + continue; } - } while (!is_last_reduction); - __syncthreads(); - - /* - * inter-warp reduction using shuffle operations - */ - for (cl_uint mask = 1; mask < warpSize; mask += mask) - { - cl_uint buddy_id = ((get_local_id() ^ mask) & (warpSize-1)); + if (cmeta->attlen > 0) + t_hoff = TYPEALIGN(cmeta->attalign, t_hoff); + else if (!VARATT_NOT_PAD_BYTE((char *)htup + t_hoff)) + t_hoff = TYPEALIGN(cmeta->attalign, t_hoff); + buffer = ((char *)htup + t_hoff); + if (cmeta->attlen > 0) + t_hoff += cmeta->attlen; + else + t_hoff += VARSIZE_ANY(buffer); - gpupreagg_merge_shuffle(p_dclass, - p_values, - GPUPREAGG_ACCUM_MAP_LOCAL, - buddy_id); + switch (desc->action) + { + case KAGG_ACTION__NROWS_ANY: + __update_nogroups__nrows_any(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__NROWS_COND: + __update_nogroups__nrows_cond(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__PMIN_INT: + __update_nogroups__pmin_int(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__PMAX_INT: + __update_nogroups__pmax_int(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__PMIN_FP: + __update_nogroups__pmin_fp(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__PMAX_FP: + __update_nogroups__pmax_fp(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__PSUM_INT: + __update_nogroups__psum_int(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__PSUM_FP: + __update_nogroups__psum_fp(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__PAVG_INT: + __update_nogroups__pavg_int(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__PAVG_FP: + __update_nogroups__pavg_fp(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + 
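/*
 * For reference: the STDDEV and COVAR cases below accumulate into packed
 * transition states (kagg_state__stddev_packed, kagg_state__covar_packed)
 * that are finalized outside this kernel.  Keeping (nitems, sum_x, sum_x2)
 * is enough to derive the sample variance in a single pass over the data:
 *
 *     var_samp    = (N * sum_x2 - sum_x^2) / (N * (N - 1))
 *     stddev_samp = sqrt(var_samp)
 *
 * and (nitems, sum_x, sum_y, sum_xx, sum_xy, sum_yy) likewise covers the
 * covariance/regression family, e.g.:
 *
 *     covar_samp = (sum_xy - sum_x * sum_y / N) / (N - 1)
 *
 * The formulas are shown for illustration only; the corresponding final
 * functions live on the CPU side (see src/aggfuncs.c), not in this file.
 */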
case KAGG_ACTION__STDDEV: + __update_nogroups__pstddev(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + case KAGG_ACTION__COVAR: + __update_nogroups__pcovar(kcxt, buffer, + cmeta, desc, + kvars_is_valid); + break; + default: + /* + * No more partial aggregation exists after grouping-keys + */ + return; + } } +} - /* - * update the final buffer - */ - try_final_merge = ((get_local_id() & (warpSize - 1)) == 0); - do { - if (try_final_merge) +/* + * __insertOneTupleNoGroups + */ +STATIC_FUNCTION(kern_tupitem *) +__insertOneTupleNoGroups(kern_context *kcxt, + kern_data_store *kds_final, + kern_expression *kexp_groupby_actions) +{ + kern_tupitem *tupitem; + int32_t tupsz; + uint32_t required; + uint32_t usage; + size_t total_sz; + + assert(kds_final->format == KDS_FORMAT_ROW && + kds_final->hash_nslots == 0); + /* estimate length */ + tupsz = __writeOutOneTuplePreAgg(kcxt, kds_final, NULL, + kexp_groupby_actions); + assert(tupsz > 0); + required = MAXALIGN(offsetof(kern_tupitem, htup) + tupsz); + assert(required < 1000); + total_sz = (KDS_HEAD_LENGTH(kds_final) + + MAXALIGN(sizeof(uint32_t)) + + required + __kds_unpack(kds_final->usage)); + if (total_sz > kds_final->length) + return NULL; /* out of memory */ + usage = __atomic_add_uint32(&kds_final->usage, __kds_packed(required)); + tupitem = (kern_tupitem *)((char *)kds_final + + kds_final->length + - __kds_unpack(usage) + - required); + + __writeOutOneTuplePreAgg(kcxt, kds_final, + &tupitem->htup, + kexp_groupby_actions); + tupitem->t_len = tupsz; + tupitem->rowid = 0; + __atomic_write_uint32(KDS_GET_ROWINDEX(kds_final), + __kds_packed((char *)kds_final + + kds_final->length + - (char *)tupitem)); + return tupitem; +} + +STATIC_FUNCTION(bool) +__execGpuPreAggNoGroups(kern_context *kcxt, + kern_data_store *kds_final, + bool kvars_is_valid, + kern_expression *kexp_groupby_actions, + bool *p_try_suspend) +{ + kern_tupitem *tupitem; + bool try_suspend = false; + + assert(kds_final->format == KDS_FORMAT_ROW); + assert(kexp_groupby_actions->opcode == FuncOpCode__AggFuncs); + for (;;) + { + if (LaneId() == 0) { - union { - struct { - cl_uint nitems; - cl_uint usage; - } i; - cl_ulong v64; - } oldval, curval, newval; - - assert((get_local_id() & (warpSize - 1)) == 0); - - oldval.i.nitems = 0; - oldval.i.usage = kds_final->usage; - newval.i.nitems = 0xffffffffU; /* LOCKED */ - newval.i.usage = kds_final->usage - + __kds_packed(GPUPREAGG_ACCUM_EXTRA_BUFSZ); - - curval.v64 = atomicCAS((cl_ulong *)&kds_final->nitems, - oldval.v64, - newval.v64); - if (curval.i.nitems <= 1) - { - cl_char *f_dclass = KERN_DATA_STORE_DCLASS(kds_final, 0); - Datum *f_values = KERN_DATA_STORE_VALUES(kds_final, 0); - char *f_extras; + uint32_t nitems = __volatileRead(&kds_final->nitems); + uint32_t oldval; - if (curval.i.nitems == 0) + if (nitems == 1) + { + /* normal case; destination tuple already exists */ + tupitem = KDS_GET_TUPITEM(kds_final, 0); + assert(tupitem != NULL); + } + else if (nitems == 0) + { + oldval = __atomic_cas_uint32(&kds_final->nitems, 0, UINT_MAX); + if (oldval == 0) { - f_extras = ((char *)kds_final + - kds_final->length - - __kds_unpack(curval.i.usage) - - GPUPREAGG_ACCUM_EXTRA_BUFSZ); - gpupreagg_init_final_slot(f_dclass, f_values, f_extras); - atomicAdd(&kgpreagg->num_groups, 1); - __threadfence(); - atomicExch(&kds_final->nitems, 1); /* UNLOCK */ + /* LOCKED */ + tupitem = __insertOneTupleNoGroups(kcxt, kds_final, + kexp_groupby_actions); + if (!tupitem) + { + try_suspend = true; + /* UNLOCK */ + oldval = 
__atomic_write_uint32(&kds_final->nitems, 0); + assert(oldval == UINT_MAX); + } + else + { + /* UNLOCK */ + oldval = __atomic_write_uint32(&kds_final->nitems, 1); + assert(oldval == UINT_MAX); + } + } + else + { + assert(oldval == 0 || oldval == UINT_MAX); + tupitem = NULL; + } - gpupreagg_merge_atomic(f_dclass, - f_values, - GPUPREAGG_ACCUM_MAP_GLOBAL, - p_dclass, - p_values, - GPUPREAGG_ACCUM_MAP_LOCAL); - try_final_merge = false; - kgpreagg->final_buffer_modified = true; } else { - assert(curval.i.nitems == 0xffffffffU); + assert(nitems == UINT_MAX); + /* work in progress - someone else is setting up the destination tuple */ + tupitem = NULL; } } - } while (__syncthreads_count(try_final_merge) > 0); + /* out of memory? */ + try_suspend = __shfl_sync(__activemask(), try_suspend, 0); + if (try_suspend) + { + *p_try_suspend = true; + return false; + } + /* is the destination tuple ready? */ + tupitem = (kern_tupitem *)__shfl_sync(__activemask(), (uintptr_t)tupitem, 0); + if (tupitem != NULL) + break; + } + /* update partial aggregation */ + __updateOneTupleNoGroups(kcxt, kds_final, + kvars_is_valid, + &tupitem->htup, + kexp_groupby_actions); + return true; } -#define HASHITEM_EMPTY (0xffffffffU) -#define HASHITEM_LOCKED (0xfffffffeU) -static __shared__ cl_bool l_final_buffer_modified; + /* - * gpupreagg_init_final_hash + * __insertOneTupleGroupBy */ -KERNEL_FUNCTION(void) -gpupreagg_init_final_hash(kern_global_hashslot *f_hash, - size_t f_hash_nslots, - size_t f_hash_length) +STATIC_FUNCTION(kern_hashitem *) +__insertOneTupleGroupBy(kern_context *kcxt, + kern_data_store *kds_final, + kern_expression *kexp_groupby_actions) { - if (get_global_id() == 0) + kern_hashitem *hitem; + int32_t tupsz; + uint32_t required; + union { + uint64_t u64; + struct { + uint32_t nitems; + uint32_t usage; + } kds; + } oldval, curval, newval; + + assert(kds_final->format == KDS_FORMAT_HASH && + kds_final->hash_nslots > 0); + /* estimate length */ + tupsz = __writeOutOneTuplePreAgg(kcxt, kds_final, NULL, + kexp_groupby_actions); + assert(tupsz > 0); + required = MAXALIGN(offsetof(kern_hashitem, t.htup) + tupsz); + + /* expand kds_final */ + curval.kds.nitems = __volatileRead(&kds_final->nitems); + curval.kds.usage = __volatileRead(&kds_final->usage); + for (;;) { - f_hash->length = f_hash_length; - f_hash->lock = 0; - f_hash->usage = 0; - f_hash->nslots = f_hash_nslots; + size_t total_sz; + + newval.kds.nitems = curval.kds.nitems + 1; + newval.kds.usage = curval.kds.usage + __kds_packed(required); + total_sz = (KDS_HEAD_LENGTH(kds_final) + + MAXALIGN(sizeof(uint32_t) * (kds_final->hash_nslots + + newval.kds.nitems)) + + __kds_unpack(newval.kds.usage)); + if (total_sz > kds_final->length) + return NULL; /* out of memory */ + oldval.u64 = __atomic_cas_uint64((uint64_t *)&kds_final->nitems, + curval.u64, + newval.u64); + if (oldval.u64 == curval.u64) + break; + curval.u64 = oldval.u64; } - - for (size_t i = get_global_id(); i < f_hash_nslots; i += get_global_size()) - f_hash->slots[i] = HASHITEM_EMPTY; + hitem = (kern_hashitem *)((char *)kds_final + + kds_final->length + - __kds_unpack(newval.kds.usage)); + __writeOutOneTuplePreAgg(kcxt, kds_final, + &hitem->t.htup, + kexp_groupby_actions); + hitem->t.t_len = tupsz; + hitem->t.rowid = newval.kds.nitems - 1; + KDS_GET_ROWINDEX(kds_final)[hitem->t.rowid] + = __kds_packed((char *)kds_final + + kds_final->length + - (char *)&hitem->t); + return hitem; }
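__insertOneTupleGroupBy() above reserves a row-index slot and tail space for the new tuple with one 64-bit compare-and-swap over the adjacent nitems/usage words, so the growth path needs no lock and the capacity check rides inside the same retry loop. A condensed sketch of that reservation idiom follows, under simplified and illustrative names (kds_counters, reserve_row), assuming the two 32-bit counters are adjacent and 8-byte aligned as in kern_data_store:

#include <cstdint>

struct kds_counters {              /* stand-in for kern_data_store's nitems/usage */
    uint32_t nitems;               /* rows committed so far */
    uint32_t usage;                /* bytes consumed from the buffer tail */
};

/* reserve one row plus 'required' bytes, or fail when the buffer would overflow */
__device__ bool
reserve_row(kds_counters *cnt, uint32_t limit, uint32_t required,
            uint32_t *rowid, uint32_t *tail_off)
{
    union { uint64_t u64; kds_counters c; } oldval, curval, newval;

    curval.c = *cnt;                        /* snapshot both counters at once */
    for (;;)
    {
        newval.c.nitems = curval.c.nitems + 1;
        newval.c.usage  = curval.c.usage + required;
        if (newval.c.usage > limit)
            return false;                   /* out of memory: caller must suspend */
        oldval.u64 = atomicCAS((unsigned long long *)cnt,
                               curval.u64, newval.u64);
        if (oldval.u64 == curval.u64)
            break;                          /* won the race; both fields advanced */
        curval.u64 = oldval.u64;            /* lost the race; retry from fresh value */
    }
    *rowid    = newval.c.nitems - 1;
    *tail_off = newval.c.usage;             /* tuple body goes at length - tail_off */
    return true;
}

Packing both counters into one word makes the overflow test and the reservation a single atomic step; checking them separately would let two threads pass the test together and oversubscribe the buffer.

 /* - * gpupreagg_create_final_slot - * - * + * __update_groupby__nrows_any */ -STATIC_FUNCTION(cl_uint)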
-gpupreagg_create_final_slot(kern_context *kcxt, - kern_data_store *kds_final, - kern_data_store *kds_src, - cl_uint src_index, - cl_char *l_dclass, - Datum *l_values) +INLINE_FUNCTION(void) +__update_groupby__nrows_any(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) { - cl_char *src_dclass = KERN_DATA_STORE_DCLASS(kds_src, src_index); - Datum *src_values = KERN_DATA_STORE_VALUES(kds_src, src_index); - cl_char *dst_dclass; - Datum *dst_values; - cl_uint dst_index; - cl_uint alloc_sz; - char *extra = NULL; - union { - struct { - cl_uint nitems; - cl_uint usage; - } i; - cl_ulong v64; - } oldval, curval, newval; - - /* sanity checks */ - assert(kds_final->format == KDS_FORMAT_SLOT && - kds_src->format == KDS_FORMAT_SLOT); - assert(kds_final->ncols == kds_src->ncols); - assert(src_index < kds_src->nitems); - - /* size for extra allocation */ - alloc_sz = GPUPREAGG_ACCUM_EXTRA_BUFSZ; - for (int j=0; j < kds_src->ncols; j++) - { - kern_colmeta *cmeta = &kds_src->colmeta[j]; - cl_char dclass = src_dclass[j]; - cl_uint len; + __atomic_add_uint64((uint64_t *)buffer, 1); +} - if (GPUPREAGG_ATTR_IS_ACCUM_VALUES[j]) - continue; - if (dclass == DATUM_CLASS__NULL) - continue; +INLINE_FUNCTION(void) +__update_groupby__nrows_cond(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) +{ + if (kcxt->kvars_class[desc->arg0_slot_id] != KVAR_CLASS__NULL) + __atomic_add_uint64((uint64_t *)buffer, 1); +} - if (cmeta->attbyval) - { - assert(dclass == DATUM_CLASS__NORMAL); - } - else if (cmeta->attlen > 0) - { - assert(dclass == DATUM_CLASS__NORMAL); - alloc_sz += MAXALIGN(cmeta->attlen); - } - else - { - assert(cmeta->attlen == -1); - switch (dclass) - { - case DATUM_CLASS__NORMAL: - len = VARSIZE_ANY(DatumGetPointer(src_values[j])); - break; - case DATUM_CLASS__VARLENA: - len = pg_varlena_datum_length(kcxt, src_values[j]); - break; - case DATUM_CLASS__ARRAY: - len = pg_array_datum_length(kcxt, src_values[j]); - break; - case DATUM_CLASS__COMPOSITE: - len = pg_composite_datum_length(kcxt, src_values[j]); - break; - case DATUM_CLASS__GEOMETRY: - len = pg_geometry_datum_length(kcxt, src_values[j]); - break; - default: - STROM_ELOG(kcxt, "unexpected internal format code"); - return UINT_MAX; - } - alloc_sz += MAXALIGN(len); - } - } +INLINE_FUNCTION(void) +__update_groupby__pmin_int(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) +{ + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - /* - * allocation of a new slot and extra buffer - */ - curval.i.nitems = __volatileRead(&kds_final->nitems); - curval.i.usage = __volatileRead(&kds_final->usage); - do { - newval = oldval = curval; - newval.i.nitems += 1; - newval.i.usage += __kds_packed(alloc_sz); - if (KERN_DATA_STORE_SLOT_LENGTH(kds_final, newval.i.nitems) + - __kds_unpack(newval.i.usage) > kds_final->length) - { - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, - "out of memory (kds_final)"); - return UINT_MAX; - } - } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_final->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - /* - * Move the initial values to kds_final - */ - dst_index = oldval.i.nitems; - dst_dclass = KERN_DATA_STORE_DCLASS(kds_final, dst_index); - dst_values = KERN_DATA_STORE_VALUES(kds_final, dst_index); - if (alloc_sz > 0) - extra = (char *)kds_final + kds_final->length - __kds_unpack(newval.i.usage); - l_final_buffer_modified = true; - - /* init final slot */ - gpupreagg_init_final_slot(dst_dclass, dst_values, extra); - extra += 
GPUPREAGG_ACCUM_EXTRA_BUFSZ; - - /* copy the grouping keys */ - for (int j=0; j < kds_src->ncols; j++) + if (vclass == KVAR_CLASS__INLINE) { - kern_colmeta *cmeta = &kds_src->colmeta[j]; - cl_char dclass = src_dclass[j]; - Datum datum = src_values[j]; - cl_uint len; + kagg_state__pminmax_int64_packed *r = + (kagg_state__pminmax_int64_packed *)buffer; + int64_t ival = kcxt->kvars_slot[desc->arg0_slot_id].i64; - if (GPUPREAGG_ATTR_IS_ACCUM_VALUES[j]) - continue; - - if (dclass == DATUM_CLASS__NULL || cmeta->attbyval) - { - dst_dclass[j] = dclass; - dst_values[j] = datum; - } - else if (cmeta->attlen > 0) - { - assert(dclass == DATUM_CLASS__NORMAL); - memcpy(extra, DatumGetPointer(datum), cmeta->attlen); - dst_dclass[j] = DATUM_CLASS__NORMAL; - dst_values[j] = PointerGetDatum(extra); - extra += MAXALIGN(cmeta->attlen); - } - else - { - assert(cmeta->attlen == -1); - switch (dclass) - { - case DATUM_CLASS__NORMAL: - len = VARSIZE_ANY(datum); - memcpy(extra, DatumGetPointer(datum), len); - break; - case DATUM_CLASS__VARLENA: - len = pg_varlena_datum_write(kcxt, extra, datum); - break; - case DATUM_CLASS__ARRAY: - len = pg_array_datum_write(kcxt, extra, datum); - break; - case DATUM_CLASS__COMPOSITE: - len = pg_composite_datum_write(kcxt, extra, datum); - break; - case DATUM_CLASS__GEOMETRY: - len = pg_geometry_datum_write(kcxt, extra, datum); - break; - default: - STROM_ELOG(kcxt, "unexpected internal format code"); - return UINT_MAX; - } - dst_dclass[j] = DATUM_CLASS__NORMAL; - dst_values[j] = PointerGetDatum(extra); - extra += MAXALIGN(len); - } + __atomic_add_uint32(&r->nitems, 1); + __atomic_min_int64(&r->value, ival); } - /* copy the accum values */ - if (l_dclass && l_values) - gpupreagg_merge_atomic(dst_dclass, - dst_values, - GPUPREAGG_ACCUM_MAP_GLOBAL, - l_dclass, - l_values, - GPUPREAGG_ACCUM_MAP_LOCAL); else - gpupreagg_update_atomic(dst_dclass, - dst_values, - GPUPREAGG_ACCUM_MAP_GLOBAL, - src_dclass, - src_values, - GPUPREAGG_ACCUM_MAP_GLOBAL); - __threadfence(); - - return dst_index; + { + assert(vclass == KVAR_CLASS__NULL); + } } -/* - * gpupreagg_expand_global_hash - expand size of the global hash slot on demand. - * up to the f_hashlimit. It internally acquires shared lock of the final - * hash-slot, if it returns true. So, caller MUST release it when a series of - * operations get completed. Elsewhere, it returns false. caller MUST retry. 
- */ -STATIC_FUNCTION(cl_bool) -__expand_global_hash(kern_context *kcxt, kern_global_hashslot *f_hash) +INLINE_FUNCTION(void) +__update_groupby__pmax_int(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) { - cl_bool expanded = false; - cl_uint i, j; + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - /* - * Expand the global hash-slot - */ - if (get_local_id() == 0) + if (vclass == KVAR_CLASS__INLINE) { - cl_uint __nslots = 2 * f_hash->nslots + 2000; - cl_uint __usage = 2 * f_hash->usage + 2000; - size_t consumed; - - /* expand twice and mode */ - consumed = (MAXALIGN(offsetof(kern_global_hashslot, slots[__nslots])) + - MAXALIGN(sizeof(preagg_hash_item) * __usage)); - if (consumed <= f_hash->length) - { - f_hash->nslots = __nslots; - expanded = true; - } - else - { - STROM_EREPORT(kcxt, ERRCODE_STROM_DATASTORE_NOSPACE, - "f_hash has no more space"); - } - } - if (__syncthreads_count(expanded) == 0) - return false; /* failed */ + kagg_state__pminmax_int64_packed *r = + (kagg_state__pminmax_int64_packed *)buffer; + int64_t ival = kcxt->kvars_slot[desc->arg0_slot_id].i64; - /* fix up the global hash-slot */ - for (i = get_local_id(); i < f_hash->nslots; i += get_local_size()) + __atomic_add_uint32(&r->nitems, 1); + __atomic_max_int64(&r->value, ival); + } + else { - f_hash->slots[i] = HASHITEM_EMPTY; + assert(vclass == KVAR_CLASS__NULL); } - __syncthreads(); +} - for (i = 0; i < f_hash->usage; i += get_local_size()) - { - preagg_hash_item *hitem = NULL; - cl_uint hindex = UINT_MAX; - cl_uint next; +INLINE_FUNCTION(void) +__update_groupby__pmin_fp(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) +{ + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - j = i + get_local_id(); - if (j < f_hash->usage) - { - hitem = GLOBAL_HASHSLOT_GETITEM(f_hash, j); - hindex = hitem->hash % f_hash->nslots; - } + if (vclass == KVAR_CLASS__INLINE) + { + kagg_state__pminmax_fp64_packed *r = + (kagg_state__pminmax_fp64_packed *)buffer; + float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - do { - if (hitem) - { - next = __volatileRead(&f_hash->slots[hindex]); - assert(next == HASHITEM_EMPTY || next < f_hash->usage); - hitem->next = next; - if (atomicCAS(&f_hash->slots[hindex], next, j) == next) - hitem = NULL; - } - } while(__syncthreads_count(hitem != NULL) > 0); + __atomic_add_uint32(&r->nitems, 1); + __atomic_min_fp64(&r->value, fval); + } + else + { + assert(vclass == KVAR_CLASS__NULL); } - return true; } -STATIC_INLINE(cl_bool) -gpupreagg_expand_global_hash(kern_context *kcxt, - kern_global_hashslot *f_hash, - cl_uint required) +INLINE_FUNCTION(void) +__update_groupby__pmax_fp(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) { - cl_bool lock_wait = false; - cl_bool expand_hash = false; - cl_uint old_lock; - cl_uint new_lock; - cl_uint curr_usage; + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - /* Get shared/exclusive lock on the final hash slot */ - do { - if (get_local_id() == 0) - { - curr_usage = __volatileRead(&f_hash->usage); - expand_hash = (curr_usage + required > f_hash->nslots); + if (vclass == KVAR_CLASS__INLINE) + { + kagg_state__pminmax_fp64_packed *r = + (kagg_state__pminmax_fp64_packed *)buffer; + float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - old_lock = __volatileRead(&f_hash->lock); - if ((old_lock & 0x0001) != 0) - lock_wait = true; /* someone has exclusive lock */ - else - { - if (expand_hash) - new_lock = old_lock + 3; /* shared + exclusive 
lock */ - else - new_lock = old_lock + 2; /* shared lock */ + __atomic_add_uint32(&r->nitems, 1); + __atomic_max_fp64(&r->value, fval); + } + else + { + assert(vclass == KVAR_CLASS__NULL); + } +} - if (atomicCAS(&f_hash->lock, - old_lock, - new_lock) == old_lock) - lock_wait = false; /* Ok, lock is acquired */ - else - lock_wait = true; /* Oops, conflict. Retry again. */ - } - } - } while (__syncthreads_count(lock_wait) > 0); +INLINE_FUNCTION(void) +__update_groupby__psum_int(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) +{ + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - if (__syncthreads_count(expand_hash) > 0) + if (vclass == KVAR_CLASS__INLINE) { - /* wait while other threads are running in the critial section */ - lock_wait = false; - do { - if (get_local_id() == 0) - { - old_lock = __volatileRead(&f_hash->lock); - assert((old_lock & 1) == 1); - lock_wait = (old_lock != 3); - } - } while(__syncthreads_count(lock_wait) > 0); + int64_t ival = kcxt->kvars_slot[desc->arg0_slot_id].i64; - /* - * Expand the global hash table - */ - if (!__expand_global_hash(kcxt, f_hash)) - { - /* Error! release exclusive lock */ - __syncthreads(); - if (get_local_id() == 0) - { - old_lock = atomicSub(&f_hash->lock, 3); - assert((old_lock & 0x0001) != 0); - } - return false; - } - /* Ensure the updates of f_hash visible to others */ - __threadfence(); - /* Downgrade the lock */ - __syncthreads(); - if (get_local_id() == 0) - { - old_lock = atomicSub(&f_hash->lock, 1); - assert((old_lock & 0x0001) != 0); - } + __atomic_add_int64((int64_t *)buffer, ival); + } + else + { + assert(vclass == KVAR_CLASS__NULL); } - return true; } -/* - * gpupreagg_global_reduction - */ -STATIC_FUNCTION(cl_bool) -gpupreagg_global_reduction(kern_context *kcxt, - kern_data_store *kds_slot, - cl_uint kds_index, - cl_uint hash, - kern_data_store *kds_final, - kern_global_hashslot *f_hash, - cl_char *l_dclass, /* can be NULL */ - Datum *l_values) /* can be NULL */ +INLINE_FUNCTION(void) +__update_groupby__psum_fp(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) { - preagg_hash_item *hitem = NULL; - cl_uint hindex = hash % f_hash->nslots; - cl_uint next; - cl_uint curr; - cl_uint dst_index; - cl_char *dst_dclass; - Datum *dst_values; - cl_bool is_locked = false; + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - /* - * Step-1: Lookup hash slot without locking - */ - curr = next = __volatileRead(&f_hash->slots[hindex]); - __threadfence(); - if (curr == HASHITEM_LOCKED) - return false; /* locked, try again */ -restart: - while (curr != HASHITEM_EMPTY) + if (vclass == KVAR_CLASS__INLINE) { - assert(curr < __volatileRead(&f_hash->usage)); + float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - hitem = GLOBAL_HASHSLOT_GETITEM(f_hash, curr); - if (hitem->hash == hash && - gpupreagg_keymatch(kcxt, - kds_slot, kds_index, - kds_final, hitem->index)) - { - dst_dclass = KERN_DATA_STORE_DCLASS(kds_final, hitem->index); - dst_values = KERN_DATA_STORE_VALUES(kds_final, hitem->index); - - if (l_dclass && l_values) - gpupreagg_merge_atomic(dst_dclass, - dst_values, - GPUPREAGG_ACCUM_MAP_GLOBAL, - l_dclass, - l_values, - GPUPREAGG_ACCUM_MAP_LOCAL); - else - gpupreagg_update_atomic(dst_dclass, - dst_values, - GPUPREAGG_ACCUM_MAP_GLOBAL, - KERN_DATA_STORE_DCLASS(kds_slot, kds_index), - KERN_DATA_STORE_VALUES(kds_slot, kds_index), - GPUPREAGG_ACCUM_MAP_GLOBAL); - if (is_locked) - atomicExch(&f_hash->slots[hindex], next); //UNLOCK - l_final_buffer_modified = 
true; - return true; - } - curr = hitem->next; + __atomic_add_fp64((float8_t *)buffer, fval); } - - /* - * Step-2: Ensure that f_hash has no entry under the lock - */ - if (!is_locked) + else { - curr = next = __volatileRead(&f_hash->slots[hindex]); - __threadfence(); - if (curr == HASHITEM_LOCKED || - atomicCAS(&f_hash->slots[hindex], - curr, - HASHITEM_LOCKED) != curr) - return false; /* already locked, try again */ - is_locked = true; - goto restart; + assert(vclass == KVAR_CLASS__NULL); } +} - /* - * Step-3: create a slot on kds_final - */ - dst_index = gpupreagg_create_final_slot(kcxt, - kds_final, - kds_slot, - kds_index, - l_dclass, - l_values); - if (dst_index == UINT_MAX) +INLINE_FUNCTION(void) +__update_groupby__pavg_int(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) +{ + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; + + if (vclass == KVAR_CLASS__INLINE) { - /* likely, out of memory */ - atomicExch(&f_hash->slots[hindex], next); //UNLOCK - return false; - } + kagg_state__pavg_int_packed *r = + (kagg_state__pavg_int_packed *)buffer; + int64_t ival = kcxt->kvars_slot[desc->arg0_slot_id].i64; - /* - * Step-4: allocation of hash entry - */ - curr = atomicAdd(&f_hash->usage, 1); - if (offsetof(kern_global_hashslot, slots[f_hash->nslots]) + - sizeof(preagg_hash_item) * (curr + 1) >= f_hash->length) + __atomic_add_uint32(&r->nitems, 1); + __atomic_add_int64(&r->sum, ival); + } + else { - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, "out of memory"); - atomicExch(&f_hash->slots[hindex], next); //UNLOCK - return false; + assert(vclass == KVAR_CLASS__NULL); } - hitem = GLOBAL_HASHSLOT_GETITEM(f_hash, curr); - hitem->index = dst_index; - hitem->hash = hash; - hitem->next = next; +} - /* - * NOTE: Above modification to kds_final/f_hash are weakly-ordered memory - * writes, thus, updates on the hitem and kds_final may not be visible to - * other threads in the device. - * __threadfence() ensures any writes prior to the invocation are visible - * to other threads. Don't eliminate this. 
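/*
 * Note on the accumulators above: helpers such as __atomic_add_fp64() and
 * __atomic_max_fp64() are defined elsewhere in the tree (cuda_common.h).
 * The sketch below shows the usual way such an fp64 atomic add is built on
 * CUDA's 64-bit atomicCAS; it is illustrative only (the example_ prefix is
 * not part of this patch), and on sm_60 or later the native
 * atomicAdd(double *, double) can be used instead.
 */
static __device__ double
example_atomic_add_fp64(double *addr, double val)
{
	unsigned long long *ptr = (unsigned long long *)addr;
	unsigned long long curval = *ptr;
	unsigned long long oldval;

	do {
		oldval = curval;
		curval = atomicCAS(ptr, oldval,
						   __double_as_longlong(val +
												__longlong_as_double(oldval)));
	} while (curval != oldval);

	return __longlong_as_double(curval);	/* value prior to the addition */
}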
- */ - __threadfence(); +INLINE_FUNCTION(void) +__update_groupby__pavg_fp(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) +{ + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; - atomicExch(&f_hash->slots[hindex], curr); //UNLOCK; + if (vclass == KVAR_CLASS__INLINE) + { + kagg_state__pavg_fp_packed *r = + (kagg_state__pavg_fp_packed *)buffer; + float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - return true; + __atomic_add_uint32(&r->nitems, 1); + __atomic_add_fp64(&r->sum, fval); + } + else + { + assert(vclass == KVAR_CLASS__NULL); + } } -/* - * gpupreagg_local_reduction - * - * - */ -STATIC_INLINE(int) -gpupreagg_local_reduction(kern_context *kcxt, - kern_data_store *kds_slot, - cl_uint index, - cl_uint hash, - preagg_local_hashtable *l_htable, - preagg_hash_item *l_hitems, - cl_char *l_dclass, /* __shared__ */ - Datum *l_values, /* __shared__ */ - char *l_extras) /* __shared__ */ +INLINE_FUNCTION(void) +__update_groupby__pstddev(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) { - cl_uint hindex = hash % GPUPREAGG_LOCAL_HASH_NSLOTS; - cl_uint curr; - cl_uint next; - cl_bool is_locked = false; - - curr = next = __volatileRead(&l_htable->l_hslots[hindex]); - __threadfence_block(); - if (curr == HASHITEM_LOCKED) - return -1; /* locked */ -restart: - while (curr < GPUPREAGG_LOCAL_HASH_NROOMS) + int vclass = kcxt->kvars_class[desc->arg0_slot_id]; + + if (vclass == KVAR_CLASS__INLINE) { - preagg_hash_item *hitem = &l_hitems[curr]; + kagg_state__stddev_packed *r = + (kagg_state__stddev_packed *)buffer; + float8_t fval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; - if (hitem->hash == hash && - gpupreagg_keymatch(kcxt, - kds_slot, index, - kds_slot, hitem->index)) - { - if (is_locked) - atomicExch(&l_htable->l_hslots[hindex], next); //UNLOCK - goto found; - } - curr = hitem->next; + __atomic_add_uint32(&r->nitems, 1); + __atomic_add_fp64(&r->sum_x, fval); + __atomic_add_fp64(&r->sum_x2, fval * fval); } - assert(curr == HASHITEM_EMPTY); - - if (__volatileRead(&l_htable->nitems) >= GPUPREAGG_LOCAL_HASH_NROOMS) + else { - /* - * Here we could not find out the entry on the local hash-table, - * but obviously no space on the local hash-table also. - * In this case, thread goes to the second path for the global-to- - * global reduction. - */ - if (is_locked) - atomicExch(&l_htable->l_hslots[hindex], next); //UNLOCK - return 0; /* not found */ + assert(vclass == KVAR_CLASS__NULL); } - assert(l_hitems && l_dclass && l_values); +} - /* - * Begin critical section - */ - if (!is_locked) +INLINE_FUNCTION(void) +__update_groupby__pcovar(kern_context *kcxt, + char *buffer, + kern_colmeta *cmeta, + kern_aggregate_desc *desc) +{ + if (kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__INLINE && + kcxt->kvars_class[desc->arg1_slot_id] == KVAR_CLASS__INLINE) { - curr = next = __volatileRead(&l_htable->l_hslots[hindex]); - __threadfence_block(); - if (curr == HASHITEM_LOCKED || - atomicCAS(&l_htable->l_hslots[hindex], - next, - HASHITEM_LOCKED) != next) - return -1; /* lock contension, retry again. 
*/ - is_locked = true; - goto restart; + kagg_state__covar_packed *r = + (kagg_state__covar_packed *)buffer; + float8_t xval = kcxt->kvars_slot[desc->arg0_slot_id].fp64; + float8_t yval = kcxt->kvars_slot[desc->arg1_slot_id].fp64; + + __atomic_add_uint32(&r->nitems, 1); + __atomic_add_fp64(&r->sum_x, xval); + __atomic_add_fp64(&r->sum_xx, xval * xval); + __atomic_add_fp64(&r->sum_y, yval); + __atomic_add_fp64(&r->sum_yy, yval * yval); + __atomic_add_fp64(&r->sum_xy, xval * yval); } - curr = atomicAdd(&l_htable->nitems, 1); - if (curr >= GPUPREAGG_LOCAL_HASH_NROOMS) + else { - /* - * Oops, the local hash-table has no space to save a new - * entry any more. So, unlock the slot, then return to - * the caller to go to the second path for the global-to- - * global reduction. - */ - atomicExch(&l_htable->l_hslots[hindex], next); //UNLOCK - return 0; /* not found */ + assert(kcxt->kvars_class[desc->arg0_slot_id] == KVAR_CLASS__NULL || + kcxt->kvars_class[desc->arg1_slot_id] == KVAR_CLASS__NULL); } - - /* - * initial allocation of the hash-item that is allocated above. - */ - l_hitems[curr].index = index; - l_hitems[curr].hash = hash; - l_hitems[curr].next = next; - - if (l_extras) - l_extras += GPUPREAGG_ACCUM_EXTRA_BUFSZ * curr; - gpupreagg_init_local_slot(l_dclass + GPUPREAGG_NUM_ACCUM_VALUES * curr, - l_values + GPUPREAGG_NUM_ACCUM_VALUES * curr, - l_extras); - /* - * __threadfence_block() makes above updates visible to other concurent - * threads within this block. - */ - __threadfence_block(); - /* UNLOCK */ - atomicExch(&l_htable->l_hslots[hindex], curr); -found: - /* Runs global-to-local reduction */ - gpupreagg_update_atomic(l_dclass + GPUPREAGG_NUM_ACCUM_VALUES * curr, - l_values + GPUPREAGG_NUM_ACCUM_VALUES * curr, - GPUPREAGG_ACCUM_MAP_LOCAL, - KERN_DATA_STORE_DCLASS(kds_slot, index), - KERN_DATA_STORE_VALUES(kds_slot, index), - GPUPREAGG_ACCUM_MAP_GLOBAL); - return 1; /* ok, merged */ } /* - * gpupreagg_group_reduction + * __updateOneTupleGroupBy */ -DEVICE_FUNCTION(void) -gpupreagg_groupby_reduction(kern_context *kcxt, - kern_gpupreagg *kgpreagg, /* in/out */ - kern_errorbuf *kgjoin_errorbuf, /* in */ - kern_data_store *kds_slot, /* in */ - kern_data_store *kds_final, /* out */ - kern_global_hashslot *f_hash, /* out */ - preagg_hash_item *l_hitems, /* __shared__ */ - cl_char *l_dclass, /* __shared__ */ - Datum *l_values, /* __shared__ */ - char *l_extras) /* __shared__ */ +STATIC_FUNCTION(void) +__updateOneTupleGroupBy(kern_context *kcxt, + kern_data_store *kds_final, + HeapTupleHeaderData *htup, + kern_expression *kexp_groupby_actions) { - cl_bool is_last_reduction = false; - cl_uint l_nitems; - __shared__ preagg_local_hashtable l_htable; - __shared__ cl_uint base; - - /* skip if previous stage reported an error */ - if (kgjoin_errorbuf && - __syncthreads_count(kgjoin_errorbuf->errcode) != 0) - return; - if (__syncthreads_count(kgpreagg->kerror.errcode) != 0) - return; - - assert(kgpreagg->num_group_keys > 0); - assert(kds_slot->format == KDS_FORMAT_SLOT); - assert(kds_final->format == KDS_FORMAT_SLOT); - if (get_global_id() == 0) - kgpreagg->setup_slot_done = true; + int nattrs = (htup->t_infomask2 & HEAP_NATTS_MASK); + bool heap_hasnull = ((htup->t_infomask & HEAP_HASNULL) != 0); + uint32_t t_hoff; + char *buffer; - /* - * setup local hash-table - */ - if (get_local_id() == 0) - { - l_final_buffer_modified = false; - l_htable.nitems = 0; - } - for (int i = get_local_id(); i < GPUPREAGG_LOCAL_HASH_NSLOTS; i += get_local_size()) - l_htable.l_hslots[i] = HASHITEM_EMPTY; - 
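/*
 * For reference, the packed covariance state filled in above carries the
 * classic running sums, so the final aggregate can be computed from one
 * merged state. A host-side sketch, assuming the field meanings shown in
 * __update_groupby__pcovar(); in PG-Strom the real finalization happens in
 * the SQL final-functions, and the example_ names below are illustrative.
 */
#include <math.h>

typedef struct
{
	unsigned int nitems;
	double		sum_x;
	double		sum_xx;
	double		sum_y;
	double		sum_yy;
	double		sum_xy;
} example_covar_state;

/* sample covariance: (SUM(x*y) - SUM(x) * SUM(y) / n) / (n - 1) */
static double
example_covar_samp(const example_covar_state *s)
{
	double		n = (double)s->nitems;

	if (s->nitems < 2)
		return NAN;
	return (s->sum_xy - s->sum_x * s->sum_y / n) / (n - 1.0);
}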
__syncthreads(); + t_hoff = offsetof(HeapTupleHeaderData, t_bits); + if (heap_hasnull) + t_hoff += BITMAPLEN(nattrs); + t_hoff = MAXALIGN(t_hoff); - /* - * main loop for the local/global hybrid reduction - */ - do { - cl_uint hash = UINT_MAX; - int status; - int index; - int count; - - /* fetch next items from the kds_slot */ - if (get_local_id() == 0) - base = atomicAdd(&kgpreagg->read_slot_pos, get_local_size()); - __syncthreads(); - if (base >= kds_slot->nitems) - break; - if (base + get_local_size() >= kds_slot->nitems) - is_last_reduction = true; + for (int j=0; j < nattrs; j++) + { + kern_aggregate_desc *desc = &kexp_groupby_actions->u.pagg.desc[j]; + kern_colmeta *cmeta = &kds_final->colmeta[j]; - /* calculation of the hash-value of the item */ - index = base + get_local_id(); - if (index < kds_slot->nitems) + if (heap_hasnull && att_isnull(j, htup->t_bits)) { - cl_char *__dclass = KERN_DATA_STORE_DCLASS(kds_slot, index); - Datum *__values = KERN_DATA_STORE_VALUES(kds_slot, index); - - hash = gpupreagg_hashvalue(kcxt, __dclass, __values); + /* only grouping-key may have NULL */ + assert(desc->action == KAGG_ACTION__VREF); + continue; } - if (__syncthreads_count(kcxt->errcode) > 0) - return; /* error */ - - /* - * 1st path - try local reduction - */ - status = -1; - do { - if (status < 0 && index < kds_slot->nitems) - status = gpupreagg_local_reduction(kcxt, - kds_slot, - index, - hash, - &l_htable, - l_hitems, - l_dclass, - l_values, - l_extras); - else - status = 1; - if (__syncthreads_count(kcxt->errcode) > 0) - return; /* error */ - } while (__syncthreads_count(status < 0) > 0); + if (cmeta->attlen > 0) + t_hoff = TYPEALIGN(cmeta->attalign, t_hoff); + else if (!VARATT_NOT_PAD_BYTE((char *)htup + t_hoff)) + t_hoff = TYPEALIGN(cmeta->attalign, t_hoff); + buffer = ((char *)htup + t_hoff); + if (cmeta->attlen > 0) + t_hoff += cmeta->attlen; + else + t_hoff += VARSIZE_ANY(buffer); - /* - * 2nd path - try global reduction - */ - assert(status >= 0); - while ((count = __syncthreads_count(status == 0)) > 0) - { - if (gpupreagg_expand_global_hash(kcxt, f_hash, count)) - { - if (status == 0) - { - assert(index < kds_slot->nitems); - - if (gpupreagg_global_reduction(kcxt, - kds_slot, - index, - hash, - kds_final, - f_hash, - NULL, - NULL)) - status = 1; /* successfully, merged */ - } - /* unlock global hash slots */ - __syncthreads(); - if (get_local_id() == 0) - atomicSub(&f_hash->lock, 2); - } - /* quick bailout on error */ - if (__syncthreads_count(kcxt->errcode) > 0) + switch (desc->action) + { + case KAGG_ACTION__NROWS_ANY: + __update_groupby__nrows_any(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__NROWS_COND: + __update_groupby__nrows_cond(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__PMIN_INT: + __update_groupby__pmin_int(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__PMAX_INT: + __update_groupby__pmax_int(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__PMIN_FP: + __update_groupby__pmin_fp(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__PMAX_FP: + __update_groupby__pmax_fp(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__PSUM_INT: + __update_groupby__psum_int(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__PSUM_FP: + __update_groupby__psum_fp(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__PAVG_INT: + __update_groupby__pavg_int(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__PAVG_FP: + __update_groupby__pavg_fp(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__STDDEV: + 
__update_groupby__pstddev(kcxt, buffer, cmeta, desc); + break; + case KAGG_ACTION__COVAR: + __update_groupby__pcovar(kcxt, buffer, cmeta, desc); + break; + default: + /* + * No more partial aggregation exists after grouping-keys + */ return; } - } while (!is_last_reduction); + } +} - __syncthreads(); +STATIC_FUNCTION(int) +__execGpuPreAggGroupBy(kern_context *kcxt, + kern_data_store *kds_final, + bool kvars_is_valid, + kern_expression *kexp_groupby_keyhash, + kern_expression *kexp_groupby_keyload, + kern_expression *kexp_groupby_keycomp, + kern_expression *kexp_groupby_actions, + bool *p_try_suspend) +{ + kern_hashitem *hitem = NULL; + xpu_int4_t hash; + assert(kds_final->format == KDS_FORMAT_HASH); /* - * last path - flush pending local reductions + * compute hash value of the grouping keys */ - l_nitems = Min(l_htable.nitems, GPUPREAGG_LOCAL_HASH_NROOMS); - for (cl_uint i = 0; i < l_nitems; i += get_local_size()) + memset(&hash, 0, sizeof(hash)); + if (kvars_is_valid) { - cl_uint j = i + get_local_id(); - cl_int status = 0; - cl_int count; + if (EXEC_KERN_EXPRESSION(kcxt, kexp_groupby_keyhash, &hash)) + assert(!XPU_DATUM_ISNULL(&hash)); + } + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return false; - while ((count = __syncthreads_count(!status && j < l_nitems)) > 0) + /* + * lookup the destination grouping tuple. if not found, create a new one. + */ + do { + if (!XPU_DATUM_ISNULL(&hash) && !hitem) { - if (gpupreagg_expand_global_hash(kcxt, f_hash, count)) + uint32_t *hslot = KDS_GET_HASHSLOT(kds_final, hash.value); + uint32_t saved; + xpu_bool_t status; + + for (hitem = KDS_HASH_FIRST_ITEM(kds_final, hslot, &saved); + hitem != NULL; + hitem = KDS_HASH_NEXT_ITEM(kds_final, hitem)) { - if (!status && j < l_nitems) + if (hitem->hash != hash.value) + continue; + ExecLoadVarsHeapTuple(kcxt, kexp_groupby_keyload, + -2, + kds_final, + &hitem->t.htup); + if (EXEC_KERN_EXPRESSION(kcxt, kexp_groupby_keycomp, &status)) { - preagg_hash_item *hitem = &l_hitems[j]; - cl_char *my_dclass = l_dclass + GPUPREAGG_NUM_ACCUM_VALUES * j; - Datum *my_values = l_values + GPUPREAGG_NUM_ACCUM_VALUES * j; - - if (gpupreagg_global_reduction(kcxt, - kds_slot, - hitem->index, - hitem->hash, - kds_final, - f_hash, - my_dclass, - my_values)) - status = 1; /* merged */ + assert(!XPU_DATUM_ISNULL(&status)); + if (status.value) + break; } - else + } + + if (!hitem && saved != UINT_MAX) + { + /* try lock */ + if (__atomic_cas_uint32(hslot, saved, UINT_MAX) == saved) { - status = 1; + hitem = __insertOneTupleGroupBy(kcxt, kds_final, + kexp_groupby_actions); + if (hitem) + { + uint32_t offset; + + hitem->hash = hash.value; + hitem->next = saved; + offset = (char *)hitem - (char *)kds_final; + /* insert and unlock */ + __atomic_write_uint32(hslot, __kds_packed(offset)); + } + else + { + /* out of the memory */ + __atomic_write_uint32(hslot, saved); + *p_try_suspend = true; + } } - /* unlock global hash slots */ - __syncthreads(); - if (get_local_id() == 0) - atomicSub(&f_hash->lock, 2); - } - /* quick bailout on error */ - if (__syncthreads_count(kcxt->errcode) > 0) - return; + } } - } - __syncthreads(); - if (get_local_id() == 0 && l_final_buffer_modified) - kgpreagg->final_buffer_modified = true; + /* suspend the kernel? 
*/ + if (__any_sync(__activemask(), *p_try_suspend)) + return false; + /* error checks */ + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return false; + /* retry, if any threads are not ready yet */ + } while (__any_sync(__activemask(), !XPU_DATUM_ISNULL(&hash) && !hitem)); + + /* + * update the partial aggregation + */ + if (hitem) + __updateOneTupleGroupBy(kcxt, kds_final, + &hitem->t.htup, + kexp_groupby_actions); + return true; } -/* - * aggcalc operations for hyper-log-log - */ -DEVICE_FUNCTION(void) -aggcalc_init_hll_sketch(cl_char *p_accum_dclass, - Datum *p_accum_datum, - char *extra_pos) +PUBLIC_FUNCTION(int) +execGpuPreAggGroupBy(kern_context *kcxt, + kern_warp_context *wp, + int n_rels, + kern_data_store *kds_final, + char *kvars_addr_wp, + bool *p_try_suspend) { - cl_uint sz = VARHDRSZ + (1U << GPUPREAGG_HLL_REGISTER_BITS); + kern_session_info *session = kcxt->session; + kern_expression *kexp_groupby_keyhash = SESSION_KEXP_GROUPBY_KEYHASH(session); + kern_expression *kexp_groupby_keyload = SESSION_KEXP_GROUPBY_KEYLOAD(session); + kern_expression *kexp_groupby_keycomp = SESSION_KEXP_GROUPBY_KEYCOMP(session); + kern_expression *kexp_groupby_actions = SESSION_KEXP_GROUPBY_ACTIONS(session); + kern_expression *karg; + uint32_t write_pos = WARP_WRITE_POS(wp,n_rels); + uint32_t read_pos = WARP_READ_POS(wp,n_rels); + uint32_t i, mask; + bool status; + + /* + * The previous depth still may produce new tuples, and number of + * the current result tuples is not sufficient to run projection. + */ + if (wp->scan_done <= n_rels && read_pos + warpSize > write_pos) + return n_rels; - *p_accum_dclass = DATUM_CLASS__NULL; - memset(extra_pos, 0, sz); - SET_VARSIZE(extra_pos, sz); - *p_accum_datum = PointerGetDatum(extra_pos); -} + read_pos += LaneId(); + if (read_pos < write_pos) + { + int index = (read_pos % UNIT_TUPLES_PER_DEPTH); -DEVICE_FUNCTION(void) -aggcalc_shuffle_hll_sketch(cl_char *p_accum_dclass, - Datum *p_accum_datum, - int lane_id) -{ - cl_char my_dclass; - cl_char buddy_dclass; - varlena *hll_state = (varlena *)DatumGetPointer(*p_accum_datum); - cl_uint *hll_regs = (cl_uint *)VARDATA(hll_state); - cl_uint nrooms = (1U << GPUPREAGG_HLL_REGISTER_BITS); - cl_uint index; - - assert(VARSIZE_EXHDR(hll_state) == nrooms); - assert(__activemask() == ~0U); - my_dclass = *p_accum_dclass; - buddy_dclass = __shfl_sync(__activemask(), my_dclass, lane_id); - - nrooms /= sizeof(cl_uint); - for (index=0; index < nrooms; index++) + kcxt->kvars_slot = (kern_variable *) + (kvars_addr_wp + index * kcxt->kvars_nbytes); + kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); + } + else { - union { - cl_uchar regs[4]; - cl_uint v32; - } myself, buddy; - - myself.v32 = hll_regs[index]; - buddy.v32 = __shfl_sync(__activemask(), myself.v32, lane_id); - if (my_dclass == DATUM_CLASS__NULL) - { - if (buddy_dclass != DATUM_CLASS__NULL) - { - hll_regs[index] = buddy.v32; - *p_accum_dclass = DATUM_CLASS__NORMAL; - } - } - else + kcxt->kvars_slot = NULL; + kcxt->kvars_class = NULL; + } + mask = __ballot_sync(__activemask(), kcxt->kvars_class != NULL); + if (mask == 0) + goto skip_reduction; + + /* + * fillup the kvars_slot if it involves expressions + */ + if (kcxt->kvars_slot != NULL) + { + for (i=0, karg = KEXP_FIRST_ARG(kexp_groupby_actions); + i < kexp_groupby_actions->nr_args; + i++, karg = KEXP_NEXT_ARG(karg)) { - assert(my_dclass == DATUM_CLASS__NORMAL); - if (buddy_dclass != DATUM_CLASS__NULL) + assert(karg->opcode == FuncOpCode__SaveExpr); + if 
(!EXEC_KERN_EXPRESSION(kcxt, karg, NULL)) { - assert(buddy_dclass == DATUM_CLASS__NORMAL); - if (myself.regs[0] < buddy.regs[0]) - myself.regs[0] = buddy.regs[0]; - if (myself.regs[1] < buddy.regs[1]) - myself.regs[1] = buddy.regs[1]; - if (myself.regs[2] < buddy.regs[2]) - myself.regs[2] = buddy.regs[2]; - if (myself.regs[3] < buddy.regs[3]) - myself.regs[3] = buddy.regs[3]; - hll_regs[index] = myself.v32; + assert(kcxt->errcode != ERRCODE_STROM_SUCCESS); + break; } } } -} - -DEVICE_FUNCTION(void) -aggcalc_normal_hll_sketch(cl_char *p_accum_dclass, - Datum *p_accum_datum, - cl_char newval_dclass, - Datum newval_datum) /* = int8 hash */ -{ - cl_uint nrooms = (1U << GPUPREAGG_HLL_REGISTER_BITS); - cl_uint index; - cl_uint count; - cl_char *hll_regs; - - if (newval_dclass != DATUM_CLASS__NULL) + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return -1; + /* + * main logic of GpuPreAgg + */ + assert(kexp_groupby_actions != NULL); + if (kexp_groupby_keyhash && + kexp_groupby_keyload && + kexp_groupby_keycomp) { - assert(newval_dclass == DATUM_CLASS__NORMAL); - - - index = (newval_datum & (nrooms - 1)); - count = __clzll(__brevll(newval_datum >> GPUPREAGG_HLL_REGISTER_BITS)) + 1; - hll_regs = VARDATA(*p_accum_datum); - if (hll_regs[index] < count) - hll_regs[index] = count; - *p_accum_dclass = DATUM_CLASS__NORMAL; + status = __execGpuPreAggGroupBy(kcxt, kds_final, + kcxt->kvars_slot != NULL, + kexp_groupby_keyhash, + kexp_groupby_keyload, + kexp_groupby_keycomp, + kexp_groupby_actions, + p_try_suspend); } -} - -DEVICE_FUNCTION(void) -aggcalc_merge_hll_sketch(cl_char *p_accum_dclass, - Datum *p_accum_datum, - cl_char newval_dclass, - Datum newval_datum) /* =bytea sketch */ -{ - if (newval_dclass != DATUM_CLASS__NULL) + else { - cl_uint *dst_regs = (cl_uint *)VARDATA(*p_accum_datum); - cl_uint *new_regs = (cl_uint *)VARDATA(newval_datum); - cl_uint nrooms = (1U << GPUPREAGG_HLL_REGISTER_BITS); - cl_uint index; - - assert(newval_dclass == DATUM_CLASS__NORMAL); - assert(VARSIZE_EXHDR(*p_accum_datum) == nrooms && - VARSIZE_EXHDR(newval_datum) == nrooms); - nrooms /= sizeof(cl_uint); - for (index=0; index < nrooms; index++) - { - union { - cl_uchar regs[4]; - cl_uint v32; - } oldval, curval, newval, tmpval; - - tmpval.v32 = __volatileRead(&new_regs[index]); - curval.v32 = __volatileRead(&dst_regs[index]); - do { - newval = oldval = curval; - if (newval.regs[0] < tmpval.regs[0]) - newval.regs[0] = tmpval.regs[0]; - if (newval.regs[1] < tmpval.regs[1]) - newval.regs[1] = tmpval.regs[1]; - if (newval.regs[2] < tmpval.regs[2]) - newval.regs[2] = tmpval.regs[2]; - if (newval.regs[3] < tmpval.regs[3]) - newval.regs[3] = tmpval.regs[3]; - if (newval.v32 == curval.v32) - break; - } while ((curval.v32 = atomicCAS(&dst_regs[index], - oldval.v32, - newval.v32)) != oldval.v32); - } - *p_accum_dclass = DATUM_CLASS__NORMAL; + status = __execGpuPreAggNoGroups(kcxt, kds_final, + kcxt->kvars_slot != NULL, + kexp_groupby_actions, + p_try_suspend); } -} - -DEVICE_FUNCTION(void) -aggcalc_update_hll_sketch(cl_char *p_accum_dclass, - Datum *p_accum_datum, - cl_char newval_dclass, - Datum newval_datum) /* =int8 hash */ -{ - cl_uint nrooms = (1U << GPUPREAGG_HLL_REGISTER_BITS); - cl_uint index; - cl_uint count; - cl_uint *hll_regs; + if (__any_sync(__activemask(), !status)) + return -1; - if (newval_dclass != DATUM_CLASS__NULL) + /* + * Update the read position + */ +skip_reduction: + if (LaneId() == 0) { - union { - cl_uchar regs[4]; - cl_uint v32; - } oldval, curval, newval; - - 
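/*
 * The insert path of __execGpuPreAggGroupBy() above serializes writers on a
 * hash slot by CAS'ing the slot head to UINT_MAX. A condensed sketch of that
 * protocol; the KDS-specific offset packing and item initialization are
 * stubbed out, and the example_ names are illustrative, not part of this
 * patch.
 */
#define EXAMPLE_SLOT_LOCKED		0xffffffffU		/* same role as UINT_MAX above */

static __device__ bool
example_hash_insert(unsigned int *hslot, unsigned int new_offset)
{
	unsigned int saved = *hslot;

	if (saved == EXAMPLE_SLOT_LOCKED)
		return false;			/* locked by another thread; caller retries */
	if (atomicCAS(hslot, saved, EXAMPLE_SLOT_LOCKED) != saved)
		return false;			/* lost the race; caller retries */
	/* ... fill the new item here, chaining it with item->next = saved ... */
	__threadfence();			/* make the item visible before publishing */
	atomicExch(hslot, new_offset);	/* publish the new head and unlock */
	return true;
}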
assert(newval_dclass == DATUM_CLASS__NORMAL); - - index = (newval_datum & (nrooms - 1)); - count = __clzll(__brevll(newval_datum >> GPUPREAGG_HLL_REGISTER_BITS)) + 1; - hll_regs = (cl_uint *)VARDATA(*p_accum_datum); - hll_regs += (index >> 2); - index &= 3; - - curval.v32 = __volatileRead(hll_regs); - do { - if (count <= curval.regs[index]) - break; - newval = oldval = curval; - newval.regs[index] = count; - } while ((curval.v32 = atomicCAS(hll_regs, - oldval.v32, - newval.v32)) != oldval.v32); - *p_accum_dclass = DATUM_CLASS__NORMAL; + WARP_READ_POS(wp,n_rels) += __popc(mask); + assert(WARP_WRITE_POS(wp,n_rels) >= WARP_READ_POS(wp,n_rels)); + } + __syncwarp(); + if (wp->scan_done <= n_rels) + { + if (WARP_WRITE_POS(wp,n_rels) < WARP_READ_POS(wp,n_rels) + warpSize) + return n_rels; /* back to the previous depth */ + } + else + { + if (WARP_READ_POS(wp,n_rels) >= WARP_WRITE_POS(wp,n_rels)) + return -1; /* ok, end of GpuPreAgg */ } + return n_rels + 1; /* elsewhere, try again? */ } diff --git a/src/cuda_gpuscan.cu b/src/cuda_gpuscan.cu index 90910c5df..7e949fb99 100644 --- a/src/cuda_gpuscan.cu +++ b/src/cuda_gpuscan.cu @@ -1,750 +1,494 @@ /* - * libgpuscan.cu + * cuda_gpuscan.cu * - * GPU implementation of GpuScan + * Device implementation of GpuScan * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. */ #include "cuda_common.h" -#include "cuda_gpuscan.h" -#include "cuda_gcache.h" -/* - * gpuscan_main_row - GpuScan logic for KDS_FORMAT_ROW +/* ---------------------------------------------------------------- + * + * execGpuScanLoadSource and related + * + * ---------------------------------------------------------------- */ -DEVICE_FUNCTION(void) -gpuscan_main_row(kern_context *kcxt, - kern_gpuscan *kgpuscan, - kern_data_store *kds_src, - kern_data_store *kds_dst, - bool has_device_projection) +STATIC_FUNCTION(int) +__gpuscan_load_source_row(kern_context *kcxt, + kern_warp_context *wp, + kern_data_store *kds_src, + kern_expression *kexp_load_vars, + kern_expression *kexp_scan_quals, + char *kvars_addr_wp, + uint32_t *p_smx_row_count) { - gpuscanSuspendContext *my_suspend - = KERN_GPUSCAN_SUSPEND_CONTEXT(kgpuscan, get_group_id()); - cl_uint part_index = 0; - cl_uint src_index; - cl_uint src_base; - cl_uint total_nitems_in = 0; /* stat */ - cl_uint total_nitems_out = 0; /* stat */ - cl_uint total_extra_size = 0; /* stat */ - __shared__ cl_uint dst_nitems_base; - __shared__ cl_ulong dst_usage_base; - - assert(kds_src->format == KDS_FORMAT_ROW); - assert(kds_dst->format == KDS_FORMAT_SLOT); - /* quick bailout if any error happen on the prior kernel */ - if (__syncthreads_count(kgpuscan->kerror.errcode) != 0) - return; - /* resume kernel from the point where suspended, if any */ - if (kgpuscan->resume_context) + uint32_t count; + uint32_t index; + uint32_t mask; + uint32_t wr_pos; + kern_tupitem *tupitem = NULL; + + /* fetch next warpSize tuples */ + if (LaneId() == 0) + count = atomicAdd(p_smx_row_count, 1); + count = __shfl_sync(__activemask(), count, 0); + index = (get_num_groups() * count + get_group_id()) * warpSize; + if (index >= kds_src->nitems) { - assert(my_suspend != NULL); - part_index = my_suspend->part_index; + if (LaneId() == 0) + wp->scan_done = 1; + __syncwarp(); + return 1; } + index += LaneId(); - for (src_base = 
get_global_base() + part_index * get_global_size(); - src_base < kds_src->nitems; - src_base += get_global_size(), part_index++) + if (index < kds_src->nitems) { - kern_tupitem *tupitem = NULL; - cl_bool rc = false; - cl_uint nvalids; - cl_uint required = 0; - cl_uint nitems_offset; - cl_uint usage_offset = 0; - cl_uint usage_length = 0; - cl_uint suspend_kernel = 0; - cl_char *tup_dclass = NULL; - Datum *tup_values = NULL; - - /* rewind the varlena buffer */ - kcxt->vlpos = kcxt->vlbuf; - /* Evalidation of the rows by WHERE-clause */ - src_index = src_base + get_local_id(); - if (src_index < kds_src->nitems) - { - tupitem = KERN_DATA_STORE_TUPITEM(kds_src, src_index); - rc = gpuscan_quals_eval(kcxt, kds_src, - &tupitem->htup.t_ctid, - &tupitem->htup); - } - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* how many rows servived WHERE-clause evaluation? */ - nitems_offset = pgstromStairlikeBinaryCount(rc, &nvalids); - if (nvalids > 0) - { - /* extract the source tuple to the private slot, if any */ - if (rc) - { - kcxt->vlpos = kcxt->vlbuf; /* rewind */ - tup_dclass = (cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); - - if (!tup_dclass || !tup_values) - { - STROM_CPU_FALLBACK(kcxt, ERRCODE_OUT_OF_MEMORY, - "out of memory"); - } - else - { - gpuscan_projection_tuple(kcxt, - kds_src, - &tupitem->htup, - &tupitem->htup.t_ctid, - tup_dclass, - tup_values); - required = kds_slot_compute_extra(kcxt, - kds_dst, - tup_dclass, - tup_values); - } - } - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - break;; - /* allocation of the destination buffer */ - usage_offset = pgstromStairlikeSum(__kds_packed(required), - &usage_length); - if (get_local_id() == 0) - { - union { - struct { - cl_uint nitems; - cl_uint usage; - } i; - cl_ulong v64; - } oldval, curval, newval; - - curval.i.nitems = kds_dst->nitems; - curval.i.usage = kds_dst->usage; - do { - newval = oldval = curval; - newval.i.nitems += nvalids; - newval.i.usage += usage_length; - - if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + - __kds_unpack(newval.i.usage) > kds_dst->length) - { - atomicAdd(&kgpuscan->suspend_count, 1); - suspend_kernel = 1; - break; - } - } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - dst_nitems_base = oldval.i.nitems; - dst_usage_base = oldval.i.usage; - } - if (__syncthreads_count(suspend_kernel) > 0) - break; - /* store the result tuple on the destination buffer */ - if (rc) - { - cl_uint dst_index = dst_nitems_base + nitems_offset; - char *dst_extra = ((char *)kds_dst + kds_dst->length - - __kds_unpack(dst_usage_base + - usage_offset) - required); - kds_slot_store_values(kcxt, - kds_dst, - dst_index, - dst_extra, - tup_dclass, - tup_values); - } - } - /* update statistics */ - if (get_local_id() == 0) - { - total_nitems_in += Min(kds_src->nitems - src_base, - get_local_size()); - total_nitems_out += nvalids; - total_extra_size += __kds_unpack(usage_length); - } + uint32_t offset = KDS_GET_ROWINDEX(kds_src)[index]; + + assert(offset <= kds_src->usage); + tupitem = (kern_tupitem *)((char *)kds_src + + kds_src->length - + __kds_unpack(offset)); + assert((char *)tupitem >= (char *)kds_src && + (char *)tupitem < (char *)kds_src + kds_src->length); + kcxt->kvars_slot = (kern_variable *)alloca(kcxt->kvars_nbytes); + kcxt->kvars_class = (int *)(kcxt->kvars_slot + 
kcxt->kvars_nslots); + if (!ExecLoadVarsOuterRow(kcxt, + kexp_load_vars, + kexp_scan_quals, + kds_src, + &tupitem->htup)) + tupitem = NULL; } - /* write back statistics */ - if (get_local_id() == 0) + /* error checks */ + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return -1; + /* + * save the private kvars slot on the combination buffer (depth=0) + */ + mask = __ballot_sync(__activemask(), tupitem != NULL); + if (LaneId() == 0) { - atomicAdd(&kgpuscan->nitems_in, total_nitems_in); - atomicAdd(&kgpuscan->nitems_out, total_nitems_out); - atomicAdd(&kgpuscan->extra_size, total_extra_size); + wr_pos = WARP_WRITE_POS(wp,0); + WARP_WRITE_POS(wp,0) += __popc(mask); } - /* suspend the current position (even if normal exit) */ - if (my_suspend && get_local_id() == 0) + wr_pos = __shfl_sync(__activemask(), wr_pos, 0); + mask &= ((1U << LaneId()) - 1); + wr_pos += __popc(mask); + if (tupitem != NULL) { - my_suspend->part_index = part_index; - my_suspend->line_index = 0; + index = (wr_pos % UNIT_TUPLES_PER_DEPTH); + memcpy((char *)kvars_addr_wp + index * kcxt->kvars_nbytes, + kcxt->kvars_slot, + kcxt->kvars_nbytes); } + kcxt->kvars_slot = NULL; + kcxt->kvars_class = NULL; + __syncwarp(); + /* move to the next depth if more than 32 htuples were fetched */ + return (WARP_WRITE_POS(wp,0) >= WARP_READ_POS(wp,0) + warpSize ? 1 : 0); } /* - * gpuscan_main_block - GpuScan logic for KDS_FORMAT_BLOCK + * __gpuscan_load_source_block */ -DEVICE_FUNCTION(void) -gpuscan_main_block(kern_context *kcxt, - kern_gpuscan *kgpuscan, - kern_data_store *kds_src, - kern_data_store *kds_dst, - bool has_device_projection) +STATIC_FUNCTION(int) +__gpuscan_load_source_block(kern_context *kcxt, + kern_warp_context *wp, + kern_data_store *kds_src, + kern_expression *kexp_load_vars, + kern_expression *kexp_scan_quals, + char *kvars_addr_wp, + uint32_t *p_smx_row_count) { - gpuscanSuspendContext *my_suspend - = KERN_GPUSCAN_SUSPEND_CONTEXT(kgpuscan, get_group_id()); - cl_uint part_sz; - cl_uint n_parts; - cl_uint window_sz; - cl_uint part_base; - cl_uint part_index = 0; - cl_uint line_index = 0; - cl_uint total_nitems_in = 0; /* stat */ - cl_uint total_nitems_out = 0; /* stat */ - cl_uint total_extra_size = 0; /* stat */ - cl_bool thread_is_valid = false; - __shared__ cl_uint dst_nitems_base; - __shared__ cl_uint dst_usage_base; - - assert(kds_src->format == KDS_FORMAT_BLOCK); - assert(kds_dst->format == KDS_FORMAT_SLOT); - /* quick bailout if any error happen on the prior kernel */ - if (__syncthreads_count(kgpuscan->kerror.errcode) != 0) - return; + uint32_t block_id = __shfl_sync(__activemask(), wp->block_id, 0); + uint32_t wr_pos = __shfl_sync(__activemask(), wp->lp_wr_pos, 0); + uint32_t rd_pos = __shfl_sync(__activemask(), wp->lp_rd_pos, 0); + uint32_t count; + uint32_t mask; + + assert(wr_pos >= rd_pos); + if (block_id > kds_src->nitems || wr_pos >= rd_pos + warpSize) + { + HeapTupleHeaderData *htup = NULL; + uint32_t off; + int index; - part_sz = KERN_DATA_STORE_PARTSZ(kds_src); - n_parts = get_local_size() / part_sz; - if (get_global_id() == 0) - kgpuscan->part_sz = part_sz; - if (get_local_id() < part_sz * n_parts) - thread_is_valid = true; - window_sz = n_parts * get_num_groups(); + rd_pos += LaneId(); + if (rd_pos < wr_pos) + { + off = wp->lp_items[rd_pos % UNIT_TUPLES_PER_DEPTH]; + htup = (HeapTupleHeaderData *)((char *)kds_src + __kds_unpack(off)); + kcxt->kvars_slot = (kern_variable *)alloca(kcxt->kvars_nbytes); + kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); + if 
(!ExecLoadVarsOuterRow(kcxt, + kexp_load_vars, + kexp_scan_quals, + kds_src, htup)) + htup = NULL; + } + /* error checks */ + if (__any_sync(__activemask(), kcxt->errcode != ERRCODE_STROM_SUCCESS)) + return -1; + if (LaneId() == 0) + wp->lp_rd_pos = Min(wp->lp_wr_pos, + wp->lp_rd_pos + warpSize); + /* + * save the private kvars on the warp-buffer + */ + mask = __ballot_sync(__activemask(), htup != NULL); + if (LaneId() == 0) + { + wr_pos = WARP_WRITE_POS(wp,0); + WARP_WRITE_POS(wp,0) += __popc(mask); + } + wr_pos = __shfl_sync(__activemask(), wr_pos, 0); + mask &= ((1U << LaneId()) - 1); + wr_pos += __popc(mask); + if (htup != NULL) + { + index = (wr_pos % UNIT_TUPLES_PER_DEPTH); + memcpy(kvars_addr_wp + index * kcxt->kvars_nbytes, + kcxt->kvars_slot, + kcxt->kvars_nbytes); + } + kcxt->kvars_slot = NULL; + kcxt->kvars_class = NULL; + __syncwarp(); + /* end-of-scan checks */ + if (block_id > kds_src->nitems && /* no more blocks to fetch */ + wp->lp_rd_pos >= wp->lp_wr_pos) /* no more pending tuples */ + { + if (LaneId() == 0) + wp->scan_done = 1; + return 1; + } + /* move to the next depth if more than 32 htuples were fetched */ + return (WARP_WRITE_POS(wp,0) >= WARP_READ_POS(wp,0) + warpSize ? 1 : 0); + } - /* resume kernel from the point where suspended, if any */ - if (kgpuscan->resume_context) + /* + * Here, the number of pending tuples (which is saved in the lp_items[]) is + * not enough to run the ScanQuals checks. So, we move to the next bunch of + * line-items or the next block. + * The pending tuples just passed the MVCC visibility checks, but the + * ScanQuals check is not applied yet. We try to run ScanQuals checks + * with 32 threads simultaneously. + */ + if (block_id == 0) { - part_index = my_suspend->part_index; - line_index = my_suspend->line_index; + /* + * block_id == 0 means this warp is not associated with a particular + * block-page, so we try to fetch the next page. 
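/*
 * All three load routines hand out work warp-uniformly with the same idiom:
 * lane 0 claims a chunk counter with atomicAdd(), then the value is
 * broadcast by __shfl_sync(). A standalone sketch of that idiom, assuming a
 * fully active warp; the example_ name is illustrative, not part of this
 * patch.
 */
static __device__ unsigned int
example_warp_claim(unsigned int *counter)
{
	unsigned int count;

	if ((threadIdx.x & (warpSize - 1)) == 0)
		count = atomicAdd(counter, 1);
	/* every lane returns the value fetched by lane 0 */
	return __shfl_sync(0xffffffffU, count, 0);
}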
+ */ + if (LaneId() == 0) + count = atomicAdd(p_smx_row_count, 1); + count = __shfl_sync(__activemask(), count, 0); + block_id = (get_num_groups() * count + get_group_id()) + 1; + if (LaneId() == 0) + wp->block_id = block_id; } - __syncthreads(); - - for (;;) + if (block_id <= kds_src->nitems) { - cl_uint part_id; - cl_uint line_no; - cl_uint n_lines = 0; + PageHeaderData *pg_page = KDS_BLOCK_PGPAGE(kds_src, block_id-1); + HeapTupleHeaderData *htup = NULL; - part_base = part_index * window_sz + get_group_id() * n_parts; - if (part_base >= kds_src->nitems) - break; - part_id = get_local_id() / part_sz + part_base; - line_no = get_local_id() % part_sz + line_index * part_sz; - - do { - HeapTupleHeaderData *htup = NULL; - ItemPointerData t_self; - PageHeaderData *pg_page; - BlockNumber block_nr; - cl_ushort t_len __attribute__((unused)); - cl_uint nvalids; - cl_uint required = 0; - cl_uint nitems_real; - cl_uint nitems_offset; - cl_uint usage_offset = 0; - cl_uint usage_length = 0; - cl_uint suspend_kernel = 0; - cl_bool rc = false; - cl_char *tup_dclass = NULL; - Datum *tup_values = NULL; - - /* rewind the varlena buffer */ - kcxt->vlpos = kcxt->vlbuf; - - /* identify the block */ - if (thread_is_valid && part_id < kds_src->nitems) + count = __shfl_sync(__activemask(), wp->lp_count, 0); + if (count < PageGetMaxOffsetNumber(pg_page)) + { + count += LaneId(); + if (count < PageGetMaxOffsetNumber(pg_page)) { - pg_page = KERN_DATA_STORE_BLOCK_PGPAGE(kds_src, part_id); - n_lines = PageGetMaxOffsetNumber(pg_page); - block_nr = KERN_DATA_STORE_BLOCK_BLCKNR(kds_src, part_id); - t_self.ip_blkid.bi_hi = block_nr >> 16; - t_self.ip_blkid.bi_lo = block_nr & 0xffff; - t_self.ip_posid = line_no + 1; + ItemIdData *lpp = &pg_page->pd_linp[count]; - if (line_no < n_lines) - { - ItemIdData *lpp = PageGetItemId(pg_page, line_no+1); - if (ItemIdIsNormal(lpp)) - htup = PageGetItem(pg_page, lpp); - t_len = ItemIdGetLength(lpp); - } + assert((char *)lpp < (char *)pg_page + BLCKSZ); + if (ItemIdIsNormal(lpp)) + htup = (HeapTupleHeaderData *)PageGetItem(pg_page, lpp); + else + htup = NULL; } - - /* evaluation of the qualifiers */ - if (htup) + /* put visible tuples on the lp_items[] array */ + mask = __ballot_sync(__activemask(), htup != NULL); + if (LaneId() == 0) { - rc = gpuscan_quals_eval(kcxt, - kds_src, - &t_self, - htup); + wr_pos = wp->lp_wr_pos; + wp->lp_wr_pos += __popc(mask); } - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - goto out_nostat; - - /* how many rows servived WHERE-clause evaluations? 
*/ - nitems_offset = pgstromStairlikeBinaryCount(rc, &nvalids); - if (nvalids > 0) + wr_pos = __shfl_sync(__activemask(), wr_pos, 0); + mask &= ((1U << LaneId()) - 1); + wr_pos += __popc(mask); + if (htup != NULL) { - /* store the result heap-tuple to destination buffer */ - if (rc) - { - tup_dclass = (cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); - - if (!tup_dclass || !tup_values) - { - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, - "out of memory"); - } - else - { - gpuscan_projection_tuple(kcxt, - kds_src, - htup, - &t_self, - tup_dclass, - tup_values); - required = kds_slot_compute_extra(kcxt, - kds_dst, - tup_dclass, - tup_values); - } - } - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - goto out; - /* allocation of the destination buffer */ - usage_offset = pgstromStairlikeSum(__kds_packed(required), - &usage_length); - if (get_local_id() == 0) - { - union { - struct { - cl_uint nitems; - cl_uint usage; - } i; - cl_ulong v64; - } oldval, curval, newval; - - curval.i.nitems = kds_dst->nitems; - curval.i.usage = kds_dst->usage; - do { - newval = oldval = curval; - newval.i.nitems += nvalids; - newval.i.usage += usage_length; - - if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + - __kds_unpack(newval.i.usage) > kds_dst->length) - { - atomicAdd(&kgpuscan->suspend_count, 1); - suspend_kernel = 1; - break; - } - } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - dst_nitems_base = oldval.i.nitems; - dst_usage_base = oldval.i.usage; - } - if (__syncthreads_count(suspend_kernel) > 0) - goto out; - /* store the result heap tuple */ - if (rc) - { - cl_uint dst_index = dst_nitems_base + nitems_offset; - char *dst_extra = ((char *)kds_dst + kds_dst->length - - __kds_unpack(dst_usage_base + - usage_offset) - required); - kds_slot_store_values(kcxt, - kds_dst, - dst_index, - dst_extra, - tup_dclass, - tup_values); - } + wp->lp_items[wr_pos % UNIT_TUPLES_PER_DEPTH] + = __kds_packed((char *)htup - (char *)kds_src); } - /* update statistics */ - nitems_real = __syncthreads_count(htup != NULL); - if (get_local_id() == 0) + if (LaneId() == 0) + wp->lp_count += warpSize; + } + else + { + /* no more tuples to fetch from the current page */ + if (LaneId() == 0) { - total_nitems_in += nitems_real; - total_nitems_out += nvalids; - total_extra_size += __kds_unpack(usage_length); + wp->block_id = 0; + wp->lp_count = 0; } - - /* - * Move to the next window of the line items, if any. - * If no threads in CUDA block wants to continue, exit the loop. 
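/*
 * The wr_pos bookkeeping above is warp-level stream compaction: a ballot
 * builds the survivor mask, lane 0 reserves a contiguous range, and each
 * survivor derives its own slot from the popcount of the lower lanes. A
 * standalone sketch, again assuming a fully active warp; the example_ name
 * is illustrative.
 */
static __device__ unsigned int
example_warp_compact_pos(bool has_item, unsigned int *wr_pos)
{
	unsigned int lane = (threadIdx.x & (warpSize - 1));
	unsigned int mask = __ballot_sync(0xffffffffU, has_item);
	unsigned int base;

	if (lane == 0)
		base = atomicAdd(wr_pos, __popc(mask));
	base = __shfl_sync(0xffffffffU, base, 0);
	/* result is only meaningful on lanes where has_item is true */
	return base + __popc(mask & ((1U << lane) - 1));
}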
- */ - line_index++; - line_no += part_sz; - } while (__syncthreads_count(thread_is_valid && - line_no < n_lines) > 0); - /* move to the next window */ - part_index++; - line_index = 0; - } -out: - /* update statistics */ - if (get_local_id() == 0) - { - atomicAdd(&kgpuscan->nitems_in, total_nitems_in); - atomicAdd(&kgpuscan->nitems_out, total_nitems_out); - atomicAdd(&kgpuscan->extra_size, total_extra_size); - } -out_nostat: - if (get_local_id() == 0) - { - my_suspend->part_index = part_index; - my_suspend->line_index = line_index; + __syncwarp(); + } } + return 0; /* stay depth-0 */ } /* - * gpuscan_main_arrow - GpuScan logic for KDS_FORMAT_ARROW + * __gpuscan_load_source_arrow */ -DEVICE_FUNCTION(void) -gpuscan_main_arrow(kern_context *kcxt, - kern_gpuscan *kgpuscan, - kern_data_store *kds_src, - kern_data_store *kds_dst, - bool has_device_projection) +STATIC_FUNCTION(int) +__gpuscan_load_source_arrow(kern_context *kcxt, + kern_warp_context *wp, + kern_data_store *kds_src, + kern_expression *kexp_load_vars, + kern_expression *kexp_scan_quals, + char *kvars_addr_wp, + uint32_t *p_smx_row_count) { - gpuscanSuspendContext *my_suspend - = KERN_GPUSCAN_SUSPEND_CONTEXT(kgpuscan, get_group_id()); - cl_uint part_index = 0; - cl_uint src_base; - cl_uint src_index; - cl_uint total_nitems_in = 0; /* stat */ - cl_uint total_nitems_out = 0; /* stat */ - cl_uint total_extra_size = 0; /* stat */ - __shared__ cl_uint dst_nitems_base; - __shared__ cl_uint dst_usage_base; - - assert(kds_src->format == KDS_FORMAT_ARROW); - assert(kds_dst->format == KDS_FORMAT_SLOT); - /* quick bailout if any error happen on the prior kernel */ - if (__syncthreads_count(kgpuscan->kerror.errcode) != 0) - return; - /* resume kernel from the point where suspended, if any */ - if (kgpuscan->resume_context) + uint32_t kds_index; + uint32_t count; + uint32_t mask; + uint32_t wr_pos; + bool is_valid = false; + + /* fetch next warpSize tuples */ + if (LaneId() == 0) + count = atomicAdd(p_smx_row_count, 1); + count = __shfl_sync(__activemask(), count, 0); + kds_index = (get_num_groups() * count + get_group_id()) * warpSize; + if (kds_index >= kds_src->nitems) { - assert(my_suspend != NULL); - part_index = my_suspend->part_index; + wp->scan_done = 1; + __syncwarp(__activemask()); + return 1; } + kds_index += LaneId(); - for (src_base = get_global_base() + part_index * get_global_size(); - src_base < kds_src->nitems; - src_base += get_global_size(), part_index++) + if (kds_index < kds_src->nitems) { - kern_tupitem *tupitem __attribute__((unused)); - cl_bool rc; - cl_uint nvalids; - cl_uint required = 0; - cl_uint nitems_offset; - cl_uint usage_offset = 0; - cl_uint usage_length = 0; - cl_uint suspend_kernel = 0; - cl_char *tup_dclass = NULL; - Datum *tup_values = NULL; - - /* rewind the varlena buffer */ - kcxt->vlpos = kcxt->vlbuf; - - /* Evalidation of the rows by WHERE-clause */ - src_index = src_base + get_local_id(); - if (src_index < kds_src->nitems) - rc = gpuscan_quals_eval_arrow(kcxt, kds_src, src_index); - else - rc = false; - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - - /* how many rows servived WHERE-clause evaluation? 
*/ - nitems_offset = pgstromStairlikeBinaryCount(rc, &nvalids); - if (nvalids > 0) - { - if (rc) - { - kcxt->vlpos = kcxt->vlbuf; /* rewind */ - tup_dclass = (cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); - - if (!tup_dclass || !tup_values) - { - STROM_EREPORT(kcxt, ERRCODE_OUT_OF_MEMORY, - "out of memory"); - } - else - { - gpuscan_projection_arrow(kcxt, - kds_src, - src_index, - tup_dclass, - tup_values); - required = kds_slot_compute_extra(kcxt, - kds_dst, - tup_dclass, - tup_values); - } - } - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* allocation of the destination buffer */ - usage_offset = pgstromStairlikeSum(__kds_packed(required), - &usage_length); - if (get_local_id() == 0) - { - union { - struct { - cl_uint nitems; - cl_uint usage; - } i; - cl_ulong v64; - } oldval, curval, newval; - - curval.i.nitems = kds_dst->nitems; - curval.i.usage = kds_dst->usage; - do { - newval = oldval = curval; - newval.i.nitems += nvalids; - newval.i.usage += usage_length; - - if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + - __kds_unpack(newval.i.usage) > kds_dst->length) - { - atomicAdd(&kgpuscan->suspend_count, 1); - suspend_kernel = 1; - break; - } - } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - dst_nitems_base = oldval.i.nitems; - dst_usage_base = oldval.i.usage; - } - if (__syncthreads_count(suspend_kernel) > 0) - break; - /* store the result virtual-tuple on the destination buffer */ - if (rc) - { - cl_uint dst_index = dst_nitems_base + nitems_offset; - char *dst_extra = ((char *)kds_dst + kds_dst->length - - __kds_unpack(dst_usage_base + - usage_offset) - required); - kds_slot_store_values(kcxt, - kds_dst, - dst_index, - dst_extra, - tup_dclass, - tup_values); - } - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - } - /* write back statistics */ - if (get_local_id() == 0) - { - total_nitems_in += Min(kds_src->nitems - src_base, - get_local_size()); - total_nitems_out += nvalids; - total_extra_size += __kds_unpack(usage_length); - } + kcxt->kvars_slot = (kern_variable *)alloca(kcxt->kvars_nbytes); + kcxt->kvars_class = (int *)(kcxt->kvars_slot + kcxt->kvars_nslots); + if (ExecLoadVarsOuterArrow(kcxt, + kexp_load_vars, + kexp_scan_quals, + kds_src, + kds_index)) + is_valid = true; } - /* write back statistics */ - if (get_local_id() == 0) + /* error checks */ + if (__any_sync(__activemask(), kcxt->errcode != 0)) + return -1; + /* + * save the htuple on the local combination buffer (depth=0) + */ + mask = __ballot_sync(__activemask(), is_valid); + if (LaneId() == 0) { - atomicAdd(&kgpuscan->nitems_in, total_nitems_in); - atomicAdd(&kgpuscan->nitems_out, total_nitems_out); - atomicAdd(&kgpuscan->extra_size, total_extra_size); + wr_pos = WARP_WRITE_POS(wp,0); + WARP_WRITE_POS(wp,0) += __popc(mask); } - /* suspend the current position (even if normal exit) */ - if (my_suspend && get_local_id() == 0) + wr_pos = __shfl_sync(__activemask(), wr_pos, 0); + mask &= ((1U << LaneId()) - 1); + wr_pos += __popc(mask); + if (is_valid) { - my_suspend->part_index = part_index; - my_suspend->line_index = 0; + int index = (wr_pos % UNIT_TUPLES_PER_DEPTH); + + memcpy(kvars_addr_wp + index * kcxt->kvars_nbytes, + kcxt->kvars_slot, + kcxt->kvars_nbytes); } + kcxt->kvars_slot = NULL; + kcxt->kvars_class = NULL; + /* move to the next depth if more than 32 
htuples were fetched */ + return (WARP_WRITE_POS(wp,0) >= WARP_READ_POS(wp,0) + warpSize ? 1 : 0); } /* - * gpuscan_main_column - GpuScan logic for KDS_FORMAT_COLUMN + * __gpuscan_load_source_column */ -DEVICE_FUNCTION(void) -gpuscan_main_column(kern_context *kcxt, - kern_gpuscan *kgpuscan, - kern_data_store *kds_src, - kern_data_extra *kds_extra, - kern_data_store *kds_dst) +INLINE_FUNCTION(int) +__gpuscan_load_source_column(kern_context *kcxt, + kern_warp_context *wp, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + kern_expression *kexp_load_vars, + kern_expression *kexp_scan_quals, + char *kvars_addr_wp, + uint32_t *p_smx_row_count) { - gpuscanSuspendContext *my_suspend - = KERN_GPUSCAN_SUSPEND_CONTEXT(kgpuscan, get_group_id()); - cl_uint part_index = 0; - cl_uint src_base; - cl_uint total_nitems_in = 0; - cl_uint total_nitems_out = 0; - cl_uint total_extra_size = 0; - __shared__ cl_uint dst_nitems_base; - __shared__ cl_uint dst_usage_base; + STROM_ELOG(kcxt, "KDS_FORMAT_COLUMN not implemented"); + return -1; +} - assert(kds_src->format == KDS_FORMAT_COLUMN && - kds_dst->format == KDS_FORMAT_SLOT); - /* quick bailout if any error happen on the prior kernel */ - if (__syncthreads_count(kgpuscan->kerror.errcode) != 0) - return; - /* resume kernel from the point where suspended, if any */ - if (kgpuscan->resume_context) +PUBLIC_FUNCTION(int) +execGpuScanLoadSource(kern_context *kcxt, + kern_warp_context *wp, + kern_data_store *kds_src, + kern_data_extra *kds_extra, + kern_expression *kexp_load_vars, + kern_expression *kexp_scan_quals, + char *kvars_addr_wp, + uint32_t *p_smx_row_count) +{ + /* + * Move to the next depth (or projection), if the combination buffer (depth=0) + * may overflow on the next action, or we already reached the KDS tail. + */ + if (wp->scan_done || WARP_WRITE_POS(wp,0) >= WARP_READ_POS(wp,0) + warpSize) + return 1; + + switch (kds_src->format) { - assert(my_suspend != NULL); - part_index = my_suspend->part_index; + case KDS_FORMAT_ROW: + return __gpuscan_load_source_row(kcxt, wp, + kds_src, + kexp_load_vars, + kexp_scan_quals, + kvars_addr_wp, + p_smx_row_count); + case KDS_FORMAT_BLOCK: + return __gpuscan_load_source_block(kcxt, wp, + kds_src, + kexp_load_vars, + kexp_scan_quals, + kvars_addr_wp, + p_smx_row_count); + case KDS_FORMAT_ARROW: + return __gpuscan_load_source_arrow(kcxt, wp, + kds_src, + kexp_load_vars, + kexp_scan_quals, + kvars_addr_wp, + p_smx_row_count); + case KDS_FORMAT_COLUMN: + return __gpuscan_load_source_column(kcxt, wp, + kds_src, + kds_extra, + kexp_load_vars, + kexp_scan_quals, + kvars_addr_wp, + p_smx_row_count); + default: + STROM_ELOG(kcxt, "Bug? 
Unknown KDS format"); + break; } + return -1; +} - for (src_base = get_global_base() + part_index * get_global_size(); - src_base < kds_src->nitems; - src_base += get_global_size(), part_index++) +/* + * kern_gpuscan_main + */ +KERNEL_FUNCTION(void) +kern_gpuscan_main(kern_session_info *session, + kern_gputask *kgtask, + kern_multirels *__kmrels, /* should be NULL */ + kern_data_store *kds_src, + kern_data_extra *kds_extra, + kern_data_store *kds_dst) +{ + kern_context *kcxt; + kern_warp_context *wp, *wp_saved; + uint32_t wp_base_sz; + char *kvars_addr_wp; /* only depth-0 */ + int depth; + __shared__ uint32_t smx_row_count; + + assert(kgtask->kvars_nslots == session->kcxt_kvars_nslots && + kgtask->kvars_nbytes == session->kcxt_kvars_nbytes && + kgtask->n_rels == 0 && + __kmrels == NULL); + /* setup execution context */ + INIT_KERNEL_CONTEXT(kcxt, session); + wp_base_sz = __KERN_WARP_CONTEXT_BASESZ(0); + wp = (kern_warp_context *)SHARED_WORKMEM(wp_base_sz, get_local_id() / warpSize); + wp_saved = KERN_GPUTASK_WARP_CONTEXT(kgtask); + if (kgtask->resume_context) + { + /* resume warp-context from the previous execution */ + if (LaneId() == 0) + memcpy(wp, wp_saved, wp_base_sz); + if (get_local_id() == 0) + smx_row_count = wp->smx_row_count; + depth = __shfl_sync(__activemask(), wp->depth, 0); + } + else { - cl_uint src_index = src_base + get_local_id(); - cl_bool rc = false; - cl_uint nvalids; - cl_uint required = 0; - cl_uint nitems_offset; - cl_uint usage_offset = 0; - cl_uint usage_length = 0; - cl_uint suspend_kernel = 0; - cl_char *tup_dclass = NULL; - Datum *tup_values = NULL; + /* zero clear the wp */ + if (LaneId() == 0) + memset(wp, 0, wp_base_sz); + if (get_local_id() == 0) + smx_row_count = 0; + depth = 0; + } + kvars_addr_wp = ((char *)wp_saved + wp_base_sz); + __syncthreads(); - /* rewind the varlena buffer */ - kcxt->vlpos = kcxt->vlbuf; - /* evaluation of the row using WHERE-clause */ - if (src_index < kds_src->nitems) + while (depth >= 0) + { + kcxt_reset(kcxt); + if (depth == 0) { - if (kern_check_visibility_column(kcxt, kds_src, src_index)) - { - rc = gpuscan_quals_eval_column(kcxt, - kds_src, - kds_extra, - src_index); - } + /* LOAD FROM THE SOURCE */ + depth = execGpuScanLoadSource(kcxt, wp, + kds_src, + kds_extra, + SESSION_KEXP_SCAN_LOAD_VARS(session), + SESSION_KEXP_SCAN_QUALS(session), + kvars_addr_wp, + &smx_row_count); } - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* how many rows servived the evaluation above? 
*/ - nitems_offset = pgstromStairlikeBinaryCount(rc, &nvalids); - if (nvalids > 0) + else { - /* Ok, extract the source columns to form a result row */ - kcxt->vlpos = kcxt->vlbuf; /* rewind */ - if (rc) + bool try_suspend = false; + + assert(depth == 1); + if (session->xpucode_projection) { - tup_dclass = (cl_char *) - kern_context_alloc(kcxt, sizeof(cl_char) * kds_dst->ncols); - tup_values = (Datum *) - kern_context_alloc(kcxt, sizeof(Datum) * kds_dst->ncols); - gpuscan_projection_column(kcxt, - kds_src, - kds_extra, - src_index, - tup_dclass, - tup_values); - required = kds_slot_compute_extra(kcxt, - kds_dst, - tup_dclass, - tup_values); + /* PROJECTION */ + depth = execGpuJoinProjection(kcxt, wp, + 0, /* no inner relations */ + kds_dst, + SESSION_KEXP_PROJECTION(session), + kvars_addr_wp, + &try_suspend); } - /* bailout if any error */ - if (__syncthreads_count(kcxt->errcode) > 0) - break; - /* allocation of the destination buffer */ - usage_offset = pgstromStairlikeSum(__kds_packed(required), - &usage_length); - if (get_local_id() == 0) + else { - union { - struct { - cl_uint nitems; - cl_uint usage; - } i; - cl_ulong v64; - } oldval, curval, newval; - - curval.i.nitems = kds_dst->nitems; - curval.i.usage = kds_dst->usage; - do { - newval = oldval = curval; - newval.i.nitems += nvalids; - newval.i.usage += usage_length; - - if (KERN_DATA_STORE_SLOT_LENGTH(kds_dst, newval.i.nitems) + - __kds_unpack(newval.i.usage) > kds_dst->length) - { - atomicAdd(&kgpuscan->suspend_count, 1); - suspend_kernel = 1; - break; - } - } while ((curval.v64 = atomicCAS((cl_ulong *)&kds_dst->nitems, - oldval.v64, - newval.v64)) != oldval.v64); - dst_nitems_base = oldval.i.nitems; - dst_usage_base = oldval.i.usage; + /* PRE-AGG */ + depth = execGpuPreAggGroupBy(kcxt, wp, + 0, /* no inner relations */ + kds_dst, + kvars_addr_wp, + &try_suspend); } - if (__syncthreads_count(suspend_kernel) > 0) - break; - /* store the result tuple on the destination buffer */ - if (rc) + if (__any_sync(__activemask(), try_suspend)) { - cl_uint dst_index = dst_nitems_base + nitems_offset; - char *dst_extra = ((char *)kds_dst + kds_dst->length - - __kds_unpack(dst_usage_base + - usage_offset) - required); - kds_slot_store_values(kcxt, - kds_dst, - dst_index, - dst_extra, - tup_dclass, - tup_values); + if (LaneId() == 0) + atomicAdd(&kgtask->suspend_count, 1); + assert(depth < 0); } } - /* update statistics */ - if (get_local_id() == 0) - { - total_nitems_in += Min(kds_src->nitems - src_base, - get_local_size()); - total_nitems_out += nvalids; - total_extra_size += __kds_unpack(usage_length); - } + __syncwarp(); } - /* write back statistics */ - if (get_local_id() == 0) - { - atomicAdd(&kgpuscan->nitems_in, total_nitems_in); - atomicAdd(&kgpuscan->nitems_out, total_nitems_out); - atomicAdd(&kgpuscan->extra_size, total_extra_size); - } - /* suspend the current position (even if normal exit) */ - if (my_suspend && get_local_id() == 0) + __syncthreads(); + + if (LaneId() == 0) { - my_suspend->part_index = part_index; - my_suspend->line_index = 0; + wp->depth = depth; + wp->smx_row_count = smx_row_count; + memcpy(wp_saved, wp, wp_base_sz); } + STROM_WRITEBACK_ERROR_STATUS(&kgtask->kerror, kcxt); } diff --git a/next/dpu/Makefile b/src/dpu/Makefile similarity index 100% rename from next/dpu/Makefile rename to src/dpu/Makefile diff --git a/next/dpu/arrow_defs.h b/src/dpu/arrow_defs.h similarity index 100% rename from next/dpu/arrow_defs.h rename to src/dpu/arrow_defs.h diff --git a/next/dpu/dpuserv.c b/src/dpu/dpuserv.c similarity index 
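/*
 * kern_gpuscan_main() above drives the whole kernel as a small state machine
 * over 'depth': depth 0 loads source tuples, depth 1 runs projection or
 * pre-aggregation, and a negative depth terminates (or suspends) the kernel.
 * A plain C sketch of just that control flow; the example_ stubs stand in
 * for the real device routines and are not part of this patch.
 */
static int example_load_source(void) { return  1; }	/* 1: a warp's worth staged */
static int example_projection(void)  { return -1; }	/* -1: end of scan / suspend */

static void
example_depth_loop(void)
{
	int		depth = 0;		/* or restored from a saved warp context */

	while (depth >= 0)
	{
		if (depth == 0)
			depth = example_load_source();
		else
			depth = example_projection();
	}
	/* here the warp context would be written back, ready for resume */
}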
100% rename from next/dpu/dpuserv.c rename to src/dpu/dpuserv.c diff --git a/next/dpu/dpuserv.h b/src/dpu/dpuserv.h similarity index 100% rename from next/dpu/dpuserv.h rename to src/dpu/dpuserv.h diff --git a/next/dpu/float2.h b/src/dpu/float2.h similarity index 100% rename from next/dpu/float2.h rename to src/dpu/float2.h diff --git a/next/dpu/heterodb_extra.h b/src/dpu/heterodb_extra.h similarity index 100% rename from next/dpu/heterodb_extra.h rename to src/dpu/heterodb_extra.h diff --git a/next/dpu/xpu_basetype.cc b/src/dpu/xpu_basetype.cc similarity index 100% rename from next/dpu/xpu_basetype.cc rename to src/dpu/xpu_basetype.cc diff --git a/next/dpu/xpu_basetype.h b/src/dpu/xpu_basetype.h similarity index 100% rename from next/dpu/xpu_basetype.h rename to src/dpu/xpu_basetype.h diff --git a/next/dpu/xpu_common.cc b/src/dpu/xpu_common.cc similarity index 100% rename from next/dpu/xpu_common.cc rename to src/dpu/xpu_common.cc diff --git a/next/dpu/xpu_common.h b/src/dpu/xpu_common.h similarity index 100% rename from next/dpu/xpu_common.h rename to src/dpu/xpu_common.h diff --git a/next/dpu/xpu_misclib.cc b/src/dpu/xpu_misclib.cc similarity index 100% rename from next/dpu/xpu_misclib.cc rename to src/dpu/xpu_misclib.cc diff --git a/next/dpu/xpu_misclib.h b/src/dpu/xpu_misclib.h similarity index 100% rename from next/dpu/xpu_misclib.h rename to src/dpu/xpu_misclib.h diff --git a/next/dpu/xpu_numeric.cc b/src/dpu/xpu_numeric.cc similarity index 100% rename from next/dpu/xpu_numeric.cc rename to src/dpu/xpu_numeric.cc diff --git a/next/dpu/xpu_numeric.h b/src/dpu/xpu_numeric.h similarity index 100% rename from next/dpu/xpu_numeric.h rename to src/dpu/xpu_numeric.h diff --git a/next/dpu/xpu_opcodes.h b/src/dpu/xpu_opcodes.h similarity index 100% rename from next/dpu/xpu_opcodes.h rename to src/dpu/xpu_opcodes.h diff --git a/next/dpu/xpu_textlib.cc b/src/dpu/xpu_textlib.cc similarity index 100% rename from next/dpu/xpu_textlib.cc rename to src/dpu/xpu_textlib.cc diff --git a/next/dpu/xpu_textlib.h b/src/dpu/xpu_textlib.h similarity index 100% rename from next/dpu/xpu_textlib.h rename to src/dpu/xpu_textlib.h diff --git a/next/dpu/xpu_timelib.cc b/src/dpu/xpu_timelib.cc similarity index 100% rename from next/dpu/xpu_timelib.cc rename to src/dpu/xpu_timelib.cc diff --git a/next/dpu/xpu_timelib.h b/src/dpu/xpu_timelib.h similarity index 100% rename from next/dpu/xpu_timelib.h rename to src/dpu/xpu_timelib.h diff --git a/next/dpu_device.c b/src/dpu_device.c similarity index 100% rename from next/dpu_device.c rename to src/dpu_device.c diff --git a/next/dpu_join.c b/src/dpu_join.c similarity index 100% rename from next/dpu_join.c rename to src/dpu_join.c diff --git a/next/dpu_preagg.c b/src/dpu_preagg.c similarity index 100% rename from next/dpu_preagg.c rename to src/dpu_preagg.c diff --git a/next/dpu_scan.c b/src/dpu_scan.c similarity index 100% rename from next/dpu_scan.c rename to src/dpu_scan.c diff --git a/next/executor.c b/src/executor.c similarity index 100% rename from next/executor.c rename to src/executor.c diff --git a/src/extra.c b/src/extra.c index 38b1383df..e208ce63b 100644 --- a/src/extra.c +++ b/src/extra.c @@ -3,8 +3,8 @@ * * Stuff related to invoke HeteroDB Extra Module * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL 
License. @@ -12,14 +12,6 @@ #include #include "pg_strom.h" -/* pg_strom.gpudirect_driver */ -#define GPUDIRECT_DRIVER_TYPE__NONE 1 -#define GPUDIRECT_DRIVER_TYPE__CUFILE 2 -#define GPUDIRECT_DRIVER_TYPE__NVME_STROM 3 - -static struct config_enum_entry pgstrom_gpudirect_driver_options[4]; -static int __pgstrom_gpudirect_driver; /* GUC */ - PG_FUNCTION_INFO_V1(pgstrom_license_query); /* @@ -45,35 +37,14 @@ heterodbExtraModuleInit(void) */ static heterodb_extra_error_info *p_heterodb_extra_error_data = NULL; -static void +static inline void heterodbExtraEreport(int elevel) { - /* see ereport_domain definition */ -#if PG_VERSION_NUM >= 130000 - pg_prevent_errno_in_scope(); - if (errstart(elevel, TEXTDOMAIN)) - { - errcode(ERRCODE_INTERNAL_ERROR); - errmsg("[extra] %s", p_heterodb_extra_error_data->message); - errfinish(p_heterodb_extra_error_data->filename, - p_heterodb_extra_error_data->lineno, - p_heterodb_extra_error_data->funcname); - } -#else -#if PG_VERSION_NUM >= 120000 - pg_prevent_errno_in_scope(); -#endif - if (errstart(elevel, - p_heterodb_extra_error_data->filename, - p_heterodb_extra_error_data->lineno, - p_heterodb_extra_error_data->funcname, - TEXTDOMAIN)) - { - errcode(ERRCODE_INTERNAL_ERROR); - errmsg("%s", p_heterodb_extra_error_data->message); - errfinish(0); - } -#endif + elog(elevel, "(%s; %s:%d) %s", + p_heterodb_extra_error_data->funcname, + p_heterodb_extra_error_data->filename, + p_heterodb_extra_error_data->lineno, + p_heterodb_extra_error_data->message); } /* @@ -103,6 +74,24 @@ heterodbLicenseQuery(char *buf, size_t bufsz) return p_heterodb_license_query(buf, bufsz); } +/* + * heterodbValidateDevice + */ +static int (*p_heterodb_validate_device)(int gpu_device_id, + const char *gpu_device_name, + const char *gpu_device_uuid) = NULL; +bool +heterodbValidateDevice(int gpu_device_id, + const char *gpu_device_name, + const char *gpu_device_uuid) +{ + if (!p_heterodb_validate_device) + return false; + return (p_heterodb_validate_device(gpu_device_id, + gpu_device_name, + gpu_device_uuid) > 0); +} + /* * pgstrom_license_query */ @@ -147,248 +136,155 @@ pgstrom_license_query(PG_FUNCTION_ARGS) /* * gpuDirectInitDriver */ -static int (*p_gpudirect_init_driver)() = NULL; +static void (*p_cufile__driver_init_v2)() = NULL; -int +static void gpuDirectInitDriver(void) { - int rv = -1; - - if (p_gpudirect_init_driver) - { - rv = p_gpudirect_init_driver(); - if (rv) - heterodbExtraEreport(LOG); - } - return rv; + if (!p_cufile__driver_init_v2) + elog(ERROR, "heterodb_extra: cufile__driver_init_v2 is missing"); + p_cufile__driver_init_v2(); } /* * gpuDirectOpenDriver */ -static int (*p_gpudirect_open_driver)() = NULL; -static void +static int (*p_cufile__driver_open_v2)() = NULL; + +bool gpuDirectOpenDriver(void) { - if (!p_gpudirect_open_driver) + if (!p_cufile__driver_open_v2) { - if (p_gpudirect_open_driver()) - heterodbExtraEreport(ERROR); + elog(ERROR, "heterodb_extra: p_cufile__driver_open_v2 is missing"); + return false; } + return (p_cufile__driver_open_v2() == 0); } /* * gpuDirectCloseDriver */ -static int (*p_gpudirect_close_driver)() = NULL; -static bool gpudirect_close_driver_is_registered = false; - -static void -gpuDirectCloseDriverOnExit(int code, Datum arg) -{ - if (p_gpudirect_close_driver) - { - if (p_gpudirect_close_driver()) - heterodbExtraEreport(LOG); - } -} - -/* - * gpuDirectFileDescOpen - */ -static int (*p_gpudirect_file_desc_open)( - GPUDirectFileDesc *gds_fdesc, - int rawfd, const char *pathname) = NULL; - -void -gpuDirectFileDescOpen(GPUDirectFileDesc 
*gds_fdesc, File pg_fdesc) -{ - int rawfd = FileGetRawDesc(pg_fdesc); - char *pathname = FilePathName(pg_fdesc); - - if (!gpudirect_close_driver_is_registered) - { - gpuDirectOpenDriver(); - on_proc_exit(gpuDirectCloseDriverOnExit, 0); - gpudirect_close_driver_is_registered = true; - } - if (p_gpudirect_file_desc_open(gds_fdesc, rawfd, pathname)) - heterodbExtraEreport(ERROR); -} - -/* - * gpuDirectFileDescOpenByPath - */ -static int (*p_gpudirect_file_desc_open_by_path)( - GPUDirectFileDesc *gds_fdesc, - const char *pathname) = NULL; +static int (*p_cufile__driver_close_v2)() = NULL; void -gpuDirectFileDescOpenByPath(GPUDirectFileDesc *gds_fdesc, - const char *pathname) +gpuDirectCloseDriver(void) { - if (!gpudirect_close_driver_is_registered) + if (p_cufile__driver_close_v2) { - gpuDirectOpenDriver(); - on_proc_exit(gpuDirectCloseDriverOnExit, 0); - gpudirect_close_driver_is_registered = true; + if (p_cufile__driver_close_v2() != 0) + heterodbExtraEreport(LOG); } - if (p_gpudirect_file_desc_open_by_path(gds_fdesc, pathname)) - heterodbExtraEreport(ERROR); -} - -/* - * gpuDirectFileDescClose - */ -static void (*p_gpudirect_file_desc_close)( - const GPUDirectFileDesc *gds_fdesc) = NULL; - -void -gpuDirectFileDescClose(const GPUDirectFileDesc *gds_fdesc) -{ - Assert(p_gpudirect_file_desc_close != NULL); - p_gpudirect_file_desc_close(gds_fdesc); } /* * gpuDirectMapGpuMemory */ -static CUresult (*p_gpudirect_map_gpu_memory)( - CUdeviceptr m_segment, - size_t m_segment_sz, - unsigned long *p_iomap_handle) = NULL; - -CUresult +static int (*p_cufile__map_gpu_memory_v2)(CUdeviceptr m_segment, + size_t segment_sz) = NULL; +bool gpuDirectMapGpuMemory(CUdeviceptr m_segment, - size_t m_segment_sz, - unsigned long *p_iomap_handle) + size_t segment_sz) { - Assert(p_gpudirect_map_gpu_memory != NULL); - return p_gpudirect_map_gpu_memory(m_segment, m_segment_sz, p_iomap_handle); + if (!p_cufile__map_gpu_memory_v2) + return false; + return (p_cufile__map_gpu_memory_v2(m_segment, segment_sz) == 0); } /* * gpuDirectUnmapGpuMemory */ -static CUresult (*p_gpudirect_unmap_gpu_memory)( - CUdeviceptr m_segment, - unsigned long iomap_handle) = NULL; +static int (*p_cufile__unmap_gpu_memory_v2)(CUdeviceptr m_segment) = NULL; -CUresult -gpuDirectUnmapGpuMemory(CUdeviceptr m_segment, - unsigned long iomap_handle) +bool +gpuDirectUnmapGpuMemory(CUdeviceptr m_segment) { - Assert(p_gpudirect_unmap_gpu_memory != NULL); - return p_gpudirect_unmap_gpu_memory(m_segment, iomap_handle); + if (!p_cufile__unmap_gpu_memory_v2) + return false; + return (p_cufile__unmap_gpu_memory_v2(m_segment) == 0); } /* * gpuDirectFileReadIOV */ -static int (*p_gpudirect_file_read_iov)( - const GPUDirectFileDesc *gds_fdesc, +static int (*p_cufile__read_file_iov_v2)( + const char *pathname, CUdeviceptr m_segment, - unsigned long iomap_handle, off_t m_offset, - strom_io_vector *iovec) = NULL; + const strom_io_vector *iovec) = NULL; -void -gpuDirectFileReadIOV(const GPUDirectFileDesc *gds_fdesc, +bool +gpuDirectFileReadIOV(const char *pathname, CUdeviceptr m_segment, - unsigned long iomap_handle, off_t m_offset, - strom_io_vector *iovec) + const strom_io_vector *iovec) { - Assert(p_gpudirect_file_read_iov != NULL); - if (p_gpudirect_file_read_iov(gds_fdesc, - m_segment, - iomap_handle, - m_offset, - iovec)) - werror("failed on gpuDirectFileReadIOV"); + if (!p_cufile__read_file_iov_v2) + return false; + return (p_cufile__read_file_iov_v2(pathname, + m_segment, + m_offset, + iovec) == 0); } /* - * extraSysfsSetupDistanceMap + * gpuDirectGetProperty 
*/ -static int (*p_sysfs_setup_distance_map)( - int gpu_count, - GpuPciDevItem *gpu_array, - const char *manual_config) = NULL; - -void -extraSysfsSetupDistanceMap(const char *manual_config) +static int (*p_cufile__get_property_v2)(char *buffer, + size_t buffer_sz) = NULL; +char * +gpuDirectGetProperty(void) { - GpuPciDevItem *gpu_array; - int i; + char buffer[2000]; - if (!p_sysfs_setup_distance_map) - return; /* nothing to do */ - - gpu_array = alloca(numDevAttrs * sizeof(GpuPciDevItem)); - memset(gpu_array, 0, numDevAttrs * sizeof(GpuPciDevItem)); - for (i=0; i < numDevAttrs; i++) - { - DevAttributes *dattr = &devAttrs[i]; - GpuPciDevItem *gpu = &gpu_array[i]; - - gpu->device_id = dattr->DEV_ID; - strncpy(gpu->device_name, dattr->DEV_NAME, - sizeof(gpu->device_name)); - gpu->pci_domain = dattr->PCI_DOMAIN_ID; - gpu->pci_bus_id = dattr->PCI_BUS_ID; - gpu->pci_dev_id = dattr->PCI_DEVICE_ID; - if (dattr->MULTI_GPU_BOARD) - gpu->pci_func_id = dattr->MULTI_GPU_BOARD_GROUP_ID; - } - if (p_sysfs_setup_distance_map(numDevAttrs, - gpu_array, - manual_config) < 0) + if (!p_cufile__get_property_v2) + elog(ERROR, "heterodb_extra: cufile__get_property_v2 is missing"); + if (p_cufile__get_property_v2(buffer, sizeof(buffer)) < 0) heterodbExtraEreport(ERROR); + return pstrdup(buffer); } /* - * extraSysfsLookupOptimalGpu + * gpuDirectSetProperty */ -static int (*p_sysfs_lookup_optimal_gpus)(int fdesc, - int nrooms, - int *optimal_gpus) = NULL; -Bitmapset * -extraSysfsLookupOptimalGpus(File filp) +static int (*p_cufile__set_property_v2)(const char *key, + const char *value) = NULL; +void +gpuDirectSetProperty(const char *key, const char *value) { - Bitmapset *optimal_gpus = NULL; - int fdesc = FileGetRawDesc(filp); - int i, nitems; - int *__gpus; - - if (!p_sysfs_lookup_optimal_gpus || numDevAttrs == 0) - return NULL; - __gpus = alloca(sizeof(int) * numDevAttrs); - nitems = p_sysfs_lookup_optimal_gpus(fdesc, numDevAttrs, __gpus); - if (nitems < 0) + if (!p_cufile__set_property_v2) + elog(ERROR, "heterodb_extra: cufile__set_property_v2 is missing"); + if (p_cufile__set_property_v2(key, value) != 0) heterodbExtraEreport(ERROR); - for (i=0; i < nitems; i++) - { - Assert(__gpus[i] >= 0 && __gpus[i] < numDevAttrs); - optimal_gpus = bms_add_member(optimal_gpus, __gpus[i]); - } - return optimal_gpus; } /* - * extraSysfsPrintNvmeInfo + * gpuDirectIsSupported */ -static ssize_t (*p_sysfs_print_nvme_info)( - int index, - char *buffer, - ssize_t buffer_sz) = NULL; - -ssize_t -extraSysfsPrintNvmeInfo(int index, char *buffer, ssize_t buffer_sz) +bool +gpuDirectIsAvailable(void) { - if (!p_sysfs_print_nvme_info) - return -1; - return p_sysfs_print_nvme_info(index, buffer, buffer_sz); + bool has_gpudirectsql_supported = false; + + if (p_cufile__driver_init_v2 && + p_cufile__driver_open_v2 && + p_cufile__driver_close_v2 && + p_cufile__map_gpu_memory_v2 && + p_cufile__unmap_gpu_memory_v2 && + p_cufile__read_file_iov_v2 && + p_cufile__get_property_v2 && + p_cufile__set_property_v2) + { + for (int i=0; i < numGpuDevAttrs; i++) + { + if (gpuDevAttrs[i].DEV_SUPPORT_GPUDIRECTSQL) + { + has_gpudirectsql_supported = true; + break; + } + } + } + return has_gpudirectsql_supported; } /* lookup_heterodb_extra_function */ @@ -406,36 +302,18 @@ lookup_heterodb_extra_function(void *handle, const char *symbol) #define LOOKUP_HETERODB_EXTRA_FUNCTION(symbol) \ p_##symbol = lookup_heterodb_extra_function(handle, #symbol) -/* lookup_gpudirect_function */ -static void * -lookup_gpudirect_function(void *handle, const char *prefix, const char 
*func_name) -{ - char symbol[128]; - - snprintf(symbol, sizeof(symbol), "%s__%s", prefix, func_name); - return lookup_heterodb_extra_function(handle, symbol); -} - -#define LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix,func_name) \ - p_gpudirect_##func_name = lookup_gpudirect_function(handle, prefix, #func_name) - /* * parse_heterodb_extra_module_info */ static void parse_heterodb_extra_module_info(const char *extra_module_info, uint32 *p_api_version, - bool *p_has_cufile, - bool *p_has_nvme_strom, - int *p_default_gpudirect_driver) + bool *p_has_cufile) { char *buffer; long api_version = 0; bool has_cufile = false; - bool has_nvme_strom = false; - int default_gpudirect_driver = GPUDIRECT_DRIVER_TYPE__NONE; char *tok, *pos, *end; - struct config_enum_entry *entry; buffer = alloca(strlen(extra_module_info) + 1); strcpy(buffer, extra_module_info); @@ -458,50 +336,12 @@ parse_heterodb_extra_module_info(const char *extra_module_info, else elog(ERROR, "invalid extra module token [%s]", tok); } - else if (strncmp(tok, "nvme_strom=", 11) == 0) - { - if (strcmp(tok+11, "on") == 0) - has_nvme_strom = true; - else if (strcmp(tok+11, "off") == 0) - has_nvme_strom = false; - else - elog(ERROR, "invalid extra module token [%s]", tok); - } } - if (api_version < HETERODB_EXTRA_API_VERSION) elog(ERROR, "HeteroDB Extra Module has Unsupported API version [%08lu]", api_version); - - /* setup pgstrom.gpudirect_driver options */ - entry = pgstrom_gpudirect_driver_options; - entry->name = "none"; - entry->val = GPUDIRECT_DRIVER_TYPE__NONE; - entry->hidden = false; - entry++; - - if (has_nvme_strom) - { - default_gpudirect_driver = GPUDIRECT_DRIVER_TYPE__NVME_STROM; - entry->name = "nvme_strom"; - entry->val = GPUDIRECT_DRIVER_TYPE__NVME_STROM; - entry->hidden = false; - entry++; - } - if (has_cufile) - { - default_gpudirect_driver = GPUDIRECT_DRIVER_TYPE__CUFILE; - entry->name = "cufile"; - entry->val = GPUDIRECT_DRIVER_TYPE__CUFILE; - entry->hidden = false; - entry++; - } - memset(entry, 0, sizeof(struct config_enum_entry)); - *p_api_version = api_version; *p_has_cufile = has_cufile; - *p_has_nvme_strom = has_nvme_strom; - *p_default_gpudirect_driver = default_gpudirect_driver; } /* @@ -510,7 +350,6 @@ parse_heterodb_extra_module_info(const char *extra_module_info, void pgstrom_init_extra(void) { - const char *prefix = NULL; void *handle; char *license; char *extra_module_info; @@ -532,71 +371,45 @@ pgstrom_init_extra(void) { uint32 api_version = 0; bool has_cufile = false; - bool has_nvme_strom = false; - int default_gpudirect_driver; LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_extra_error_data); LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_extra_module_init); extra_module_info = heterodbExtraModuleInit(); parse_heterodb_extra_module_info(extra_module_info, &api_version, - &has_cufile, - &has_nvme_strom, - &default_gpudirect_driver); - if (api_version < HETERODB_EXTRA_API_VERSION) - elog(ERROR, "HeteroDB Extra module is too old [API version=%u]", - api_version); - /* pg_strom.gpudirect_driver */ - DefineCustomEnumVariable("pg_strom.gpudirect_driver", - "Selection of the GPUDirectSQL Driver", - NULL, - &__pgstrom_gpudirect_driver, - default_gpudirect_driver, - pgstrom_gpudirect_driver_options, - PGC_POSTMASTER, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - if (__pgstrom_gpudirect_driver == GPUDIRECT_DRIVER_TYPE__CUFILE) - prefix = "cufile"; - else if (__pgstrom_gpudirect_driver == GPUDIRECT_DRIVER_TYPE__NVME_STROM) - prefix = "nvme_strom"; - - if (prefix) + &has_cufile); + if (has_cufile) { - 
LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, init_driver); - LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, open_driver); - LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, close_driver); - LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, file_desc_open); - LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, file_desc_open_by_path); - LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, file_desc_close); - LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, map_gpu_memory); - LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, unmap_gpu_memory); - LOOKUP_GPUDIRECT_EXTRA_FUNCTION(prefix, file_read_iov); + LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__driver_init_v2); + LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__driver_open_v2); + LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__driver_close_v2); + LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__map_gpu_memory_v2); + LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__unmap_gpu_memory_v2); + LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__read_file_iov_v2); + LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__get_property_v2); + LOOKUP_HETERODB_EXTRA_FUNCTION(cufile__set_property_v2); + + gpuDirectInitDriver(); } - LOOKUP_HETERODB_EXTRA_FUNCTION(sysfs_setup_distance_map); - LOOKUP_HETERODB_EXTRA_FUNCTION(sysfs_lookup_optimal_gpus); - LOOKUP_HETERODB_EXTRA_FUNCTION(sysfs_print_nvme_info); LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_license_reload); LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_license_query); + LOOKUP_HETERODB_EXTRA_FUNCTION(heterodb_validate_device); } PG_CATCH(); { p_heterodb_extra_error_data = NULL; p_heterodb_extra_module_init = NULL; - p_gpudirect_init_driver = NULL; - p_gpudirect_open_driver = NULL; - p_gpudirect_close_driver = NULL; - p_gpudirect_file_desc_open = NULL; - p_gpudirect_file_desc_open_by_path = NULL; - p_gpudirect_file_desc_close = NULL; - p_gpudirect_map_gpu_memory = NULL; - p_gpudirect_unmap_gpu_memory = NULL; - p_gpudirect_file_read_iov = NULL; - p_sysfs_setup_distance_map = NULL; - p_sysfs_lookup_optimal_gpus = NULL; - p_sysfs_print_nvme_info = NULL; + p_cufile__driver_init_v2 = NULL; + p_cufile__driver_open_v2 = NULL; + p_cufile__driver_close_v2 = NULL; + p_cufile__map_gpu_memory_v2 = NULL; + p_cufile__unmap_gpu_memory_v2 = NULL; + p_cufile__read_file_iov_v2 = NULL; + p_cufile__get_property_v2 = NULL; + p_cufile__set_property_v2 = NULL; p_heterodb_license_reload = NULL; p_heterodb_license_query = NULL; + p_heterodb_validate_device = NULL; PG_RE_THROW(); } PG_END_TRY(); diff --git a/src/float2.c b/src/float2.c index 4d3d1bfa4..3e591b48c 100644 --- a/src/float2.c +++ b/src/float2.c @@ -3,8 +3,8 @@ * * half-precision floating point data type support * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. 
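/*
 * Illustrative sketch (not part of this patch): the float2.c hunk below
 * replaces the old int16-reinterpretation macros with FP16-aware
 * accessors that widen on fetch and narrow on return, so SQL-callable
 * functions no longer touch the half-precision bit pattern directly.
 * The accessor macros are file-local to float2.c, so this sketch only
 * compiles there; pgstrom_float2_square is a hypothetical example, not
 * a function added by the patch.
 */
PG_FUNCTION_INFO_V1(pgstrom_float2_square);

Datum
pgstrom_float2_square(PG_FUNCTION_ARGS)
{
	/* widen the half-precision argument to float4 */
	float	fval = PG_GETARG_FP16_AS_FP32(0);

	/* compute in fp32, then narrow back to fp16 on return */
	PG_RETURN_FP32_AS_FP16(fval * fval);
}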
@@ -12,16 +12,27 @@ #include "pg_strom.h" #include "float2.h" -#define PG_GETARG_FLOAT2(x) PG_GETARG_INT16(x) -#define PG_RETURN_FLOAT2(x) PG_RETURN_INT16(x) -#define DatumGetFloat2(x) DatumGetInt16(x) -#define Float2GetDatum(x) Int16GetDatum(x) +#ifndef EMULATE_FLOAT2 +#define PG_GETARG_FP16(x) __short_as_half__(PG_GETARG_UINT16(x)) +#define PG_GETARG_FP16_AS_FP32(x) ((float)PG_GETARG_FP16(x)) +#define PG_GETARG_FP16_AS_FP64(x) ((double)PG_GETARG_FP16(x)) +#define PG_RETURN_FP16(x) PG_RETURN_UINT16(__half_as_short__(x)) +#define PG_RETURN_FP32_AS_FP16(x) PG_RETURN_FP16((float2_t)(x)) +#define PG_RETURN_FP64_AS_FP16(x) PG_RETURN_FP16((float2_t)(x)) +#else +#define PG_GETARG_FP16(x) PG_GETARG_UINT16(x) +#define PG_GETARG_FP16_AS_FP32(x) fp16_to_fp32(PG_GETARG_FP16(x)) +#define PG_GETARG_FP16_AS_FP64(x) fp16_to_fp64(PG_GETARG_FP16(x)) +#define PG_RETURN_FP16(x) PG_RETURN_UINT16(x) +#define PG_RETURN_FP32_AS_FP16(x) PG_RETURN_FP16(fp32_to_fp16(x)) +#define PG_RETURN_FP64_AS_FP16(x) PG_RETURN_FP16(fp64_to_fp16(x)) +#endif /* type i/o handler */ -PG_FUNCTION_INFO_V1(pgstrom_float2_in); -PG_FUNCTION_INFO_V1(pgstrom_float2_out); -PG_FUNCTION_INFO_V1(pgstrom_float2_recv); -PG_FUNCTION_INFO_V1(pgstrom_float2_send); +PG_FUNCTION_INFO_V1(pgstrom_float2in); +PG_FUNCTION_INFO_V1(pgstrom_float2out); +PG_FUNCTION_INFO_V1(pgstrom_float2recv); +PG_FUNCTION_INFO_V1(pgstrom_float2send); /* type cast */ PG_FUNCTION_INFO_V1(pgstrom_float2_to_float4); PG_FUNCTION_INFO_V1(pgstrom_float2_to_float8); @@ -38,95 +49,88 @@ PG_FUNCTION_INFO_V1(pgstrom_int4_to_float2); PG_FUNCTION_INFO_V1(pgstrom_int8_to_float2); PG_FUNCTION_INFO_V1(pgstrom_numeric_to_float2); /* type comparison */ -PG_FUNCTION_INFO_V1(pgstrom_float2_eq); -PG_FUNCTION_INFO_V1(pgstrom_float2_ne); -PG_FUNCTION_INFO_V1(pgstrom_float2_lt); -PG_FUNCTION_INFO_V1(pgstrom_float2_le); -PG_FUNCTION_INFO_V1(pgstrom_float2_gt); -PG_FUNCTION_INFO_V1(pgstrom_float2_ge); -PG_FUNCTION_INFO_V1(pgstrom_float2_cmp); -PG_FUNCTION_INFO_V1(pgstrom_float2_larger); -PG_FUNCTION_INFO_V1(pgstrom_float2_smaller); -PG_FUNCTION_INFO_V1(pgstrom_float2_hash); - -PG_FUNCTION_INFO_V1(pgstrom_float42_eq); -PG_FUNCTION_INFO_V1(pgstrom_float42_ne); -PG_FUNCTION_INFO_V1(pgstrom_float42_lt); -PG_FUNCTION_INFO_V1(pgstrom_float42_le); -PG_FUNCTION_INFO_V1(pgstrom_float42_gt); -PG_FUNCTION_INFO_V1(pgstrom_float42_ge); -PG_FUNCTION_INFO_V1(pgstrom_float42_cmp); - -PG_FUNCTION_INFO_V1(pgstrom_float82_eq); -PG_FUNCTION_INFO_V1(pgstrom_float82_ne); -PG_FUNCTION_INFO_V1(pgstrom_float82_lt); -PG_FUNCTION_INFO_V1(pgstrom_float82_le); -PG_FUNCTION_INFO_V1(pgstrom_float82_gt); -PG_FUNCTION_INFO_V1(pgstrom_float82_ge); -PG_FUNCTION_INFO_V1(pgstrom_float82_cmp); - -PG_FUNCTION_INFO_V1(pgstrom_float24_eq); -PG_FUNCTION_INFO_V1(pgstrom_float24_ne); -PG_FUNCTION_INFO_V1(pgstrom_float24_lt); -PG_FUNCTION_INFO_V1(pgstrom_float24_le); -PG_FUNCTION_INFO_V1(pgstrom_float24_gt); -PG_FUNCTION_INFO_V1(pgstrom_float24_ge); -PG_FUNCTION_INFO_V1(pgstrom_float24_cmp); - -PG_FUNCTION_INFO_V1(pgstrom_float28_eq); -PG_FUNCTION_INFO_V1(pgstrom_float28_ne); -PG_FUNCTION_INFO_V1(pgstrom_float28_lt); -PG_FUNCTION_INFO_V1(pgstrom_float28_le); -PG_FUNCTION_INFO_V1(pgstrom_float28_gt); -PG_FUNCTION_INFO_V1(pgstrom_float28_ge); -PG_FUNCTION_INFO_V1(pgstrom_float28_cmp); +PG_FUNCTION_INFO_V1(pgstrom_float2eq); +PG_FUNCTION_INFO_V1(pgstrom_float2ne); +PG_FUNCTION_INFO_V1(pgstrom_float2lt); +PG_FUNCTION_INFO_V1(pgstrom_float2le); +PG_FUNCTION_INFO_V1(pgstrom_float2gt); +PG_FUNCTION_INFO_V1(pgstrom_float2ge); 
+PG_FUNCTION_INFO_V1(pgstrom_float2cmp); +PG_FUNCTION_INFO_V1(pgstrom_float2larger); +PG_FUNCTION_INFO_V1(pgstrom_float2smaller); +PG_FUNCTION_INFO_V1(pgstrom_float2hash); + +PG_FUNCTION_INFO_V1(pgstrom_float42eq); +PG_FUNCTION_INFO_V1(pgstrom_float42ne); +PG_FUNCTION_INFO_V1(pgstrom_float42lt); +PG_FUNCTION_INFO_V1(pgstrom_float42le); +PG_FUNCTION_INFO_V1(pgstrom_float42gt); +PG_FUNCTION_INFO_V1(pgstrom_float42ge); +PG_FUNCTION_INFO_V1(pgstrom_float42cmp); + +PG_FUNCTION_INFO_V1(pgstrom_float82eq); +PG_FUNCTION_INFO_V1(pgstrom_float82ne); +PG_FUNCTION_INFO_V1(pgstrom_float82lt); +PG_FUNCTION_INFO_V1(pgstrom_float82le); +PG_FUNCTION_INFO_V1(pgstrom_float82gt); +PG_FUNCTION_INFO_V1(pgstrom_float82ge); +PG_FUNCTION_INFO_V1(pgstrom_float82cmp); + +PG_FUNCTION_INFO_V1(pgstrom_float24eq); +PG_FUNCTION_INFO_V1(pgstrom_float24ne); +PG_FUNCTION_INFO_V1(pgstrom_float24lt); +PG_FUNCTION_INFO_V1(pgstrom_float24le); +PG_FUNCTION_INFO_V1(pgstrom_float24gt); +PG_FUNCTION_INFO_V1(pgstrom_float24ge); +PG_FUNCTION_INFO_V1(pgstrom_float24cmp); + +PG_FUNCTION_INFO_V1(pgstrom_float28eq); +PG_FUNCTION_INFO_V1(pgstrom_float28ne); +PG_FUNCTION_INFO_V1(pgstrom_float28lt); +PG_FUNCTION_INFO_V1(pgstrom_float28le); +PG_FUNCTION_INFO_V1(pgstrom_float28gt); +PG_FUNCTION_INFO_V1(pgstrom_float28ge); +PG_FUNCTION_INFO_V1(pgstrom_float28cmp); /* unary operators */ -PG_FUNCTION_INFO_V1(pgstrom_float2_up); -PG_FUNCTION_INFO_V1(pgstrom_float2_um); -PG_FUNCTION_INFO_V1(pgstrom_float2_abs); +PG_FUNCTION_INFO_V1(pgstrom_float2up); +PG_FUNCTION_INFO_V1(pgstrom_float2um); +PG_FUNCTION_INFO_V1(pgstrom_float2abs); /* arithmetric operators */ -PG_FUNCTION_INFO_V1(pgstrom_float2_pl); -PG_FUNCTION_INFO_V1(pgstrom_float2_mi); -PG_FUNCTION_INFO_V1(pgstrom_float2_mul); -PG_FUNCTION_INFO_V1(pgstrom_float2_div); - -PG_FUNCTION_INFO_V1(pgstrom_float24_pl); -PG_FUNCTION_INFO_V1(pgstrom_float24_mi); -PG_FUNCTION_INFO_V1(pgstrom_float24_mul); -PG_FUNCTION_INFO_V1(pgstrom_float24_div); - -PG_FUNCTION_INFO_V1(pgstrom_float28_pl); -PG_FUNCTION_INFO_V1(pgstrom_float28_mi); -PG_FUNCTION_INFO_V1(pgstrom_float28_mul); -PG_FUNCTION_INFO_V1(pgstrom_float28_div); - -PG_FUNCTION_INFO_V1(pgstrom_float42_pl); -PG_FUNCTION_INFO_V1(pgstrom_float42_mi); -PG_FUNCTION_INFO_V1(pgstrom_float42_mul); -PG_FUNCTION_INFO_V1(pgstrom_float42_div); - -PG_FUNCTION_INFO_V1(pgstrom_float82_pl); -PG_FUNCTION_INFO_V1(pgstrom_float82_mi); -PG_FUNCTION_INFO_V1(pgstrom_float82_mul); -PG_FUNCTION_INFO_V1(pgstrom_float82_div); +PG_FUNCTION_INFO_V1(pgstrom_float2pl); +PG_FUNCTION_INFO_V1(pgstrom_float2mi); +PG_FUNCTION_INFO_V1(pgstrom_float2mul); +PG_FUNCTION_INFO_V1(pgstrom_float2div); + +PG_FUNCTION_INFO_V1(pgstrom_float24pl); +PG_FUNCTION_INFO_V1(pgstrom_float24mi); +PG_FUNCTION_INFO_V1(pgstrom_float24mul); +PG_FUNCTION_INFO_V1(pgstrom_float24div); + +PG_FUNCTION_INFO_V1(pgstrom_float28pl); +PG_FUNCTION_INFO_V1(pgstrom_float28mi); +PG_FUNCTION_INFO_V1(pgstrom_float28mul); +PG_FUNCTION_INFO_V1(pgstrom_float28div); + +PG_FUNCTION_INFO_V1(pgstrom_float42pl); +PG_FUNCTION_INFO_V1(pgstrom_float42mi); +PG_FUNCTION_INFO_V1(pgstrom_float42mul); +PG_FUNCTION_INFO_V1(pgstrom_float42div); + +PG_FUNCTION_INFO_V1(pgstrom_float82pl); +PG_FUNCTION_INFO_V1(pgstrom_float82mi); +PG_FUNCTION_INFO_V1(pgstrom_float82mul); +PG_FUNCTION_INFO_V1(pgstrom_float82div); /* misc functions */ PG_FUNCTION_INFO_V1(pgstrom_cash_mul_flt2); PG_FUNCTION_INFO_V1(pgstrom_flt2_mul_cash); PG_FUNCTION_INFO_V1(pgstrom_cash_div_flt2); -PG_FUNCTION_INFO_V1(pgstrom_float8_as_int8); 
-PG_FUNCTION_INFO_V1(pgstrom_float4_as_int4); -PG_FUNCTION_INFO_V1(pgstrom_float2_as_int2); -PG_FUNCTION_INFO_V1(pgstrom_int8_as_float8); -PG_FUNCTION_INFO_V1(pgstrom_int4_as_float4); -PG_FUNCTION_INFO_V1(pgstrom_int2_as_float2); PG_FUNCTION_INFO_V1(pgstrom_float2_accum); PG_FUNCTION_INFO_V1(pgstrom_float2_sum); -PG_FUNCTION_INFO_V1(pgstrom_define_shell_type); static inline void -print_fp16(const char *prefix, cl_uint value) +print_fp16(const char *prefix, uint32 value) { elog(INFO, "%sFP16 0x%04x = %d + %d + 0x%04x", prefix ? prefix : "", @@ -137,7 +141,7 @@ print_fp16(const char *prefix, cl_uint value) } static inline void -print_fp32(const char *prefix, cl_uint value) +print_fp32(const char *prefix, uint32 value) { elog(INFO, "%sFP32 0x%08x = %d + %d + 0x%08x", prefix ? prefix : "", @@ -148,7 +152,7 @@ print_fp32(const char *prefix, cl_uint value) } static inline void -print_fp64(const char *prefix, cl_ulong value) +print_fp64(const char *prefix, uint64 value) { elog(INFO, "%sFP64 0x%016lx = %d + %ld + %014lx", prefix ? prefix : "", @@ -175,46 +179,41 @@ print_fp64(const char *prefix, cl_ulong value) } while(0) /* - * pgstrom_float2_in + * pgstrom_float2in */ Datum -pgstrom_float2_in(PG_FUNCTION_ARGS) +pgstrom_float2in(PG_FUNCTION_ARGS) { - Datum datum = float4in(fcinfo); - float fval; - - if (fcinfo->isnull) - PG_RETURN_NULL(); - fval = DatumGetFloat4(datum); + float fval = DatumGetFloat4(float4in(fcinfo)); - PG_RETURN_FLOAT2(fp32_to_fp16(fval)); + PG_RETURN_FP32_AS_FP16(fval); } /* - * pgstrom_float2_out + * pgstrom_float2out */ Datum -pgstrom_float2_out(PG_FUNCTION_ARGS) +pgstrom_float2out(PG_FUNCTION_ARGS) { - float fval = fp16_to_fp32((half_t)PG_GETARG_FLOAT2(0)); + float fval = PG_GETARG_FP16_AS_FP32(0); return DirectFunctionCall1(float4out, Float4GetDatum(fval)); } /* - * pgstrom_float2_recv + * pgstrom_float2recv */ Datum -pgstrom_float2_recv(PG_FUNCTION_ARGS) +pgstrom_float2recv(PG_FUNCTION_ARGS) { return int2recv(fcinfo); } /* - * pgstrom_float2_send + * pgstrom_float2send */ Datum -pgstrom_float2_send(PG_FUNCTION_ARGS) +pgstrom_float2send(PG_FUNCTION_ARGS) { return int2send(fcinfo); } @@ -225,9 +224,9 @@ pgstrom_float2_send(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_float4(PG_FUNCTION_ARGS) { - half_t fval = PG_GETARG_FLOAT2(0); + float fval = PG_GETARG_FP16_AS_FP32(0); - PG_RETURN_FLOAT4(fp16_to_fp32(fval)); + PG_RETURN_FLOAT4(fval); } /* @@ -236,9 +235,9 @@ pgstrom_float2_to_float4(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_float8(PG_FUNCTION_ARGS) { - half_t fval = PG_GETARG_FLOAT2(0); + double fval = PG_GETARG_FP16_AS_FP64(0); - PG_RETURN_FLOAT8(fp16_to_fp64(fval)); + PG_RETURN_FLOAT8(fval); } /* @@ -247,7 +246,7 @@ pgstrom_float2_to_float8(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_int1(PG_FUNCTION_ARGS) { - float fval = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float fval = PG_GETARG_FP16_AS_FP32(0); Datum ival = DirectFunctionCall1(ftoi4, Float4GetDatum(fval)); if (DatumGetInt32(ival) < SCHAR_MIN || @@ -264,7 +263,7 @@ pgstrom_float2_to_int1(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_int2(PG_FUNCTION_ARGS) { - float fval = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float fval = PG_GETARG_FP16_AS_FP32(0); return DirectFunctionCall1(ftoi2, Float4GetDatum(fval)); } @@ -275,7 +274,7 @@ pgstrom_float2_to_int2(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_int4(PG_FUNCTION_ARGS) { - float fval = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float fval = PG_GETARG_FP16_AS_FP32(0); return DirectFunctionCall1(ftoi4, Float4GetDatum(fval)); } @@ -286,9 +285,9 @@ pgstrom_float2_to_int4(PG_FUNCTION_ARGS) Datum 
pgstrom_float2_to_int8(PG_FUNCTION_ARGS) { - double fval = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + float fval = PG_GETARG_FP16_AS_FP32(0); - return DirectFunctionCall1(dtoi8, Float8GetDatum(fval)); + return DirectFunctionCall1(ftoi8, Float4GetDatum(fval)); } /* @@ -297,7 +296,7 @@ pgstrom_float2_to_int8(PG_FUNCTION_ARGS) Datum pgstrom_float2_to_numeric(PG_FUNCTION_ARGS) { - float fval = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float fval = PG_GETARG_FP16_AS_FP32(0); return DirectFunctionCall1(float4_numeric, Float4GetDatum(fval)); } @@ -310,7 +309,7 @@ pgstrom_float4_to_float2(PG_FUNCTION_ARGS) { float fval = PG_GETARG_FLOAT4(0); - PG_RETURN_FLOAT2(fp32_to_fp16(fval)); + PG_RETURN_FP32_AS_FP16(fval); } /* @@ -321,7 +320,7 @@ pgstrom_float8_to_float2(PG_FUNCTION_ARGS) { double fval = PG_GETARG_FLOAT8(0); - PG_RETURN_FLOAT2(fp64_to_fp16(fval)); + PG_RETURN_FP64_AS_FP16(fval); } /* @@ -330,9 +329,9 @@ pgstrom_float8_to_float2(PG_FUNCTION_ARGS) Datum pgstrom_int1_to_float2(PG_FUNCTION_ARGS) { - float fval = (float)((int32)PG_GETARG_DATUM(0)); + int32 ival = (int32)PG_GETARG_DATUM(0); - PG_RETURN_FLOAT2(fp32_to_fp16(fval)); + PG_RETURN_FP32_AS_FP16((float)ival); } /* @@ -343,7 +342,7 @@ pgstrom_int2_to_float2(PG_FUNCTION_ARGS) { float fval = (float) PG_GETARG_INT16(0); - PG_RETURN_FLOAT2(fp32_to_fp16(fval)); + PG_RETURN_FP32_AS_FP16(fval); } /* @@ -354,7 +353,7 @@ pgstrom_int4_to_float2(PG_FUNCTION_ARGS) { double fval = (double) PG_GETARG_INT32(0); - PG_RETURN_FLOAT2(fp64_to_fp16(fval)); + PG_RETURN_FP64_AS_FP16(fval); } /* @@ -365,7 +364,7 @@ pgstrom_int8_to_float2(PG_FUNCTION_ARGS) { double fval = (double) PG_GETARG_INT64(0); - PG_RETURN_FLOAT2(fp64_to_fp16(fval)); + PG_RETURN_FP64_AS_FP16(fval); } /* @@ -376,101 +375,101 @@ pgstrom_numeric_to_float2(PG_FUNCTION_ARGS) { float fval = DatumGetFloat4(numeric_float4(fcinfo)); - PG_RETURN_FLOAT2(fp32_to_fp16(fval)); + PG_RETURN_FP32_AS_FP16(fval); } /* * Comparison operators */ Datum -pgstrom_float2_eq(PG_FUNCTION_ARGS) +pgstrom_float2eq(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float2_ne(PG_FUNCTION_ARGS) +pgstrom_float2ne(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float2_lt(PG_FUNCTION_ARGS) +pgstrom_float2lt(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float2_le(PG_FUNCTION_ARGS) +pgstrom_float2le(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float2_gt(PG_FUNCTION_ARGS) +pgstrom_float2gt(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) > 0); } 
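/*
 * Illustrative sketch (not part of this patch): every binary16 value
 * converts exactly to binary32, so the comparison operators in this
 * hunk simply widen both arguments and reuse PostgreSQL's
 * float4_cmp_internal(), inheriting its ordering rules (NaN compares
 * equal to NaN and greater than any other value) for free. The shared
 * pattern, with fp16cmp_internal as a hypothetical name:
 */
static inline int
fp16cmp_internal(uint16 a, uint16 b)
{
	/* reinterpret the raw bits, then widen losslessly to fp32 */
	float	arg1 = fp16_to_fp32(__short_as_half__(a));
	float	arg2 = fp16_to_fp32(__short_as_half__(b));

	return float4_cmp_internal(arg1, arg2);
}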
Datum -pgstrom_float2_ge(PG_FUNCTION_ARGS) +pgstrom_float2ge(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) >= 0); } Datum -pgstrom_float2_cmp(PG_FUNCTION_ARGS) +pgstrom_float2cmp(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); int comp = float4_cmp_internal(arg1, arg2); PG_RETURN_INT32(comp > 0 ? 1 : (comp < 0 ? -1 : 0)); } Datum -pgstrom_float2_larger(PG_FUNCTION_ARGS) +pgstrom_float2larger(PG_FUNCTION_ARGS) { - half_t arg1 = PG_GETARG_FLOAT2(0); - half_t arg2 = PG_GETARG_FLOAT2(1); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); - PG_RETURN_FLOAT2(fp16_to_fp32(arg1) > fp16_to_fp32(arg2) ? arg1 : arg2); + PG_RETURN_DATUM(arg1 > arg2 ? PG_GETARG_DATUM(0) : PG_GETARG_DATUM(1)); } Datum -pgstrom_float2_smaller(PG_FUNCTION_ARGS) +pgstrom_float2smaller(PG_FUNCTION_ARGS) { - half_t arg1 = PG_GETARG_FLOAT2(0); - half_t arg2 = PG_GETARG_FLOAT2(1); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); - PG_RETURN_FLOAT2(fp16_to_fp32(arg1) < fp16_to_fp32(arg2) ? arg1 : arg2); + PG_RETURN_DATUM(arg1 < arg2 ? PG_GETARG_DATUM(0) : PG_GETARG_DATUM(1)); } Datum -pgstrom_float2_hash(PG_FUNCTION_ARGS) +pgstrom_float2hash(PG_FUNCTION_ARGS) { - half_t fval = PG_GETARG_FLOAT2(0); - cl_int sign = (fval & 0x8000); - cl_int expo = (fval & 0x7c00) >> 10; - cl_int frac = (fval & 0x03ff); + half_t fval = PG_GETARG_UINT16(0); + int32 sign = (fval & 0x8000); + int32 expo = (fval & 0x7c00) >> 10; + int32 frac = (fval & 0x03ff); if (expo == 0x1f) { @@ -487,191 +486,191 @@ pgstrom_float2_hash(PG_FUNCTION_ARGS) } Datum -pgstrom_float42_eq(PG_FUNCTION_ARGS) +pgstrom_float42eq(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float42_ne(PG_FUNCTION_ARGS) +pgstrom_float42ne(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float42_lt(PG_FUNCTION_ARGS) +pgstrom_float42lt(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float42_le(PG_FUNCTION_ARGS) +pgstrom_float42le(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float42_gt(PG_FUNCTION_ARGS) +pgstrom_float42gt(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float42_ge(PG_FUNCTION_ARGS) +pgstrom_float42ge(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) >= 0); } Datum 
-pgstrom_float42_cmp(PG_FUNCTION_ARGS) +pgstrom_float42cmp(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); int comp = float4_cmp_internal(arg1, arg2); PG_RETURN_INT32(comp > 0 ? 1 : (comp < 0 ? -1 : 0)); } Datum -pgstrom_float82_eq(PG_FUNCTION_ARGS) +pgstrom_float82eq(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float82_ne(PG_FUNCTION_ARGS) +pgstrom_float82ne(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float82_lt(PG_FUNCTION_ARGS) +pgstrom_float82lt(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float82_le(PG_FUNCTION_ARGS) +pgstrom_float82le(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float82_gt(PG_FUNCTION_ARGS) +pgstrom_float82gt(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float82_ge(PG_FUNCTION_ARGS) +pgstrom_float82ge(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) >= 0); } Datum -pgstrom_float82_cmp(PG_FUNCTION_ARGS) +pgstrom_float82cmp(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); int comp = float8_cmp_internal(arg1, arg2); PG_RETURN_INT32(comp > 0 ? 1 : (comp < 0 ? 
-1 : 0)); } Datum -pgstrom_float24_eq(PG_FUNCTION_ARGS) +pgstrom_float24eq(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float24_ne(PG_FUNCTION_ARGS) +pgstrom_float24ne(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float24_lt(PG_FUNCTION_ARGS) +pgstrom_float24lt(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float24_le(PG_FUNCTION_ARGS) +pgstrom_float24le(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float24_gt(PG_FUNCTION_ARGS) +pgstrom_float24gt(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float24_ge(PG_FUNCTION_ARGS) +pgstrom_float24ge(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); PG_RETURN_BOOL(float4_cmp_internal(arg1, arg2) >= 0); } Datum -pgstrom_float24_cmp(PG_FUNCTION_ARGS) +pgstrom_float24cmp(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); int comp = float4_cmp_internal(arg1, arg2); @@ -679,63 +678,63 @@ pgstrom_float24_cmp(PG_FUNCTION_ARGS) } Datum -pgstrom_float28_eq(PG_FUNCTION_ARGS) +pgstrom_float28eq(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) == 0); } Datum -pgstrom_float28_ne(PG_FUNCTION_ARGS) +pgstrom_float28ne(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) != 0); } Datum -pgstrom_float28_lt(PG_FUNCTION_ARGS) +pgstrom_float28lt(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) < 0); } Datum -pgstrom_float28_le(PG_FUNCTION_ARGS) +pgstrom_float28le(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) <= 0); } Datum -pgstrom_float28_gt(PG_FUNCTION_ARGS) +pgstrom_float28gt(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) > 0); } Datum -pgstrom_float28_ge(PG_FUNCTION_ARGS) +pgstrom_float28ge(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); PG_RETURN_BOOL(float8_cmp_internal(arg1, arg2) >= 0); } Datum 
-pgstrom_float28_cmp(PG_FUNCTION_ARGS) +pgstrom_float28cmp(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); int comp = float8_cmp_internal(arg1, arg2); @@ -746,41 +745,45 @@ pgstrom_float28_cmp(PG_FUNCTION_ARGS) * unary operators */ Datum -pgstrom_float2_up(PG_FUNCTION_ARGS) +pgstrom_float2up(PG_FUNCTION_ARGS) { - half_t fval = PG_GETARG_FLOAT2(0); + float2_t fval = PG_GETARG_FP16(0); - PG_RETURN_FLOAT2(fval); + PG_RETURN_FP16(fval); } Datum -pgstrom_float2_um(PG_FUNCTION_ARGS) +pgstrom_float2um(PG_FUNCTION_ARGS) { - half_t fval = PG_GETARG_FLOAT2(0); - - fval ^= 0x8000; - - PG_RETURN_FLOAT2(fval); + float2_t fval = PG_GETARG_FP16(0); +#ifndef EMULATE_FLOAT2 + fval = -fval; +#else + fval ^= ~0x8000; +#endif + PG_RETURN_FP16(fval); } Datum -pgstrom_float2_abs(PG_FUNCTION_ARGS) +pgstrom_float2abs(PG_FUNCTION_ARGS) { - half_t fval = PG_GETARG_FLOAT2(0); - + float2_t fval = PG_GETARG_FP16(0); +#ifndef EMULATE_FLOAT2 + fval = abs(fval); +#else fval &= ~0x8000; - - PG_RETURN_FLOAT2(fval); +#endif + PG_RETURN_FP16(fval); } /* * arithmetic operations */ Datum -pgstrom_float2_pl(PG_FUNCTION_ARGS) +pgstrom_float2pl(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); float result; result = arg1 + arg2; @@ -790,10 +793,10 @@ pgstrom_float2_pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float2_mi(PG_FUNCTION_ARGS) +pgstrom_float2mi(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); float result; result = arg1 - arg2; @@ -802,10 +805,10 @@ pgstrom_float2_mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float2_mul(PG_FUNCTION_ARGS) +pgstrom_float2mul(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); float result; result = arg1 * arg2; @@ -817,10 +820,10 @@ pgstrom_float2_mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float2_div(PG_FUNCTION_ARGS) +pgstrom_float2div(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); + float arg2 = PG_GETARG_FP16_AS_FP32(1); float result; if (arg2 == 0.0) @@ -834,9 +837,9 @@ pgstrom_float2_div(PG_FUNCTION_ARGS) } Datum -pgstrom_float24_pl(PG_FUNCTION_ARGS) +pgstrom_float24pl(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); float result; @@ -847,9 +850,9 @@ pgstrom_float24_pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float24_mi(PG_FUNCTION_ARGS) +pgstrom_float24mi(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); float result; @@ -860,9 +863,9 @@ pgstrom_float24_mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float24_mul(PG_FUNCTION_ARGS) +pgstrom_float24mul(PG_FUNCTION_ARGS) { - float arg1 = fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); float result; @@ -874,9 +877,9 @@ pgstrom_float24_mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float24_div(PG_FUNCTION_ARGS) +pgstrom_float24div(PG_FUNCTION_ARGS) { - float arg1 = 
fp16_to_fp32(PG_GETARG_FLOAT2(0)); + float arg1 = PG_GETARG_FP16_AS_FP32(0); float arg2 = PG_GETARG_FLOAT4(1); float result; @@ -892,9 +895,9 @@ pgstrom_float24_div(PG_FUNCTION_ARGS) } Datum -pgstrom_float28_pl(PG_FUNCTION_ARGS) +pgstrom_float28pl(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP32(0); double arg2 = PG_GETARG_FLOAT8(1); double result; @@ -905,9 +908,9 @@ pgstrom_float28_pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float28_mi(PG_FUNCTION_ARGS) +pgstrom_float28mi(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); double result; @@ -918,9 +921,9 @@ pgstrom_float28_mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float28_mul(PG_FUNCTION_ARGS) +pgstrom_float28mul(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); double result; @@ -932,9 +935,9 @@ pgstrom_float28_mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float28_div(PG_FUNCTION_ARGS) +pgstrom_float28div(PG_FUNCTION_ARGS) { - double arg1 = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + double arg1 = PG_GETARG_FP16_AS_FP64(0); double arg2 = PG_GETARG_FLOAT8(1); double result; @@ -950,10 +953,10 @@ pgstrom_float28_div(PG_FUNCTION_ARGS) } Datum -pgstrom_float42_pl(PG_FUNCTION_ARGS) +pgstrom_float42pl(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); float result; result = arg1 + arg2; @@ -962,10 +965,10 @@ pgstrom_float42_pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float42_mi(PG_FUNCTION_ARGS) +pgstrom_float42mi(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); float result; result = arg1 - arg2; @@ -974,10 +977,10 @@ pgstrom_float42_mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float42_mul(PG_FUNCTION_ARGS) +pgstrom_float42mul(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); float result; result = arg1 * arg2; @@ -988,10 +991,10 @@ pgstrom_float42_mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float42_div(PG_FUNCTION_ARGS) +pgstrom_float42div(PG_FUNCTION_ARGS) { float arg1 = PG_GETARG_FLOAT4(0); - float arg2 = fp16_to_fp32(PG_GETARG_FLOAT2(1)); + float arg2 = PG_GETARG_FP16_AS_FP32(1); float result; if (arg2 == 0.0) @@ -1005,10 +1008,10 @@ pgstrom_float42_div(PG_FUNCTION_ARGS) } Datum -pgstrom_float82_pl(PG_FUNCTION_ARGS) +pgstrom_float82pl(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); double result; result = arg1 + arg2; @@ -1017,10 +1020,10 @@ pgstrom_float82_pl(PG_FUNCTION_ARGS) } Datum -pgstrom_float82_mi(PG_FUNCTION_ARGS) +pgstrom_float82mi(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); double result; result = arg1 - arg2; @@ -1029,10 +1032,10 @@ pgstrom_float82_mi(PG_FUNCTION_ARGS) } Datum -pgstrom_float82_mul(PG_FUNCTION_ARGS) +pgstrom_float82mul(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); double result; result = arg1 * arg2; @@ -1044,10 +1047,10 @@ pgstrom_float82_mul(PG_FUNCTION_ARGS) } Datum -pgstrom_float82_div(PG_FUNCTION_ARGS) 
+pgstrom_float82div(PG_FUNCTION_ARGS) { double arg1 = PG_GETARG_FLOAT8(0); - double arg2 = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + double arg2 = PG_GETARG_FP16_AS_FP64(1); double result; if (arg2 == 0.0) @@ -1068,7 +1071,7 @@ Datum pgstrom_cash_mul_flt2(PG_FUNCTION_ARGS) { Cash c = PG_GETARG_CASH(0); - float8 f = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + float8 f = PG_GETARG_FP16_AS_FP64(1); Cash result; result = rint(c * f); @@ -1078,7 +1081,7 @@ pgstrom_cash_mul_flt2(PG_FUNCTION_ARGS) Datum pgstrom_flt2_mul_cash(PG_FUNCTION_ARGS) { - float8 f = fp16_to_fp64(PG_GETARG_FLOAT2(0)); + float8 f = PG_GETARG_FP16_AS_FP64(0); Cash c = PG_GETARG_CASH(1); Cash result; @@ -1090,7 +1093,7 @@ Datum pgstrom_cash_div_flt2(PG_FUNCTION_ARGS) { Cash c = PG_GETARG_CASH(0); - float8 f = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + float8 f = PG_GETARG_FP16_AS_FP64(1); Cash result; if (f == 0.0) @@ -1102,60 +1105,12 @@ pgstrom_cash_div_flt2(PG_FUNCTION_ARGS) PG_RETURN_CASH(result); } -Datum -pgstrom_float8_as_int8(PG_FUNCTION_ARGS) -{ - float8 fval = PG_GETARG_FLOAT8(0); - - PG_RETURN_INT64(double_as_long(fval)); -} - -Datum -pgstrom_float4_as_int4(PG_FUNCTION_ARGS) -{ - float4 fval = PG_GETARG_FLOAT4(0); - - PG_RETURN_INT32(float_as_int(fval)); -} - -Datum -pgstrom_float2_as_int2(PG_FUNCTION_ARGS) -{ - half_t fval = PG_GETARG_FLOAT2(0); - - PG_RETURN_INT16(fval); /* actually, half_t is unsigned short */ -} - -Datum -pgstrom_int8_as_float8(PG_FUNCTION_ARGS) -{ - int64 ival = PG_GETARG_INT64(0); - - PG_RETURN_FLOAT8(long_as_double(ival)); -} - -Datum -pgstrom_int4_as_float4(PG_FUNCTION_ARGS) -{ - int32 ival = PG_GETARG_INT32(0); - - PG_RETURN_FLOAT4(int_as_float(ival)); -} - -Datum -pgstrom_int2_as_float2(PG_FUNCTION_ARGS) -{ - int16 ival = PG_GETARG_INT16(0); - - PG_RETURN_FLOAT2(ival); /* actually, half_t is unsigned short */ -} - Datum pgstrom_float2_accum(PG_FUNCTION_ARGS) { ArrayType *transarray = PG_GETARG_ARRAYTYPE_P(0); /* do computations as float8 */ - float8 newval = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + float8 newval = PG_GETARG_FP16_AS_FP64(1); float8 *transvalues; float8 N, sumX, sumX2; @@ -1216,96 +1171,14 @@ pgstrom_float2_sum(PG_FUNCTION_ARGS) { if (PG_ARGISNULL(1)) PG_RETURN_NULL(); /* still no non-null */ - newval = fp16_to_fp64(PG_GETARG_FLOAT2(1)); + newval = PG_GETARG_FP16_AS_FP64(1); } else { newval = PG_GETARG_FLOAT8(0); if (!PG_ARGISNULL(1)) - newval += fp16_to_fp64(PG_GETARG_FLOAT2(1)); + newval += PG_GETARG_FP16_AS_FP64(1); } PG_RETURN_FLOAT8(newval); } - -Datum -pgstrom_define_shell_type(PG_FUNCTION_ARGS) -{ - Name type_name = PG_GETARG_NAME(0); - Oid type_oid = PG_GETARG_OID(1); - Oid type_namespace = PG_GETARG_OID(2); - Relation type_rel; - TupleDesc tupdesc; - HeapTuple tup; - Datum values[Natts_pg_type]; - bool isnull[Natts_pg_type]; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to create a shell type"))); - /* see TypeShellMake */ - type_rel = table_open(TypeRelationId, RowExclusiveLock); - tupdesc = RelationGetDescr(type_rel); - - memset(values, 0, sizeof(values)); - memset(isnull, 0, sizeof(isnull)); -#if PG_VERSION_NUM >= 120000 - values[Anum_pg_type_oid-1] = type_oid; -#endif - values[Anum_pg_type_typname-1] = NameGetDatum(type_name); - values[Anum_pg_type_typnamespace-1] = ObjectIdGetDatum(type_namespace); - values[Anum_pg_type_typowner-1] = ObjectIdGetDatum(GetUserId()); - values[Anum_pg_type_typlen-1] = Int16GetDatum(sizeof(int32)); - values[Anum_pg_type_typbyval-1] = BoolGetDatum(true); - values[Anum_pg_type_typtype-1] = 
CharGetDatum(TYPTYPE_PSEUDO); - values[Anum_pg_type_typcategory-1] =CharGetDatum(TYPCATEGORY_PSEUDOTYPE); - values[Anum_pg_type_typispreferred-1] = BoolGetDatum(false); - values[Anum_pg_type_typisdefined-1] = BoolGetDatum(false); - values[Anum_pg_type_typdelim-1] = CharGetDatum(DEFAULT_TYPDELIM); - values[Anum_pg_type_typrelid-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typelem-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typarray-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typinput-1] = ObjectIdGetDatum(F_SHELL_IN); - values[Anum_pg_type_typoutput-1] = ObjectIdGetDatum(F_SHELL_OUT); - values[Anum_pg_type_typreceive-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typsend-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typmodin-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typmodout-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typanalyze-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typalign-1] = CharGetDatum('i'); - values[Anum_pg_type_typstorage-1] = CharGetDatum('p'); - values[Anum_pg_type_typnotnull-1] = BoolGetDatum(false); - values[Anum_pg_type_typbasetype-1] = ObjectIdGetDatum(InvalidOid); - values[Anum_pg_type_typtypmod-1] = Int32GetDatum(-1); - values[Anum_pg_type_typndims-1] = Int32GetDatum(0); - values[Anum_pg_type_typcollation-1] = ObjectIdGetDatum(InvalidOid); - isnull[Anum_pg_type_typdefaultbin-1] = true; - isnull[Anum_pg_type_typdefault-1] = true; - isnull[Anum_pg_type_typacl-1] = true; - - /* create a new type tuple, and insert */ - tup = heap_form_tuple(tupdesc, values, isnull); -#if PG_VERSION_NUM < 120000 - HeapTupleSetOid(tup, type_oid); -#endif - CatalogTupleInsert(type_rel, tup); - - /* create dependencies */ - GenerateTypeDependencies(tup, - type_rel, - NULL, - NULL, - 0, - false, - false, - true, - false); - /* Post creation hook for new shell type */ - InvokeObjectPostCreateHook(TypeRelationId, type_oid, 0); - - heap_freetuple(tup); - table_close(type_rel, RowExclusiveLock); - - PG_RETURN_OID(type_oid); -} diff --git a/src/float2.h b/src/float2.h index 00f445e7e..daca297ce 100644 --- a/src/float2.h +++ b/src/float2.h @@ -3,8 +3,8 @@ * * Definition of half-precision floating-point * -- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. 
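/*
 * Illustrative sketch (not part of this patch): the float2.h hunk below
 * chooses the storage type of float2_t at compile time -- __half under
 * CUDA, _Float16 when the host compiler defines HAVE_FLOAT2, otherwise
 * the raw uint16_t bit pattern with EMULATE_FLOAT2 set. Code meant to
 * build in all three modes should move values only through the
 * conversion helpers; fp16_roundtrip_ok is a hypothetical example of
 * that contract, not a helper added by the patch.
 */
INLINE_FUNCTION(bool)
fp16_roundtrip_ok(float fval)
{
	float2_t	h = fp32_to_fp16(fval);	/* narrowing: may round */
	float		r = fp16_to_fp32(h);	/* widening: always exact */

	/* values like 1.0, 0.5 or -2.0 survive the round-trip; 0.1 does not */
	return (r == fval);
}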
@@ -15,6 +15,18 @@ typedef uint16_t half_t; +#if defined(__CUDACC__) +#include +typedef __half float2_t; +#elif defined(HAVE_FLOAT2) +typedef _Float16 float2_t; +#else +#define EMULATE_FLOAT2 1 +typedef half_t float2_t; +#endif +typedef float float4_t; +typedef double float8_t; + /* parameters of floating-point */ #define FP16_FRAC_BITS (10) #define FP16_EXPO_BITS (5) @@ -35,57 +47,98 @@ typedef uint16_t half_t; #define FP64_EXPO_BIAS (1023) /* int/float reinterpret functions */ -static inline double -long_as_double(uint64_t ival) +INLINE_FUNCTION(double) +__longlong_as_double__(const uint64_t ival) { +#ifdef __CUDACC__ + return __longlong_as_double(ival); +#else union { uint64_t ival; double fval; } datum; datum.ival = ival; return datum.fval; +#endif } -static inline uint64_t -double_as_long(double fval) +INLINE_FUNCTION(uint64_t) +__double_as_longlong__(const double fval) { +#ifdef __CUDACC__ + return __double_as_longlong(fval); +#else union { uint64_t ival; double fval; } datum; datum.fval = fval; return datum.ival; +#endif } -static inline float -int_as_float(uint32_t ival) +INLINE_FUNCTION(float) +__int_as_float__(const uint32_t ival) { +#ifdef __CUDACC__ + return __uint_as_float(ival); +#else union { uint32_t ival; float fval; } datum; datum.ival = ival; return datum.fval; +#endif } -static inline uint32_t -float_as_int(float fval) +INLINE_FUNCTION(uint32_t) +__float_as_int__(const float fval) { +#ifdef __CUDACC__ + return __float_as_uint(fval); +#else union { uint32_t ival; float fval; } datum; datum.fval = fval; return datum.ival; +#endif +} + +INLINE_FUNCTION(float2_t) +__short_as_half__(const uint16_t ival) +{ + union { + uint16_t ival; + float2_t fval; + } datum; + datum.ival = ival; + return datum.fval; +} + +INLINE_FUNCTION(uint16_t) +__half_as_short__(const float2_t fval) +{ + union { + uint16_t ival; + float2_t fval; + } datum; + datum.fval = fval; + return datum.ival; } /* - * cast functions across floating point + * cast functions across floating point if emulation mode */ -static inline half_t -fp32_to_fp16(float value) +INLINE_FUNCTION(float2_t) +fp32_to_fp16(const float value) { - uint32_t x = float_as_int(value); +#ifndef EMULATE_FLOAT2 + return (float2_t)value; +#else + uint32_t x = __float_as_int__(value); uint32_t u = (x & 0x7fffffffU); uint32_t sign = ((x >> 16U) & 0x8000U); uint32_t remainder; @@ -132,17 +185,21 @@ fp32_to_fp16(float value) result++; return result; +#endif } -static inline half_t +INLINE_FUNCTION(float2_t) fp64_to_fp16(double fval) { return fp32_to_fp16((float)fval); } -static inline float -fp16_to_fp32(half_t fp16val) +INLINE_FUNCTION(float4_t) +fp16_to_fp32(float2_t fp16val) { +#ifndef EMULATE_FLOAT2 + return (float4_t)fp16val; +#else uint32_t sign = ((uint32_t)(fp16val & 0x8000) << 16); int32_t expo = ((fp16val & 0x7c00) >> 10); int32_t frac = ((fp16val & 0x03ff)); @@ -176,12 +233,16 @@ fp16_to_fp32(half_t fp16val) result = (sign | (expo << FP32_FRAC_BITS) | (frac << 13)); } - return int_as_float(result); + return __int_as_float__(result); +#endif } -static inline double +INLINE_FUNCTION(float8_t) fp16_to_fp64(half_t fp16val) { +#ifndef EMULATE_FLOAT2 + return (float8_t)fp16val; +#else uint64_t sign = ((uint64_t)(fp16val & 0x8000) << 48); int64_t expo = ((fp16val & 0x7c00) >> 10); int64_t frac = ((fp16val & 0x03ff)); @@ -214,7 +275,38 @@ fp16_to_fp64(half_t fp16val) expo += FP64_EXPO_BIAS; result = (sign | (expo << FP64_FRAC_BITS) | (frac << 42)); } - return long_as_double(result); + return __longlong_as_double__(result); +#endif +} + +#ifdef 
__cplusplus +INLINE_FUNCTION(float2_t) __to_fp16(float2_t fval) { return fval; } +INLINE_FUNCTION(float2_t) __to_fp16(float4_t fval) { return fp32_to_fp16(fval); } +INLINE_FUNCTION(float2_t) __to_fp16(float8_t fval) { return fp64_to_fp16(fval); } + +INLINE_FUNCTION(float4_t) __to_fp32(float2_t fval) { return fp16_to_fp32(fval); } +INLINE_FUNCTION(float4_t) __to_fp32(float4_t fval) { return fval; } +INLINE_FUNCTION(float4_t) __to_fp32(float8_t fval) { return (float)fval; } + +INLINE_FUNCTION(float8_t) __to_fp64(float2_t fval) { return fp16_to_fp64(fval); } +INLINE_FUNCTION(float8_t) __to_fp64(float4_t fval) { return (double)fval; } +INLINE_FUNCTION(float8_t) __to_fp64(float8_t fval) { return fval; } + +INLINE_FUNCTION(float2_t) +__fp16_unary_plus(float2_t fval) +{ + return fval; +} +INLINE_FUNCTION(float2_t) +__fp16_unary_minus(float2_t fval) +{ + return __short_as_half__(__half_as_short__(fval) ^ 0x8000U); +} +INLINE_FUNCTION(float2_t) +__fp16_unary_abs(float2_t fval) +{ + return __short_as_half__(__half_as_short__(fval) & 0x7fffU); } +#endif #endif /* FLOAT2_H */ diff --git a/src/gpu_device.c b/src/gpu_device.c index a55b7995e..f07334e0e 100644 --- a/src/gpu_device.c +++ b/src/gpu_device.c @@ -3,19 +3,22 @@ * * Routines to collect GPU device information. * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. */ #include "pg_strom.h" +#include "cuda_common.h" /* variable declarations */ -DevAttributes *devAttrs = NULL; -cl_int numDevAttrs = 0; -cl_uint devBaselineMaxThreadsPerBlock = UINT_MAX; - +GpuDevAttributes *gpuDevAttrs = NULL; +int numGpuDevAttrs = 0; +double pgstrom_gpu_setup_cost; /* GUC */ +double pgstrom_gpu_tuple_cost; /* GUC */ +double pgstrom_gpu_operator_cost; /* GUC */ +double pgstrom_gpu_direct_seq_page_cost; /* GUC */ /* catalog of device attributes */ typedef enum { DEVATTRKIND__INT, @@ -29,312 +32,361 @@ typedef enum { static struct { CUdevice_attribute attr_id; - DevAttrKind attr_kind; size_t attr_offset; + const char *attr_label; const char *attr_desc; -} DevAttrCatalog[] = { -#define DEV_ATTR(LABEL,KIND,a,DESC) \ +} GpuDevAttrCatalog[] = { +#define DEV_ATTR(LABEL,DESC) \ { CU_DEVICE_ATTRIBUTE_##LABEL, \ - DEVATTRKIND__##KIND, \ - offsetof(struct DevAttributes, LABEL), \ - DESC }, -#include "device_attrs.h" + offsetof(struct GpuDevAttributes, LABEL), \ + #LABEL, DESC }, +#include "gpu_devattrs.h" #undef DEV_ATTR }; /* declaration */ -Datum pgstrom_device_info(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_device_name(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_global_memsize(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_max_blocksize(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_warp_size(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_max_shared_memory_perblock(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_num_registers_perblock(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_num_multiptocessors(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_num_cuda_cores(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_cc_major(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_cc_minor(PG_FUNCTION_ARGS); -Datum pgstrom_gpu_pci_id(PG_FUNCTION_ARGS); - -/* static variables */ -static bool gpudirect_driver_is_initialized = false; -static bool __pgstrom_gpudirect_enabled; /* GUC */ -static int __pgstrom_gpudirect_threshold; /* GUC */ +Datum pgstrom_gpu_device_info(PG_FUNCTION_ARGS); /* - * pgstrom_gpudirect_enabled + * 
collectGpuDevAttrs */ -bool -pgstrom_gpudirect_enabled(void) +static void +__collectGpuDevAttrs(GpuDevAttributes *dattrs, CUdevice cuda_device) { - return __pgstrom_gpudirect_enabled; -} + CUresult rc; + char path[1024]; + char linebuf[1024]; + FILE *filp; + struct stat stat_buf; -/* - * pgstrom_gpudirect_enabled_checker - */ -static bool -pgstrom_gpudirect_enabled_checker(bool *p_newval, void **extra, GucSource source) -{ - bool newval = *p_newval; + rc = cuDeviceGetName(dattrs->DEV_NAME, sizeof(dattrs->DEV_NAME), cuda_device); + if (rc != CUDA_SUCCESS) + __FATAL("failed on cuDeviceGetName: %s", cuStrError(rc)); + rc = cuDeviceGetUuid((CUuuid *)dattrs->DEV_UUID, cuda_device); + if (rc != CUDA_SUCCESS) + __FATAL("failed on cuDeviceGetUuid: %s", cuStrError(rc)); + rc = cuDeviceTotalMem(&dattrs->DEV_TOTAL_MEMSZ, cuda_device); + if (rc != CUDA_SUCCESS) + __FATAL("failed on cuDeviceTotalMem: %s", cuStrError(rc)); +#define DEV_ATTR(LABEL,DESC) \ + rc = cuDeviceGetAttribute(&dattrs->LABEL, \ + CU_DEVICE_ATTRIBUTE_##LABEL, \ + cuda_device); \ + if (rc != CUDA_SUCCESS) \ + __FATAL("failed on cuDeviceGetAttribute(" #LABEL "): %s", \ + cuStrError(rc)); +#include "gpu_devattrs.h" +#undef DEV_ATTR + /* + * Some other fields to be fetched from Sysfs + */ + snprintf(path, sizeof(path), + "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node", + dattrs->PCI_DOMAIN_ID, + dattrs->PCI_BUS_ID, + dattrs->PCI_DEVICE_ID); + filp = fopen(path, "r"); + if (!filp) + dattrs->NUMA_NODE_ID = -1; /* unknown */ + else + { + if (!fgets(linebuf, sizeof(linebuf), filp)) + dattrs->NUMA_NODE_ID = -1; /* unknown */ + else + dattrs->NUMA_NODE_ID = atoi(linebuf); + fclose(filp); + } - if (newval && !gpudirect_driver_is_initialized) - elog(ERROR, "cannot enable GPUDirectSQL without driver module loaded"); - return true; + snprintf(path, sizeof(path), + "/sys/bus/pci/devices/%04x:%02x:%02x.0/resource1", + dattrs->PCI_DOMAIN_ID, + dattrs->PCI_BUS_ID, + dattrs->PCI_DEVICE_ID); + if (stat(path, &stat_buf) == 0) + dattrs->DEV_BAR1_MEMSZ = stat_buf.st_size; + else + dattrs->DEV_BAR1_MEMSZ = 0; /* unknown */ + + /* + * GPU-Direct SQL is supported? 
+ */ + if (dattrs->GPU_DIRECT_RDMA_SUPPORTED) + { + if (dattrs->DEV_BAR1_MEMSZ == 0 /* unknown */ || + dattrs->DEV_BAR1_MEMSZ > (256UL << 20)) + dattrs->DEV_SUPPORT_GPUDIRECTSQL = true; + } } -/* - * pgstrom_gpudirect_threshold - */ -Size -pgstrom_gpudirect_threshold(void) +static int +collectGpuDevAttrs(int fdesc) { - return (Size)__pgstrom_gpudirect_threshold << 10; + GpuDevAttributes dattrs; + CUdevice cuda_device; + CUresult rc; + int i, nr_gpus; + + rc = cuInit(0); + if (rc != CUDA_SUCCESS) + __FATAL("failed on cuInit: %s", cuStrError(rc)); + rc = cuDeviceGetCount(&nr_gpus); + if (rc != CUDA_SUCCESS) + __FATAL("failed on cuDeviceGetCount: %s", cuStrError(rc)); + + for (i=0; i < nr_gpus; i++) + { + ssize_t offset, nbytes; + + rc = cuDeviceGet(&cuda_device, i); + if (rc != CUDA_SUCCESS) + __FATAL("failed on cuDeviceGet: %s", cuStrError(rc)); + memset(&dattrs, 0, sizeof(GpuDevAttributes)); + dattrs.DEV_ID = i; + __collectGpuDevAttrs(&dattrs, cuda_device); + + for (offset=0; offset < sizeof(GpuDevAttributes); offset += nbytes) + { + nbytes = write(fdesc, ((char *)&dattrs) + offset, + sizeof(GpuDevAttributes) - offset); + if (nbytes == 0) + break; + if (nbytes < 0) + __FATAL("failed on write(pipefd): %m"); + } + } + return 0; } /* - * pgstrom_collect_gpu_device + * receiveGpuDevAttrs */ -static bool -pgstrom_collect_gpu_device(void) +static void +receiveGpuDevAttrs(int fdesc) { - StringInfoData str; - const char *cmdline = (CMD_GPUINFO_PATH " -md"); - char linebuf[2048]; - FILE *filp; - char *tok_attr; - char *tok_val; - char *pos; - char *cuda_runtime_version = NULL; - char *nvidia_driver_version = NULL; - int num_devices = -1; /* total num of GPUs; incl legacy models */ - int i, cuda_dindex; - - Assert(numDevAttrs == 0); - filp = OpenPipeStream(cmdline, PG_BINARY_R); - if (!filp) - return false; + GpuDevAttributes *__devAttrs = NULL; + GpuDevAttributes dattrs_saved; + int nitems = 0; + int nrooms = 0; + bool is_saved = false; - initStringInfo(&str); - while (fgets(linebuf, sizeof(linebuf), filp) != NULL) + for (;;) { - /* trim '\n' on the tail */ - pos = linebuf + strlen(linebuf); - while (pos > linebuf && isspace(*--pos)) - *pos = '\0'; - /* empty line? 
*/ - if (linebuf[0] == '\0') + GpuDevAttributes dtemp; + ssize_t nbytes; + + nbytes = __readFile(fdesc, &dtemp, sizeof(GpuDevAttributes)); + if (nbytes == 0) + break; /* end */ + if (nbytes != sizeof(GpuDevAttributes)) + elog(ERROR, "failed on collect GPU device attributes"); + if (dtemp.COMPUTE_CAPABILITY_MAJOR < 6) + { + elog(LOG, "PG-Strom: GPU%d %s - CC %d.%d is not supported", + dtemp.DEV_ID, + dtemp.DEV_NAME, + dtemp.COMPUTE_CAPABILITY_MAJOR, + dtemp.COMPUTE_CAPABILITY_MINOR); continue; - - tok_attr = strchr(linebuf, ':'); - if (!tok_attr) - elog(ERROR, "unexpected gpuinfo -md format"); - *tok_attr++ = '\0'; - - tok_val = strchr(tok_attr, '='); - if (!tok_val) - elog(ERROR, "incorrect gpuinfo -md format"); - *tok_val++ = '\0'; - - if (strcmp(linebuf, "PLATFORM") == 0) + } + if (heterodbValidateDevice(dtemp.DEV_ID, + dtemp.DEV_NAME, + dtemp.DEV_UUID)) { - if (strcmp(tok_attr, "CUDA_RUNTIME_VERSION") == 0) - cuda_runtime_version = pstrdup(tok_val); - else if (strcmp(tok_attr, "NVIDIA_DRIVER_VERSION") == 0) - nvidia_driver_version = pstrdup(tok_val); - else if (strcmp(tok_attr, "NUMBER_OF_DEVICES") == 0) + if (nitems >= nrooms) { - num_devices = atoi(tok_val); - if (num_devices < 0) - elog(ERROR, "NUMBER_OF_DEVICES is not correct"); + nrooms += 10; + __devAttrs = realloc(__devAttrs, sizeof(GpuDevAttributes) * nrooms); + if (!__devAttrs) + elog(ERROR, "out of memory"); } - else - elog(ERROR, "unknown PLATFORM attribute"); + memcpy(&__devAttrs[nitems++], &dtemp, sizeof(GpuDevAttributes)); } - else if (strncmp(linebuf, "DEVICE", 6) == 0) + else if (!is_saved) { - int dindex = atoi(linebuf + 6); - - if (!devAttrs) - { - if (!cuda_runtime_version || - !nvidia_driver_version || - num_devices < 0) - elog(ERROR, "incorrect gpuinfo -md format"); - Assert(num_devices > 0); - devAttrs = MemoryContextAllocZero(TopMemoryContext, - sizeof(DevAttributes) * - num_devices); - } - - if (dindex < 0 || dindex >= num_devices) - elog(ERROR, "device index out of range"); - -#define DEV_ATTR(LABEL,a,b,c) \ - else if (strcmp(tok_attr, #LABEL) == 0) \ - devAttrs[dindex].LABEL = atoi(tok_val); - - if (strcmp(tok_attr, "DEVICE_ID") == 0) - { - devAttrs[dindex].DEV_ID = atoi(tok_val); - } - else if (strcmp(tok_attr, "DEVICE_NAME") == 0) - { - strncpy(devAttrs[dindex].DEV_NAME, tok_val, - sizeof(devAttrs[dindex].DEV_NAME)); - } - else if (strcmp(tok_attr, "DEVICE_BRAND") == 0) - { - strncpy(devAttrs[dindex].DEV_BRAND, tok_val, - sizeof(devAttrs[dindex].DEV_BRAND)); - } - else if (strcmp(tok_attr, "DEVICE_UUID") == 0) - { - strncpy(devAttrs[dindex].DEV_UUID, tok_val, - sizeof(devAttrs[dindex].DEV_UUID)); - } - else if (strcmp(tok_attr, "GLOBAL_MEMORY_SIZE") == 0) - devAttrs[dindex].DEV_TOTAL_MEMSZ = atol(tok_val); - else if (strcmp(tok_attr, "PCI_BAR1_MEMORY_SIZE") == 0) - devAttrs[dindex].DEV_BAR1_MEMSZ = atol(tok_val); -#include "device_attrs.h" - else - elog(ERROR, "incorrect gpuinfo -md format"); -#undef DEV_ATTR + memcpy(&dattrs_saved, &dtemp, sizeof(GpuDevAttributes)); + is_saved = true; } - else - elog(ERROR, "unexpected gpuinfo -md input:\n%s", linebuf); } - ClosePipeStream(filp); - for (i=0, cuda_dindex=0; i < num_devices; i++) + if (nitems == 0 && is_saved) + { + __devAttrs = malloc(sizeof(GpuDevAttributes)); + if (!__devAttrs) + elog(ERROR, "out of memory"); + memcpy(&__devAttrs[nitems++], &dattrs_saved, sizeof(GpuDevAttributes)); + } + numGpuDevAttrs = nitems; + gpuDevAttrs = __devAttrs; +} + +/* + * pgstrom_collect_gpu_devices + */ +static void +pgstrom_collect_gpu_devices(void) +{ + int i, pipefd[2]; + 
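/*
 * The DEV_ATTR dance in __collectGpuDevAttrs() and GpuDevAttrCatalog above
 * is an X-macro: gpu_devattrs.h is assumed to hold one DEV_ATTR(LABEL,DESC)
 * entry per CUDA device attribute, and each #include re-expands the same
 * list under a different DEV_ATTR definition (struct fields, catalog rows,
 * cuDeviceGetAttribute() calls). A self-contained sketch of the pattern,
 * using a made-up two-entry list instead of the real header:
 *
 *   #include <stdio.h>
 *
 *   #define ATTR_LIST                 \
 *       X_ATTR(MAX_THREADS_PER_BLOCK) \
 *       X_ATTR(WARP_SIZE)
 *
 *   typedef struct {
 *   #define X_ATTR(LABEL)  int LABEL;
 *       ATTR_LIST
 *   #undef X_ATTR
 *   } DevAttrs;
 *
 *   static void dump(const DevAttrs *d)
 *   {
 *   #define X_ATTR(LABEL)  printf("%s = %d\n", #LABEL, d->LABEL);
 *       ATTR_LIST
 *   #undef X_ATTR
 *   }
 *
 *   int main(void)
 *   {
 *       DevAttrs d = { 1024, 32 };
 *       dump(&d);
 *       return 0;
 *   }
 */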
pid_t child; + StringInfoData buf; + + if (pipe(pipefd) != 0) + elog(ERROR, "failed on pipe(2): %m"); + child = fork(); + if (child == 0) { - DevAttributes *dattrs = &devAttrs[i]; - char path[MAXPGPATH]; - char linebuf[2048]; - FILE *filp; + close(pipefd[0]); + _exit(collectGpuDevAttrs(pipefd[1])); + } + else if (child > 0) + { + int status; - /* Recommend to use Pascal or later */ - if (dattrs->COMPUTE_CAPABILITY_MAJOR < 6) + close(pipefd[1]); + PG_TRY(); { - elog(LOG, "PG-Strom: GPU%d %s - CC %d.%d is not supported", - dattrs->DEV_ID, - dattrs->DEV_NAME, - dattrs->COMPUTE_CAPABILITY_MAJOR, - dattrs->COMPUTE_CAPABILITY_MINOR); - continue; + receiveGpuDevAttrs(pipefd[0]); } - - /* Update the baseline device capability */ - devBaselineMaxThreadsPerBlock = Min(devBaselineMaxThreadsPerBlock, - dattrs->MAX_THREADS_PER_BLOCK); - - /* - * Only Tesla or Quadro which have PCI Bar1 more than 256MB - * supports GPUDirectSQL - */ - dattrs->DEV_SUPPORT_GPUDIRECTSQL = false; - if (dattrs->DEV_BAR1_MEMSZ > (256UL << 20)) + PG_CATCH(); { -#if CUDA_VERSION < 11030 - if (strcmp(dattrs->DEV_BRAND, "TESLA") == 0 || - strcmp(dattrs->DEV_BRAND, "QUADRO") == 0 || - strcmp(dattrs->DEV_BRAND, "NVIDIA") == 0) - dattrs->DEV_SUPPORT_GPUDIRECTSQL = true; -#else - if (dattrs->GPU_DIRECT_RDMA_SUPPORTED) - dattrs->DEV_SUPPORT_GPUDIRECTSQL = true; -#endif + /* cleanup */ + kill(child, SIGKILL); + close(pipefd[0]); + PG_RE_THROW(); } + PG_END_TRY(); + close(pipefd[0]); - /* - * read the numa node-id from the sysfs entry - * - * Note that we assume device function-id is 0, because it is - * uncertain whether MULTI_GPU_BOARD_GROUP_ID is an adequate value - * to query, and these sibling devices obviously belongs to same - * numa-node, even if function-id is not identical. - */ - snprintf(path, sizeof(path), - "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node", - dattrs->PCI_DOMAIN_ID, - dattrs->PCI_BUS_ID, - dattrs->PCI_DEVICE_ID); - filp = fopen(path, "r"); - if (!filp) - dattrs->NUMA_NODE_ID = -1; /* unknown */ - else + while (waitpid(child, &status, 0) < 0) { - if (!fgets(linebuf, sizeof(linebuf), filp)) - dattrs->NUMA_NODE_ID = -1; /* unknown */ - else - dattrs->NUMA_NODE_ID = atoi(linebuf); - fclose(filp); + if (errno != EINTR) + { + kill(child, SIGKILL); + elog(ERROR, "failed on waitpid: %m"); + } } + if (WEXITSTATUS(status) != 0) + elog(ERROR, "GPU device attribute collector exited with %d", + WEXITSTATUS(status)); + } + else + { + close(pipefd[0]); + close(pipefd[1]); + elog(ERROR, "failed on fork(2): %m"); + } + initStringInfo(&buf); + for (i=0; i < numGpuDevAttrs; i++) + { + GpuDevAttributes *dattrs = &gpuDevAttrs[i]; - /* Log brief CUDA device properties */ - resetStringInfo(&str); - appendStringInfo(&str, "GPU%d %s (%d SMs; %dMHz, L2 %dkB)", + resetStringInfo(&buf); + appendStringInfo(&buf, "GPU%d %s (%d SMs; %dMHz, L2 %dkB)", dattrs->DEV_ID, dattrs->DEV_NAME, dattrs->MULTIPROCESSOR_COUNT, dattrs->CLOCK_RATE / 1000, dattrs->L2_CACHE_SIZE >> 10); if (dattrs->DEV_TOTAL_MEMSZ > (4UL << 30)) - appendStringInfo(&str, ", RAM %.2fGB", + appendStringInfo(&buf, ", RAM %.2fGB", ((double)dattrs->DEV_TOTAL_MEMSZ / (double)(1UL << 30))); else - appendStringInfo(&str, ", RAM %zuMB", + appendStringInfo(&buf, ", RAM %zuMB", dattrs->DEV_TOTAL_MEMSZ >> 20); if (dattrs->MEMORY_CLOCK_RATE > (1UL << 20)) - appendStringInfo(&str, " (%dbits, %.2fGHz)", + appendStringInfo(&buf, " (%dbits, %.2fGHz)", dattrs->GLOBAL_MEMORY_BUS_WIDTH, ((double)dattrs->MEMORY_CLOCK_RATE / (double)(1UL << 20))); else - appendStringInfo(&str, " (%dbits, 
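/*
 * The probe-in-a-child pattern above keeps the CUDA driver out of the
 * postmaster: cuInit() runs only inside a short-lived fork()ed child, which
 * streams one fixed-size GpuDevAttributes record per device down a pipe;
 * EOF marks the end, and the parent (see receiveGpuDevAttrs) retries short
 * reads. A minimal standalone sketch of the same protocol (record type and
 * values are illustrative):
 *
 *   #include <stdint.h>
 *   #include <stdio.h>
 *   #include <sys/wait.h>
 *   #include <unistd.h>
 *
 *   typedef struct { int id; uint64_t ram_sz; } DevRec;
 *
 *   int main(void)
 *   {
 *       int   pfd[2];
 *       pid_t child;
 *
 *       if (pipe(pfd) != 0)
 *           return 1;
 *       child = fork();
 *       if (child == 0)
 *       {
 *           close(pfd[0]);
 *           DevRec rec = { 0, 16UL << 30 };    // would come from cuDeviceGet*()
 *           write(pfd[1], &rec, sizeof(rec));  // one record per device
 *           _exit(0);
 *       }
 *       close(pfd[1]);
 *       DevRec rec;
 *       while (read(pfd[0], &rec, sizeof(rec)) == (ssize_t)sizeof(rec))
 *           printf("GPU%d: %llu bytes\n", rec.id, (unsigned long long)rec.ram_sz);
 *       close(pfd[0]);
 *       waitpid(child, NULL, 0);
 *       return 0;
 *   }
 */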
%dMHz)", + appendStringInfo(&buf, " (%dbits, %dMHz)", dattrs->GLOBAL_MEMORY_BUS_WIDTH, dattrs->MEMORY_CLOCK_RATE >> 10); - if (dattrs->DEV_BAR1_MEMSZ > (1UL << 30)) - appendStringInfo(&str, ", PCI-E Bar1 %luGB", + appendStringInfo(&buf, ", PCI-E Bar1 %luGB", dattrs->DEV_BAR1_MEMSZ >> 30); else if (dattrs->DEV_BAR1_MEMSZ > (1UL << 20)) - appendStringInfo(&str, ", PCI-E Bar1 %luMB", + appendStringInfo(&buf, ", PCI-E Bar1 %luMB", dattrs->DEV_BAR1_MEMSZ >> 30); - - appendStringInfo(&str, ", CC %d.%d", + appendStringInfo(&buf, ", CC %d.%d", dattrs->COMPUTE_CAPABILITY_MAJOR, dattrs->COMPUTE_CAPABILITY_MINOR); - elog(LOG, "PG-Strom: %s", str.data); - - if (i != cuda_dindex) - memcpy(&devAttrs[cuda_dindex], - &devAttrs[i], sizeof(DevAttributes)); - cuda_dindex++; + elog(LOG, "PG-Strom: %s", buf.data); } + pfree(buf.data); +} - if (num_devices > 0) +/* + * pgstrom_gpu_operator_ratio + */ +double +pgstrom_gpu_operator_ratio(void) +{ + if (cpu_operator_cost > 0.0) { - if (cuda_dindex == 0) - elog(ERROR, "PG-Strom: no supported GPU devices found"); - numDevAttrs = cuda_dindex; - return true; + return pgstrom_gpu_operator_cost / cpu_operator_cost; } - return false; + return (pgstrom_gpu_operator_cost == 0.0 ? 1.0 : disable_cost); +} + +/* + * pgstrom_init_gpu_options - init GUC options related to GPUs + */ +static void +pgstrom_init_gpu_options(void) +{ + /* cost factor for GPU setup */ + DefineCustomRealVariable("pg_strom.gpu_setup_cost", + "Cost to setup GPU device to run", + NULL, + &pgstrom_gpu_setup_cost, + 100 * DEFAULT_SEQ_PAGE_COST, + 0, + DBL_MAX, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* cost factor for each Gpu task */ + DefineCustomRealVariable("pg_strom.gpu_tuple_cost", + "Default cost to transfer GPU<->Host per tuple", + NULL, + &pgstrom_gpu_tuple_cost, + DEFAULT_CPU_TUPLE_COST, + 0, + DBL_MAX, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* cost factor for GPU operator */ + DefineCustomRealVariable("pg_strom.gpu_operator_cost", + "Cost of processing each operators by GPU", + NULL, + &pgstrom_gpu_operator_cost, + DEFAULT_CPU_OPERATOR_COST / 16.0, + 0, + DBL_MAX, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); + /* cost factor for GPU-Direct SQL */ + DefineCustomRealVariable("pg_strom.gpu_direct_seq_page_cost", + "Cost for sequential page read by GPU-Direct SQL", + NULL, + &pgstrom_gpu_direct_seq_page_cost, + DEFAULT_SEQ_PAGE_COST / 4.0, + 0, + DBL_MAX, + PGC_USERSET, + GUC_NOT_IN_SAMPLE, + NULL, NULL, NULL); } /* * pgstrom_init_gpu_device */ -void +bool pgstrom_init_gpu_device(void) { static char *cuda_visible_devices = NULL; - bool default_gpudirect_enabled = false; - size_t default_threshold = 0; - size_t shared_buffer_size = (size_t)NBuffers * (size_t)BLCKSZ; - int i; /* * Set CUDA_VISIBLE_DEVICES environment variable prior to CUDA @@ -353,52 +405,76 @@ pgstrom_init_gpu_device(void) if (setenv("CUDA_VISIBLE_DEVICES", cuda_visible_devices, 1) != 0) elog(ERROR, "failed to set CUDA_VISIBLE_DEVICES"); } - /* collect device properties by gpuinfo command */ - if (!pgstrom_collect_gpu_device()) - return; /* cpu_only_mode */ + /* collect device attributes using child process */ + pgstrom_collect_gpu_devices(); + if (numGpuDevAttrs > 0) + { + pgstrom_init_gpu_options(); + return true; + } + return false; +} + +/* + * gpuClientOpenSession + */ +static int +__gpuClientChooseDevice(const Bitmapset *gpuset) +{ + static bool rr_initialized = false; + static uint32 rr_counter = 0; + + if (!rr_initialized) + { + rr_counter = (uint32)getpid(); + rr_initialized 
= true; + } - /* pgstrom.gpudirect_enabled */ - if (gpuDirectInitDriver() == 0) + if (!bms_is_empty(gpuset)) { - for (i=0; i < numDevAttrs; i++) + int num = bms_num_members(gpuset); + int *dindex = alloca(sizeof(int) * num); + int i, k; + + for (i=0, k=bms_next_member(gpuset, -1); + k >= 0; + i++, k=bms_next_member(gpuset, k)) { - if (devAttrs[i].DEV_SUPPORT_GPUDIRECTSQL) - default_gpudirect_enabled = true; + dindex[i] = k; } - gpudirect_driver_is_initialized = true; + Assert(i == num); + return dindex[rr_counter++ % num]; } - DefineCustomBoolVariable("pg_strom.gpudirect_enabled", - "enables GPUDirect SQL", - NULL, - &__pgstrom_gpudirect_enabled, - default_gpudirect_enabled, - PGC_SUSET, - GUC_NOT_IN_SAMPLE, - pgstrom_gpudirect_enabled_checker, NULL, NULL); + /* a simple round-robin if no GPUs preference */ + return (rr_counter++ % numGpuDevAttrs); +} - /* - * MEMO: Threshold of table's physical size to use NVMe-Strom: - * ((System RAM size) - - * (shared_buffer size)) * 0.5 + (shared_buffer size) - * - * If table size is enough large to issue real i/o, NVMe-Strom will - * make advantage by higher i/o performance. - */ - if (PAGE_SIZE * PHYS_PAGES > shared_buffer_size / 2) - default_threshold = (PAGE_SIZE * PHYS_PAGES - shared_buffer_size / 2); - default_threshold += shared_buffer_size; - - DefineCustomIntVariable("pg_strom.gpudirect_threshold", - "Tablesize threshold to use GPUDirect SQL", - NULL, - &__pgstrom_gpudirect_threshold, - default_threshold >> 10, - 262144, /* 256MB */ - INT_MAX, - PGC_SUSET, - GUC_NOT_IN_SAMPLE | GUC_UNIT_KB, - NULL, NULL, NULL); +void +gpuClientOpenSession(pgstromTaskState *pts, + const XpuCommand *session) +{ + struct sockaddr_un addr; + pgsocket sockfd; + int cuda_dindex = __gpuClientChooseDevice(pts->optimal_gpus); + char namebuf[32]; + + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (sockfd < 0) + elog(ERROR, "failed on socket(2): %m"); + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + snprintf(addr.sun_path, sizeof(addr.sun_path), + ".pg_strom.%u.gpu%u.sock", + PostmasterPid, cuda_dindex); + if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) != 0) + { + close(sockfd); + elog(ERROR, "failed on connect('%s'): %m", addr.sun_path); + } + snprintf(namebuf, sizeof(namebuf), "GPU-%d", cuda_dindex); + __xpuClientOpenSession(pts, session, sockfd, namebuf, cuda_dindex); } /* @@ -406,155 +482,72 @@ pgstrom_init_gpu_device(void) * according to the function and device attributes */ static __thread size_t __dynamic_shmem_per_block; -static __thread size_t __dynamic_shmem_per_thread; +static __thread size_t __dynamic_shmem_per_warp; static size_t blocksize_to_shmemsize_helper(int blocksize) { - return (__dynamic_shmem_per_block + - __dynamic_shmem_per_thread * (size_t)blocksize); + int n_warps = (blocksize + WARPSIZE - 1) / WARPSIZE; + + return MAXALIGN(__dynamic_shmem_per_block + + __dynamic_shmem_per_warp * n_warps); } -/* - * gpuOccupancyMaxPotentialBlockSize - */ CUresult -gpuOccupancyMaxPotentialBlockSize(int *p_min_grid_sz, - int *p_max_block_sz, - CUfunction kern_function, - size_t dynamic_shmem_per_block, - size_t dynamic_shmem_per_thread) +gpuOptimalBlockSize(int *p_grid_sz, + int *p_block_sz, + unsigned int *p_shmem_sz, + CUfunction kern_function, + size_t dynamic_shmem_per_block, + size_t dynamic_shmem_per_warp) { - cl_int min_grid_sz; - cl_int max_block_sz; CUresult rc; - if (dynamic_shmem_per_thread > 0) + if (dynamic_shmem_per_warp == 0) { - __dynamic_shmem_per_block = dynamic_shmem_per_block; - __dynamic_shmem_per_thread = 
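/*
 * Session fan-out above: each backend seeds a private round-robin counter
 * with its PID, so concurrent backends statistically spread over the GPUs
 * allowed by 'optimal_gpus', and the chosen device index merely selects
 * which UNIX-domain socket to dial. A sketch of the connect step (assumes
 * a GPU service is already listening; the socket path mirrors the code
 * above and is resolved relative to the server's working directory):
 *
 *   #include <stdio.h>
 *   #include <string.h>
 *   #include <sys/socket.h>
 *   #include <sys/un.h>
 *   #include <unistd.h>
 *
 *   static int dial_gpu_service(unsigned postmaster_pid, unsigned dev_index)
 *   {
 *       struct sockaddr_un addr;
 *       int sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *       if (sockfd < 0)
 *           return -1;
 *       memset(&addr, 0, sizeof(addr));
 *       addr.sun_family = AF_UNIX;
 *       snprintf(addr.sun_path, sizeof(addr.sun_path),
 *                ".pg_strom.%u.gpu%u.sock", postmaster_pid, dev_index);
 *       if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) != 0)
 *       {
 *           close(sockfd);
 *           return -1;
 *       }
 *       return sockfd;     // caller owns the descriptor
 *   }
 */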
dynamic_shmem_per_thread; - rc = cuOccupancyMaxPotentialBlockSize(&min_grid_sz, - &max_block_sz, + rc = cuOccupancyMaxPotentialBlockSize(p_grid_sz, + p_block_sz, kern_function, - blocksize_to_shmemsize_helper, - 0, + NULL, + dynamic_shmem_per_block, 0); + if (rc == CUDA_SUCCESS) + *p_shmem_sz = dynamic_shmem_per_block; } else { - rc = cuOccupancyMaxPotentialBlockSize(&min_grid_sz, - &max_block_sz, + __dynamic_shmem_per_block = dynamic_shmem_per_block; + __dynamic_shmem_per_warp = dynamic_shmem_per_warp; + rc = cuOccupancyMaxPotentialBlockSize(p_grid_sz, + p_block_sz, kern_function, - 0, + blocksize_to_shmemsize_helper, dynamic_shmem_per_block, 0); + if (rc == CUDA_SUCCESS) + *p_shmem_sz = blocksize_to_shmemsize_helper(*p_block_sz); } - if (p_min_grid_sz) - *p_min_grid_sz = min_grid_sz; - if (p_max_block_sz) - *p_max_block_sz = max_block_sz; return rc; } -CUresult -gpuOptimalBlockSize(int *p_grid_sz, - int *p_block_sz, - CUfunction kern_function, - CUdevice cuda_device, - size_t dynamic_shmem_per_block, - size_t dynamic_shmem_per_thread) -{ - cl_int mp_count; - cl_int min_grid_sz; - cl_int max_block_sz; - cl_int max_multiplicity; - size_t dynamic_shmem_sz; - CUresult rc; - - rc = cuDeviceGetAttribute(&mp_count, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - cuda_device); - if (rc != CUDA_SUCCESS) - return rc; - - rc = gpuOccupancyMaxPotentialBlockSize(&min_grid_sz, - &max_block_sz, - kern_function, - dynamic_shmem_per_block, - dynamic_shmem_per_thread); - if (rc != CUDA_SUCCESS) - return rc; - - dynamic_shmem_sz = (dynamic_shmem_per_block + - dynamic_shmem_per_thread * max_block_sz); - rc = cuOccupancyMaxActiveBlocksPerMultiprocessor(&max_multiplicity, - kern_function, - max_block_sz, - dynamic_shmem_sz); - if (rc != CUDA_SUCCESS) - return rc; - - *p_grid_sz = Min(GPUKERNEL_MAX_SM_MULTIPLICITY, - max_multiplicity) * mp_count; - *p_block_sz = max_block_sz; - - return CUDA_SUCCESS; -} - -CUresult -__gpuOptimalBlockSize(int *p_grid_sz, - int *p_block_sz, - CUfunction kern_function, - int cuda_dindex, - size_t dynamic_shmem_per_block, - size_t dynamic_shmem_per_thread) -{ - cl_int mp_count = devAttrs[cuda_dindex].MULTIPROCESSOR_COUNT; - cl_int min_grid_sz; - cl_int max_block_sz; - cl_int max_multiplicity; - size_t dynamic_shmem_sz; - CUresult rc; - - rc = gpuOccupancyMaxPotentialBlockSize(&min_grid_sz, - &max_block_sz, - kern_function, - dynamic_shmem_per_block, - dynamic_shmem_per_thread); - if (rc != CUDA_SUCCESS) - return rc; - - dynamic_shmem_sz = (dynamic_shmem_per_block + - dynamic_shmem_per_thread * max_block_sz); - rc = cuOccupancyMaxActiveBlocksPerMultiprocessor(&max_multiplicity, - kern_function, - max_block_sz, - dynamic_shmem_sz); - if (rc != CUDA_SUCCESS) - return rc; - - *p_grid_sz = Min(GPUKERNEL_MAX_SM_MULTIPLICITY, - max_multiplicity) * mp_count; - *p_block_sz = max_block_sz; - - return CUDA_SUCCESS; -} - /* - * pgstrom_device_info - SQL function to dump device info + * pgstrom_gpu_device_info - SQL function to dump device info */ +PG_FUNCTION_INFO_V1(pgstrom_gpu_device_info); Datum -pgstrom_device_info(PG_FUNCTION_ARGS) +pgstrom_gpu_device_info(PG_FUNCTION_ARGS) { FuncCallContext *fncxt; - DevAttributes *dattrs; - int dindex; - int aindex; - const char *att_name; - const char *att_value; - Datum values[4]; - bool isnull[4]; - HeapTuple tuple; + GpuDevAttributes *dattrs; + int dindex; + int aindex; + int i, val; + const char *att_name; + const char *att_value; + const char *att_desc; + Datum values[4]; + bool isnull[4]; + HeapTuple tuple; if (SRF_IS_FIRSTCALL()) { @@ -565,13 
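/*
 * Usage sketch for gpuOptimalBlockSize() above (the kernel, stream and
 * shared-memory sizes are made-up): when dynamic shared memory grows per
 * 32-thread warp rather than per thread, the CUoccupancyB2DSize callback
 * lets cuOccupancyMaxPotentialBlockSize() re-evaluate the shared-memory
 * demand for every candidate block size it probes; the helper rounds the
 * block size up to whole warps before multiplying.
 *
 *   int          grid_sz;
 *   int          block_sz;
 *   unsigned int shmem_sz;
 *   CUresult     rc;
 *
 *   rc = gpuOptimalBlockSize(&grid_sz,
 *                            &block_sz,
 *                            &shmem_sz,
 *                            my_kernel,    // CUfunction, already loaded
 *                            1024,         // fixed bytes per block
 *                            64);          // bytes per warp
 *   if (rc != CUDA_SUCCESS)
 *       elog(ERROR, "failed on gpuOptimalBlockSize: %s", cuStrError(rc));
 *   rc = cuLaunchKernel(my_kernel,
 *                       grid_sz, 1, 1,
 *                       block_sz, 1, 1,
 *                       shmem_sz,
 *                       my_stream,    // CUstream
 *                       my_kargs,     // void ** kernel arguments
 *                       NULL);
 */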
+558,13 @@ pgstrom_device_info(PG_FUNCTION_ARGS)
         oldcxt = MemoryContextSwitchTo(fncxt->multi_call_memory_ctx);
 
         tupdesc = CreateTemplateTupleDesc(4);
-        TupleDescInitEntry(tupdesc, (AttrNumber) 1, "device_nr",
+        TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gpu_id",
                            INT4OID, -1, 0);
-        TupleDescInitEntry(tupdesc, (AttrNumber) 2, "aindex",
-                           INT4OID, -1, 0);
-        TupleDescInitEntry(tupdesc, (AttrNumber) 3, "attribute",
+        TupleDescInitEntry(tupdesc, (AttrNumber) 2, "att_name",
+                           TEXTOID, -1, 0);
+        TupleDescInitEntry(tupdesc, (AttrNumber) 3, "att_value",
                            TEXTOID, -1, 0);
-        TupleDescInitEntry(tupdesc, (AttrNumber) 4, "value",
+        TupleDescInitEntry(tupdesc, (AttrNumber) 4, "att_desc",
                            TEXTOID, -1, 0);
         fncxt->tuple_desc = BlessTupleDesc(tupdesc);
@@ -581,105 +574,125 @@
     }
     fncxt = SRF_PERCALL_SETUP();
-    dindex = fncxt->call_cntr / (lengthof(DevAttrCatalog) + 5);
-    aindex = fncxt->call_cntr % (lengthof(DevAttrCatalog) + 5);
-
-    if (dindex >= numDevAttrs)
+    dindex = fncxt->call_cntr / (lengthof(GpuDevAttrCatalog) + 6);
+    aindex = fncxt->call_cntr % (lengthof(GpuDevAttrCatalog) + 6);
+    if (dindex >= numGpuDevAttrs)
         SRF_RETURN_DONE(fncxt);
-    dattrs = &devAttrs[dindex];
-
-    if (aindex == 0)
-    {
-        att_name = "GPU Device Name";
-        att_value = dattrs->DEV_NAME;
-    }
-    else if (aindex == 1)
-    {
-        att_name = "GPU Device Brand";
-        att_value = dattrs->DEV_BRAND;
-    }
-    else if (aindex == 2)
-    {
-        att_name = "GPU Device UUID";
-        att_value = dattrs->DEV_UUID;
-    }
-    else if (aindex == 3)
-    {
-        att_name = "GPU Total RAM Size";
-        att_value = format_bytesz(dattrs->DEV_TOTAL_MEMSZ);
-    }
-    else if (aindex == 4)
-    {
-        att_name = "GPU PCI Bar1 Size";
-        att_value = format_bytesz(dattrs->DEV_BAR1_MEMSZ);
-    }
-    else
+    dattrs = &gpuDevAttrs[dindex];
+    switch (aindex)
     {
-        int i = aindex - 5;
-        int value = *((int *)((char *)dattrs +
-                              DevAttrCatalog[i].attr_offset));
-
-        att_name = DevAttrCatalog[i].attr_desc;
-        switch (DevAttrCatalog[i].attr_kind)
-        {
-            case DEVATTRKIND__INT:
-                att_value = psprintf("%d", value);
-                break;
-            case DEVATTRKIND__BYTES:
-                att_value = format_bytesz((size_t)value);
-                break;
-            case DEVATTRKIND__KB:
-                att_value = format_bytesz((size_t)value * 1024);
-                break;
-            case DEVATTRKIND__KHZ:
-                if (value > 4000000)
-                    att_value = psprintf("%.2f GHz", (double)value/1000000.0);
-                else if (value > 4000)
-                    att_value = psprintf("%d MHz", value / 1000);
-                else
-                    att_value = psprintf("%d kHz", value);
-                break;
-            case DEVATTRKIND__COMPUTEMODE:
-                switch (value)
-                {
-                    case CU_COMPUTEMODE_DEFAULT:
-                        att_value = "Default";
-                        break;
-#if CUDA_VERSION < 8000
-                    case CU_COMPUTEMODE_EXCLUSIVE:
-                        att_value = "Exclusive";
-                        break;
-#endif
-                    case CU_COMPUTEMODE_PROHIBITED:
-                        att_value = "Prohibited";
-                        break;
-                    case CU_COMPUTEMODE_EXCLUSIVE_PROCESS:
-                        att_value = "Exclusive Process";
-                        break;
-                    default:
-                        att_value = "Unknown";
-                        break;
-                }
-                break;
-            case DEVATTRKIND__BOOL:
-                att_value = psprintf("%s", value != 0 ? "True" : "False");
-                break;
-            case DEVATTRKIND__BITS:
-                att_value = psprintf("%dbits", value);
-                break;
-            default:
-                elog(ERROR, "Bug? unknown DevAttrKind: %d",
-                     (int)DevAttrCatalog[i].attr_kind);
-        }
+        case 0:
+            att_name = "DEV_NAME";
+            att_desc = "GPU Device Name";
+            att_value = dattrs->DEV_NAME;
+            break;
+        case 1:
+            att_name = "DEV_ID";
+            att_desc = "GPU Device ID";
+            att_value = psprintf("%d", dattrs->DEV_ID);
+            break;
+        case 2:
+            att_name = "DEV_UUID";
+            att_desc = "GPU Device UUID";
+            att_value = psprintf("GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-"
+                                 "%02x%02x-%02x%02x%02x%02x%02x%02x",
+                                 (uint8_t)dattrs->DEV_UUID[0],
+                                 (uint8_t)dattrs->DEV_UUID[1],
+                                 (uint8_t)dattrs->DEV_UUID[2],
+                                 (uint8_t)dattrs->DEV_UUID[3],
+                                 (uint8_t)dattrs->DEV_UUID[4],
+                                 (uint8_t)dattrs->DEV_UUID[5],
+                                 (uint8_t)dattrs->DEV_UUID[6],
+                                 (uint8_t)dattrs->DEV_UUID[7],
+                                 (uint8_t)dattrs->DEV_UUID[8],
+                                 (uint8_t)dattrs->DEV_UUID[9],
+                                 (uint8_t)dattrs->DEV_UUID[10],
+                                 (uint8_t)dattrs->DEV_UUID[11],
+                                 (uint8_t)dattrs->DEV_UUID[12],
+                                 (uint8_t)dattrs->DEV_UUID[13],
+                                 (uint8_t)dattrs->DEV_UUID[14],
+                                 (uint8_t)dattrs->DEV_UUID[15]);
+            break;
+        case 3:
+            att_name = "DEV_TOTAL_MEMSZ";
+            att_desc = "GPU Total RAM Size";
+            att_value = format_bytesz(dattrs->DEV_TOTAL_MEMSZ);
+            break;
+        case 4:
+            att_name = "DEV_BAR1_MEMSZ";
+            att_desc = "GPU PCI Bar1 Size";
+            att_value = format_bytesz(dattrs->DEV_BAR1_MEMSZ);
+            break;
+        case 5:
+            att_name = "NUMA_NODE_ID";
+            att_desc = "GPU NUMA Node Id";
+            att_value = psprintf("%d", dattrs->NUMA_NODE_ID);
+            break;
+        default:
+            i = aindex - 6;
+            val = *((int *)((char *)dattrs +
+                            GpuDevAttrCatalog[i].attr_offset));
+            att_name = GpuDevAttrCatalog[i].attr_label;
+            att_desc = GpuDevAttrCatalog[i].attr_desc;
+            switch (GpuDevAttrCatalog[i].attr_id)
+            {
+                case CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK:
+                case CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY:
+                case CU_DEVICE_ATTRIBUTE_MAX_PITCH:
+                case CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE:
+                case CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR:
+                    /* bytes */
+                    att_value = format_bytesz((size_t)val);
+                    break;
+
+                case CU_DEVICE_ATTRIBUTE_CLOCK_RATE:
+                case CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE:
+                    /* clock */
+                    if (val > 4000000)
+                        att_value = psprintf("%.2f GHz", (double)val/1000000.0);
+                    else if (val > 4000)
+                        att_value = psprintf("%d MHz", val / 1000);
+                    else
+                        att_value = psprintf("%d kHz", val);
+                    break;
+
+                case CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH:
+                    /* bits */
+                    att_value = psprintf("%dbits", val);
+                    break;
+
+                case CU_DEVICE_ATTRIBUTE_COMPUTE_MODE:
+                    /* compute mode */
+                    switch (val)
+                    {
+                        case CU_COMPUTEMODE_DEFAULT:
+                            att_value = "Default";
+                            break;
+                        case CU_COMPUTEMODE_PROHIBITED:
+                            att_value = "Prohibited";
+                            break;
+                        case CU_COMPUTEMODE_EXCLUSIVE_PROCESS:
+                            att_value = "Exclusive Process";
+                            break;
+                        default:
+                            att_value = "Unknown";
+                            break;
+                    }
+                    break;
+
+                default:
+                    att_value = psprintf("%d", val);
+                    break;
+            }
+            break;
     }
     memset(isnull, 0, sizeof(isnull));
     values[0] = Int32GetDatum(dattrs->DEV_ID);
-    values[1] = Int32GetDatum(aindex);
-    values[2] = CStringGetTextDatum(att_name);
-    values[3] = CStringGetTextDatum(att_value);
+    values[1] = CStringGetTextDatum(att_name);
+    values[2] = CStringGetTextDatum(att_value);
+    values[3] = CStringGetTextDatum(att_desc);
 
     tuple = heap_form_tuple(fncxt->tuple_desc, values, isnull);
 
     SRF_RETURN_NEXT(fncxt, HeapTupleGetDatum(tuple));
 }
-PG_FUNCTION_INFO_V1(pgstrom_device_info);
diff --git a/next/gpu_join.c b/src/gpu_join.c
similarity index 100%
rename from next/gpu_join.c
rename to src/gpu_join.c
diff --git a/next/gpu_preagg.c b/src/gpu_preagg.c
similarity index 100%
rename from next/gpu_preagg.c
rename to src/gpu_preagg.c
diff --git a/next/gpu_scan.c b/src/gpu_scan.c
similarity index 100%
rename from next/gpu_scan.c
rename to src/gpu_scan.c
diff --git a/next/gpu_service.c b/src/gpu_service.c
similarity index 100%
rename from next/gpu_service.c
rename to src/gpu_service.c
diff --git a/src/heterodb_extra.h b/src/heterodb_extra.h
index ee7464de6..bef3525fd 100644
--- a/src/heterodb_extra.h
+++ b/src/heterodb_extra.h
@@ -3,8 +3,8 @@
 *
 * Definitions of HeteroDB Extra Package
 * --
- * Copyright 2011-2021 (C) KaiGai Kohei
- * Copyright 2017-2021 (C) HeteroDB,Inc
+ * Copyright 2011-2023 (C) KaiGai Kohei
+ * Copyright 2017-2023 (C) HeteroDB,Inc
 *
 * This software is an extension of PostgreSQL; You can use, copy,
 * modify or distribute it under the terms of 'LICENSE' included
@@ -16,7 +16,7 @@
 #define HETERODB_EXTRA_FILENAME "heterodb_extra.so"
 #define HETERODB_EXTRA_PATHNAME "/usr/lib64/" HETERODB_EXTRA_FILENAME
-#define HETERODB_EXTRA_API_VERSION 20211018
+#define HETERODB_EXTRA_API_VERSION 20221225
 
 /* gpudirect.c */
 typedef struct
@@ -33,26 +33,6 @@
     strom_io_chunk ioc[1];
 } strom_io_vector;
 
-typedef struct GPUDirectFileDesc
-{
-    int rawfd;
-    void *fhandle;
-    size_t bytesize;
-    /* CUfileHandle_t is an alias of 'void *' defined at cufile.h */
-} GPUDirectFileDesc;
-
-/* sysfs.c */
-typedef struct
-{
-    int device_id;
-    char device_name[128];
-    const char *cpu_affinity; /* __internal use__ */
-    int pci_domain; /* PCI_DOMAIN_ID */
-    int pci_bus_id; /* PCI_BUS_ID */
-    int pci_dev_id; /* PCI_DEVICE_ID */
-    int pci_func_id; /* MULTI_GPU_BOARD ? MULTI_GPU_BOARD_GROUP_ID : 0 */
-} GpuPciDevItem;
-
 /* misc.c */
 typedef struct
 {
diff --git a/src/main.c b/src/main.c
index f5281bec8..f3d13c8a6 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,10 +1,10 @@
 /*
 * main.c
 *
- * Entrypoint of PG-Strom extension, and misc uncategolized functions.
+ * Entrypoint of PG-Strom extension
 * ----
- * Copyright 2011-2021 (C) KaiGai Kohei
- * Copyright 2014-2021 (C) PG-Strom Developers Team
+ * Copyright 2011-2023 (C) KaiGai Kohei
+ * Copyright 2014-2023 (C) PG-Strom Developers Team
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the PostgreSQL License.
@@ -13,36 +13,20 @@ PG_MODULE_MAGIC; -/* - * miscellaneous GUC parameters - */ -bool pgstrom_enabled; -bool pgstrom_cpu_fallback_enabled; -bool pgstrom_regression_test_mode; - -/* cost factors */ -double pgstrom_gpu_setup_cost; -double pgstrom_gpu_dma_cost; -double pgstrom_gpu_operator_cost; - -/* misc static variables */ -static HTAB *gpu_path_htable = NULL; -static planner_hook_type planner_hook_next = NULL; -static CustomPathMethods pgstrom_dummy_path_methods; -static CustomScanMethods pgstrom_dummy_plan_methods; - -/* for compatibility of shmem_request_hook in PG14 or former */ -#if PG_VERSION_NUM < 150000 -shmem_request_hook_type shmem_request_hook = NULL; -#endif - /* misc variables */ +bool pgstrom_enabled; /* GUC */ +bool pgstrom_cpu_fallback_enabled; /* GUC */ +bool pgstrom_regression_test_mode; /* GUC */ +int pgstrom_max_async_tasks; /* GUC */ long PAGE_SIZE; long PAGE_MASK; int PAGE_SHIFT; long PHYS_PAGES; -int pgstrom_num_users_extra = 0; -pgstromUsersExtraDescriptor pgstrom_users_extra_desc[8]; +long PAGES_PER_BLOCK; + +static planner_hook_type planner_hook_next = NULL; +static CustomPathMethods pgstrom_dummy_path_methods; +static CustomScanMethods pgstrom_dummy_plan_methods; /* pg_strom.githash() */ PG_FUNCTION_INFO_V1(pgstrom_githash); @@ -53,26 +37,38 @@ pgstrom_githash(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(cstring_to_text(PGSTROM_GITHASH)); #else PG_RETURN_NULL(); -#endif +#endif +} + +/* + * pg_kern_ereport - raise an ereport at host side + */ +void +pg_kern_ereport(kern_context *kcxt) +{ + ereport(ERROR, (errcode(kcxt->errcode), + errmsg("%s:%u %s", + kcxt->error_filename, + kcxt->error_lineno, + kcxt->error_message))); } +/* + * pg_hash_any - the standard hash function at device code + */ +uint32_t +pg_hash_any(const void *ptr, int sz) +{ + return (uint32_t)hash_any((const unsigned char *)ptr, sz); +} + +/* + * pgstrom_init_gucs + */ static void -pgstrom_init_common_guc(void) +pgstrom_init_gucs(void) { - if (cpu_only_mode()) - { - /* Disables PG-Strom features by GPU */ - DefineCustomBoolVariable("pg_strom.enabled", - "Enables the planner's use of PG-Strom", - NULL, - &pgstrom_enabled, - false, - PGC_INTERNAL, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - return; - } - /* turn on/off PG-Strom feature */ + /* Disables PG-Strom features at all */ DefineCustomBoolVariable("pg_strom.enabled", "Enables the planner's use of PG-Strom", NULL, @@ -90,39 +86,6 @@ pgstrom_init_common_guc(void) PGC_USERSET, GUC_NOT_IN_SAMPLE, NULL, NULL, NULL); - /* cost factor for Gpu setup */ - DefineCustomRealVariable("pg_strom.gpu_setup_cost", - "Cost to setup GPU device to run", - NULL, - &pgstrom_gpu_setup_cost, - 4000 * DEFAULT_SEQ_PAGE_COST, - 0, - DBL_MAX, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* cost factor for each Gpu task */ - DefineCustomRealVariable("pg_strom.gpu_dma_cost", - "Cost to send/recv data via DMA", - NULL, - &pgstrom_gpu_dma_cost, - 10 * DEFAULT_SEQ_PAGE_COST, - 0, - DBL_MAX, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* cost factor for Gpu operator */ - DefineCustomRealVariable("pg_strom.gpu_operator_cost", - "Cost of processing each operators by GPU", - NULL, - &pgstrom_gpu_operator_cost, - DEFAULT_CPU_OPERATOR_COST / 16.0, - 0, - DBL_MAX, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); /* disables some platform specific EXPLAIN output */ DefineCustomBoolVariable("pg_strom.regression_test_mode", "Disables some platform specific output in EXPLAIN; that can lead undesired test failed but harmless", @@ -132,10 +95,20 @@ 
pgstrom_init_common_guc(void)
                             PGC_USERSET,
                             GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE,
                             NULL, NULL, NULL);
+    DefineCustomIntVariable("pg_strom.max_async_tasks",
+                            "Limit of concurrent execution at the xPU devices",
+                            NULL,
+                            &pgstrom_max_async_tasks,
+                            7,
+                            1,
+                            255,
+                            PGC_SUSET,
+                            GUC_NOT_IN_SAMPLE,
+                            NULL, NULL, NULL);
 }
 
 /*
- * GPU-aware path tracker
+ * xPU-aware path tracker
 *
 * motivation: add_path() and add_partial_path() keeps only cheapest paths.
 * Once some other dominates GpuXXX paths, it shall be wiped out, even if
@@ -147,133 +120,114 @@
 typedef struct
 {
     PlannerInfo *root;
     Relids relids;
-    bool outer_parallel;
-    bool inner_parallel;
-    const Path *cheapest_gpu_path;
-} gpu_path_entry;
+    bool parallel_path;
+    uint32_t devkind; /* one of DEVKIND_* */
+    CustomPath *cpath;
+} custom_path_entry;
+
+static HTAB *custom_path_htable = NULL;
 
 static uint32
-gpu_path_entry_hashvalue(const void *key, Size keysize)
+custom_path_entry_hashvalue(const void *key, Size keysize)
 {
-    gpu_path_entry *gent = (gpu_path_entry *)key;
-    uint32 hash;
-    uint32 flags = 0;
+    custom_path_entry *cent = (custom_path_entry *)key;
+    uint32 hash;
 
-    hash = hash_uint32(((uintptr_t)gent->root & 0xffffffffUL) ^
-                       ((uintptr_t)gent->root >> 32));
-    if (gent->relids != NULL)
-    {
-        Bitmapset *relids = gent->relids;
-
-        hash ^= hash_any((unsigned char *)relids,
-                         offsetof(Bitmapset, words[relids->nwords]));
-    }
-    if (gent->outer_parallel)
-        flags |= 0x01;
-    if (gent->inner_parallel)
-        flags |= 0x02;
-    hash ^= hash_uint32(flags);
+    hash = hash_bytes((unsigned char *)&cent->root, sizeof(PlannerInfo *));
+    hash ^= bms_hash_value(cent->relids);
+    if (cent->parallel_path)
+        hash ^= 0x9e3779b9U;
+    hash ^= hash_uint32(cent->devkind);
 
     return hash;
 }
 
 static int
-gpu_path_entry_compare(const void *key1, const void *key2, Size keysize)
+custom_path_entry_compare(const void *key1, const void *key2, Size keysize)
 {
-    gpu_path_entry *gent1 = (gpu_path_entry *)key1;
-    gpu_path_entry *gent2 = (gpu_path_entry *)key2;
+    custom_path_entry *cent1 = (custom_path_entry *)key1;
+    custom_path_entry *cent2 = (custom_path_entry *)key2;
 
-    if (gent1->root == gent2->root &&
-        bms_equal(gent1->relids, gent2->relids) &&
-        gent1->outer_parallel == gent2->outer_parallel &&
-        gent1->inner_parallel == gent2->inner_parallel)
+    if (cent1->root == cent2->root &&
+        bms_equal(cent1->relids, cent2->relids) &&
+        cent1->parallel_path == cent2->parallel_path &&
+        cent1->devkind == cent2->devkind)
        return 0;
    /* not equal */
    return 1;
 }
 
-static void *
-gpu_path_entry_keycopy(void *dest, const void *src, Size keysize)
-{
-    gpu_path_entry *dent = (gpu_path_entry *)dest;
-    const gpu_path_entry *sent = (const gpu_path_entry *)src;
-
-    dent->root = sent->root;
-    dent->relids = bms_copy(sent->relids);
-    dent->outer_parallel = sent->outer_parallel;
-    dent->inner_parallel = sent->inner_parallel;
-
-    return dest;
-}
-
-const Path *
-gpu_path_find_cheapest(PlannerInfo *root, RelOptInfo *rel,
-                       bool outer_parallel,
-                       bool inner_parallel)
+CustomPath *
+custom_path_find_cheapest(PlannerInfo *root,
+                          RelOptInfo *rel,
+                          bool parallel_path,
+                          uint32_t devkind)
 {
-    gpu_path_entry hkey;
-    gpu_path_entry *gent;
+    custom_path_entry hkey;
+    custom_path_entry *cent;
 
-    memset(&hkey, 0, sizeof(gpu_path_entry));
+    memset(&hkey, 0, sizeof(custom_path_entry));
     hkey.root = root;
     hkey.relids = rel->relids;
-    hkey.outer_parallel = outer_parallel;
-    hkey.inner_parallel = inner_parallel;
+    hkey.parallel_path = (parallel_path ? 
true : false); + hkey.devkind = (devkind & DEVKIND__ANY); - gent = hash_search(gpu_path_htable, &hkey, HASH_FIND, NULL); - if (!gent) + cent = hash_search(custom_path_htable, &hkey, HASH_FIND, NULL); + if (!cent) return NULL; - return gent->cheapest_gpu_path; + return cent->cpath; } bool -gpu_path_remember(PlannerInfo *root, RelOptInfo *rel, - bool outer_parallel, - bool inner_parallel, - const Path *gpu_path) +custom_path_remember(PlannerInfo *root, + RelOptInfo *rel, + bool parallel_path, + uint32_t devkind, + const CustomPath *cpath) { - gpu_path_entry hkey; - gpu_path_entry *gent; - bool found; + custom_path_entry hkey; + custom_path_entry *cent; + bool found; - memset(&hkey, 0, sizeof(gpu_path_entry)); + Assert((devkind & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU || + (devkind & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU); + memset(&hkey, 0, sizeof(custom_path_entry)); hkey.root = root; hkey.relids = rel->relids; - hkey.outer_parallel = outer_parallel; - hkey.inner_parallel = inner_parallel; + hkey.parallel_path = (parallel_path ? true : false); + hkey.devkind = (devkind & DEVKIND__ANY); - gent = hash_search(gpu_path_htable, &hkey, HASH_ENTER, &found); + cent = hash_search(custom_path_htable, &hkey, HASH_ENTER, &found); if (found) { /* new path is more expensive than prior one! */ - if (gent->cheapest_gpu_path->total_cost < gpu_path->total_cost) + if (cent->cpath->path.total_cost <= cpath->path.total_cost) return false; } - Assert(gent->root == root && - bms_equal(gent->relids, rel->relids) && - gent->outer_parallel == outer_parallel && - gent->inner_parallel == inner_parallel); - gent->cheapest_gpu_path = pgstrom_copy_pathnode(gpu_path); + cent->cpath = (CustomPath *)pgstrom_copy_pathnode(&cpath->path); return true; } -/* - * pgstrom_create_dummy_path - */ +/* -------------------------------------------------------------------------------- + * + * add/remove dummy plan node + * + * -------------------------------------------------------------------------------- */ Path * pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath) { - CustomPath *cpath = makeNode(CustomPath); - PathTarget *final_target = root->upper_targets[UPPERREL_FINAL]; - ListCell *lc1; - ListCell *lc2; + CustomPath *cpath = makeNode(CustomPath); + RelOptInfo *upper_rel = subpath->parent; + PathTarget *upper_target = upper_rel->reltarget; + PathTarget *sub_target = subpath->pathtarget; + ListCell *lc1, *lc2; /* sanity checks */ - if (list_length(final_target->exprs) != list_length(subpath->pathtarget->exprs)) + if (list_length(upper_target->exprs) != list_length(sub_target->exprs)) elog(ERROR, "CustomScan(dummy): incompatible tlist is supplied"); - forboth (lc1, final_target->exprs, - lc2, subpath->pathtarget->exprs) + forboth (lc1, upper_target->exprs, + lc2, sub_target->exprs) { Node *node1 = lfirst(lc1); Node *node2 = lfirst(lc2); @@ -283,10 +237,10 @@ pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath) nodeToString(node1), nodeToString(node2)); } - + Assert(subpath->parent == upper_rel); cpath->path.pathtype = T_CustomScan; - cpath->path.parent = subpath->parent; - cpath->path.pathtarget = final_target; + cpath->path.parent = upper_rel; + cpath->path.pathtarget = upper_target; cpath->path.param_info = NULL; cpath->path.parallel_aware = subpath->parallel_aware; cpath->path.parallel_safe = subpath->parallel_safe; @@ -297,7 +251,7 @@ pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath) cpath->path.total_cost = subpath->total_cost; cpath->custom_paths = list_make1(subpath); - cpath->methods = 
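/*
 * Why this cache exists: add_path() keeps only the cheapest paths per
 * RelOptInfo, so a device path that is locally dominated would be freed
 * even though PG-Strom may still want to stack another device-resident
 * node on it later. The pattern is remember-then-lookup; a hedged usage
 * sketch (variables and control flow are illustrative):
 *
 *   // when a scan/join CustomPath has been built
 *   if (custom_path_remember(root, rel, false, DEVKIND__NVIDIA_GPU, cpath))
 *       add_path(rel, &cpath->path);
 *
 *   // when constructing the next device-resident node on top
 *   CustomPath *prev = custom_path_find_cheapest(root, rel, false,
 *                                                DEVKIND__NVIDIA_GPU);
 *   if (prev != NULL)
 *       build_next_node_on_top_of(prev);   // hypothetical helper
 */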
&pgstrom_dummy_path_methods; + cpath->methods = &pgstrom_dummy_path_methods; return &cpath->path; } @@ -358,22 +312,9 @@ pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) Assert(plan != NULL); switch (nodeTag(plan)) { -#if PG_VERSION_NUM < 140000 - /* - * PG14 changed ModifyTable to use lefttree to save its subplan. - */ - case T_ModifyTable: - { - ModifyTable *splan = (ModifyTable *) plan; - - foreach (lc, splan->plans) - pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); - } - break; -#endif case T_Append: { - Append *splan = (Append *) plan; + Append *splan = (Append *)plan; foreach (lc, splan->appendplans) pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); @@ -382,7 +323,7 @@ pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) case T_MergeAppend: { - MergeAppend *splan = (MergeAppend *) plan; + MergeAppend *splan = (MergeAppend *)plan; foreach (lc, splan->mergeplans) pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); @@ -391,7 +332,7 @@ pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) case T_BitmapAnd: { - BitmapAnd *splan = (BitmapAnd *) plan; + BitmapAnd *splan = (BitmapAnd *)plan; foreach (lc, splan->bitmapplans) pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); @@ -400,7 +341,7 @@ pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) case T_BitmapOr: { - BitmapOr *splan = (BitmapOr *) plan; + BitmapOr *splan = (BitmapOr *)plan; foreach (lc, splan->bitmapplans) pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); @@ -409,7 +350,7 @@ pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) case T_SubqueryScan: { - SubqueryScan *sscan = (SubqueryScan *) plan; + SubqueryScan *sscan = (SubqueryScan *)plan; pgstrom_removal_dummy_plans(pstmt, &sscan->subplan); } @@ -417,24 +358,25 @@ pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) case T_CustomScan: { - CustomScan *cscan = (CustomScan *) plan; + CustomScan *cscan = (CustomScan *)plan; if (cscan->methods == &pgstrom_dummy_plan_methods) { Plan *subplan = outerPlan(cscan); ListCell *lc1, *lc2; + /* sanity checks */ + Assert(innerPlan(cscan) == NULL); if (list_length(cscan->scan.plan.targetlist) != list_length(subplan->targetlist)) elog(ERROR, "Bug? dummy plan's targelist length mismatch"); forboth (lc1, cscan->scan.plan.targetlist, - lc2, subplan->targetlist) + lc2, subplan->targetlist) { TargetEntry *tle1 = lfirst(lc1); TargetEntry *tle2 = lfirst(lc2); - if (exprType((Node *)tle1->expr) != - exprType((Node *)tle2->expr)) + if (exprType((Node *)tle1->expr) != exprType((Node *)tle2->expr)) elog(ERROR, "Bug? 
dummy TLE type mismatch [%s] [%s]", nodeToString(tle1), nodeToString(tle2)); @@ -451,6 +393,7 @@ pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) break; default: + /* nothing special sub-plans */ break; } if (plan->lefttree) @@ -464,53 +407,46 @@ pgstrom_removal_dummy_plans(PlannedStmt *pstmt, Plan **p_plan) */ static PlannedStmt * pgstrom_post_planner(Query *parse, -#if PG_VERSION_NUM >= 130000 const char *query_string, -#endif int cursorOptions, ParamListInfo boundParams) { - HTAB *gpu_path_htable_saved = gpu_path_htable; - PlannedStmt *pstmt; - ListCell *lc; + HTAB *custom_path_htable_saved = custom_path_htable; + HASHCTL hctl; + PlannedStmt *pstmt; + ListCell *lc; PG_TRY(); { - HASHCTL hctl; - - /* make hash-table to preserve GPU-aware path-nodes */ memset(&hctl, 0, sizeof(HASHCTL)); hctl.hcxt = CurrentMemoryContext; - hctl.keysize = offsetof(gpu_path_entry, cheapest_gpu_path); - hctl.entrysize = sizeof(gpu_path_entry); - hctl.hash = gpu_path_entry_hashvalue; - hctl.match = gpu_path_entry_compare; - hctl.keycopy = gpu_path_entry_keycopy; - gpu_path_htable = hash_create("GPU-aware Path-nodes table", - 512, - &hctl, - HASH_CONTEXT | - HASH_ELEM | - HASH_FUNCTION | - HASH_COMPARE | - HASH_KEYCOPY); + hctl.keysize = offsetof(custom_path_entry, cpath); + hctl.entrysize = sizeof(custom_path_entry); + hctl.hash = custom_path_entry_hashvalue; + hctl.match = custom_path_entry_compare; + custom_path_htable = hash_create("HTable to preserve Custom-Paths", + 512, + &hctl, + HASH_CONTEXT | + HASH_ELEM | + HASH_FUNCTION | + HASH_COMPARE); pstmt = planner_hook_next(parse, -#if PG_VERSION_NUM >= 130000 query_string, -#endif cursorOptions, boundParams); } PG_CATCH(); { - hash_destroy(gpu_path_htable); - gpu_path_htable = gpu_path_htable_saved; + hash_destroy(custom_path_htable); + custom_path_htable = custom_path_htable_saved; PG_RE_THROW(); } PG_END_TRY(); - hash_destroy(gpu_path_htable); - gpu_path_htable = gpu_path_htable_saved; + hash_destroy(custom_path_htable); + custom_path_htable = custom_path_htable_saved; + /* remove dummy plan */ pgstrom_removal_dummy_plans(pstmt, &pstmt->planTree); foreach (lc, pstmt->subplans) pgstrom_removal_dummy_plans(pstmt, (Plan **)&lfirst(lc)); @@ -519,34 +455,12 @@ pgstrom_post_planner(Query *parse, } /* - * Routines to support user's extra GPU logic + * pgstrom_sigpoll_handler */ -uint32 -pgstrom_register_users_extra(const pgstromUsersExtraDescriptor *__desc) +static void +pgstrom_sigpoll_handler(SIGNAL_ARGS) { - pgstromUsersExtraDescriptor *desc; - const char *extra_name; - uint32 extra_flags; - - if (pgstrom_num_users_extra >= 7) - elog(ERROR, "too much PG-Strom users' extra module is registered"); - if (__desc->magic != PGSTROM_USERS_EXTRA_MAGIC_V1) - elog(ERROR, "magic number of pgstromUsersExtraDescriptor mismatch"); - if (__desc->pg_version / 100 != PG_MAJOR_VERSION) - elog(ERROR, "PG-Strom Users Extra is built for %u", __desc->pg_version); - - extra_name = strdup(__desc->extra_name); - if (!extra_name) - elog(ERROR, "out of memory"); - extra_flags = (1U << (pgstrom_num_users_extra + 24)); - - desc = &pgstrom_users_extra_desc[pgstrom_num_users_extra++]; - memcpy(desc, __desc, sizeof(pgstromUsersExtraDescriptor)); - desc->extra_flags = extra_flags; - desc->extra_name = extra_name; - elog(LOG, "PG-Strom users's extra [%s] registered", extra_name); - - return extra_flags; + /* do nothing here, but invocation of this handler may wake up epoll(2) / poll(2) */ } /* @@ -560,69 +474,60 @@ void _PG_init(void) { /* - * PG-Strom has to be loaded using 
shared_preload_libraries option + * PG-Strom must be loaded using shared_preload_libraries */ if (!process_shared_preload_libraries_in_progress) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("PG-Strom must be loaded via shared_preload_libraries"))); - + errmsg("PG-Strom must be loaded via shared_preload_libraries"))); /* init misc variables */ PAGE_SIZE = sysconf(_SC_PAGESIZE); PAGE_MASK = PAGE_SIZE - 1; PAGE_SHIFT = get_next_log2(PAGE_SIZE); PHYS_PAGES = sysconf(_SC_PHYS_PAGES); + PAGES_PER_BLOCK = BLCKSZ / PAGE_SIZE; - /* load NVIDIA/HeteroDB related stuff, if any */ - pgstrom_init_nvrtc(); + /* init pg-strom infrastructure */ + pgstrom_init_gucs(); pgstrom_init_extra(); - + pgstrom_init_codegen(); + pgstrom_init_relscan(); + pgstrom_init_brin(); + pgstrom_init_arrow_fdw(); + pgstrom_init_executor(); /* dump version number */ elog(LOG, "PG-Strom version %s built for PostgreSQL %s (git: %s)", PGSTROM_VERSION, PG_MAJORVERSION, PGSTROM_GITHASH); - - /* init GPU/CUDA infrastracture */ - pgstrom_init_shmbuf(); - pgstrom_init_gpu_device(); - pgstrom_init_gpu_mmgr(); - pgstrom_init_gpu_context(); - pgstrom_init_cuda_program(); - pgstrom_init_codegen(); - - /* init custom-scan providers/FDWs */ - pgstrom_init_common_guc(); - pgstrom_init_gputasks(); - pgstrom_init_gpuscan(); - pgstrom_init_gpujoin(); - pgstrom_init_gpupreagg(); - pgstrom_init_relscan(); - pgstrom_init_arrow_fdw(); - pgstrom_init_gpu_cache(); - -#if PG_VERSION_NUM < 150000 - /* - * PG15 enforces shared memory requirement is added in the 'shmem_request_hook' - * but PG14 or former don't have such infrastructure. So, we provide our own - * infrastructure with same name and definition. - */ - if (shmem_request_hook) - shmem_request_hook(); -#endif - + /* init GPU related stuff */ + if (pgstrom_init_gpu_device()) + { + pgstrom_init_gpu_service(); + pgstrom_init_gpu_scan(); + pgstrom_init_gpu_join(); + pgstrom_init_gpu_preagg(); + } + /* init DPU related stuff */ + if (pgstrom_init_dpu_device()) + { + pgstrom_init_dpu_scan(); + pgstrom_init_dpu_join(); + pgstrom_init_dpu_preagg(); + } + pgstrom_init_pcie(); /* dummy custom-scan node */ memset(&pgstrom_dummy_path_methods, 0, sizeof(CustomPathMethods)); - pgstrom_dummy_path_methods.CustomName = "Dummy"; - pgstrom_dummy_path_methods.PlanCustomPath - = pgstrom_dummy_create_plan; + pgstrom_dummy_path_methods.CustomName = "Dummy"; + pgstrom_dummy_path_methods.PlanCustomPath = pgstrom_dummy_create_plan; memset(&pgstrom_dummy_plan_methods, 0, sizeof(CustomScanMethods)); - pgstrom_dummy_plan_methods.CustomName = "Dummy"; - pgstrom_dummy_plan_methods.CreateCustomScanState - = pgstrom_dummy_create_scan_state; + pgstrom_dummy_plan_methods.CustomName = "Dummy"; + pgstrom_dummy_plan_methods.CreateCustomScanState = pgstrom_dummy_create_scan_state; - /* planner hook registration */ + /* post planner hook */ planner_hook_next = (planner_hook ? planner_hook : standard_planner); planner_hook = pgstrom_post_planner; + /* signal handler for wake up */ + pqsignal(SIGPOLL, pgstrom_sigpoll_handler); } diff --git a/src/misc.c b/src/misc.c index c5e89bf7b..f7812de0d 100644 --- a/src/misc.c +++ b/src/misc.c @@ -4,44 +4,14 @@ * miscellaneous and uncategorized routines but usefull for multiple subsystems * of PG-Strom. 
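 *
 * A side note on the hook wiring in _PG_init() above: it follows the usual
 * PostgreSQL convention of chaining, saving whatever planner_hook was
 * installed before (or standard_planner) and calling it from the wrapper,
 * so several extensions can stack. Minimal sketch of the same pattern
 * (illustrative names; PG13+ signature):
 *
 *     static planner_hook_type planner_hook_next = NULL;
 *
 *     static PlannedStmt *
 *     my_planner(Query *parse, const char *query_string,
 *                int cursorOptions, ParamListInfo boundParams)
 *     {
 *         PlannedStmt *pstmt;
 *
 *         // per-planning setup (e.g., create the path cache)
 *         pstmt = planner_hook_next(parse, query_string,
 *                                   cursorOptions, boundParams);
 *         // per-planning teardown / plan-tree fixup
 *         return pstmt;
 *     }
 *
 *     void
 *     _PG_init(void)
 *     {
 *         planner_hook_next = (planner_hook ? planner_hook : standard_planner);
 *         planner_hook = my_planner;
 *     }
 *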
* ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. */ #include "pg_strom.h" -/* - * make_flat_ands_expr - similar to make_ands_explicit but it pulls up - * underlying and-clause - */ -Expr * -make_flat_ands_explicit(List *andclauses) -{ - List *args = NIL; - ListCell *lc; - - if (andclauses == NIL) - return (Expr *) makeBoolConst(true, false); - else if (list_length(andclauses) == 1) - return (Expr *) linitial(andclauses); - - foreach (lc, andclauses) - { - Expr *expr = lfirst(lc); - - Assert(exprType((Node *)expr) == BOOLOID); - if (IsA(expr, BoolExpr) && - ((BoolExpr *)expr)->boolop == AND_EXPR) - args = list_concat(args, ((BoolExpr *) expr)->args); - else - args = lappend(args, expr); - } - Assert(list_length(args) > 1); - return make_andclause(args); -} - /* * fixup_varnode_to_origin */ @@ -69,6 +39,7 @@ fixup_varnode_to_origin(Node *node, List *cscan_tlist) (void *)cscan_tlist); } +#if 0 /* * find_appinfos_by_relids_nofail * @@ -180,110 +151,40 @@ get_parallel_divisor(Path *path) } return parallel_divisor; } +#endif /* - * Usefulll wrapper routines like lsyscache.c + * append a binary chunk at the aligned block */ -#if PG_VERSION_NUM < 110000 -char -get_func_prokind(Oid funcid) -{ - HeapTuple tup; - Form_pg_proc procForm; - char prokind; - - tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for function %u", funcid); - procForm = (Form_pg_proc) GETSTRUCT(tup); - if (procForm->proisagg) - { - Assert(!procForm->proiswindow); - prokind = PROKIND_AGGREGATE; - } - else if (procForm->proiswindow) - { - Assert(!procForm->proisagg); - prokind = PROKIND_WINDOW; - } - else - { - prokind = PROKIND_FUNCTION; - } - ReleaseSysCache(tup); - - return prokind; -} -#endif /* relnatts; - ReleaseSysCache(tup); - } - return relnatts; -} - -/* - * get_function_oid - */ -Oid -get_function_oid(const char *func_name, - oidvector *func_args, - Oid namespace_oid, - bool missing_ok) +__appendBinaryStringInfo(StringInfo buf, const void *data, int datalen) { - Oid func_oid; - - func_oid = GetSysCacheOid3(PROCNAMEARGSNSP, -#if PG_VERSION_NUM >= 120000 - Anum_pg_proc_oid, -#endif - CStringGetDatum(func_name), - PointerGetDatum(func_args), - ObjectIdGetDatum(namespace_oid)); - if (!missing_ok && !OidIsValid(func_oid)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_FUNCTION), - errmsg("function %s is not defined", - funcname_signature_string(func_name, - func_args->dim1, - NIL, - func_args->values)))); - return func_oid; + static uint64_t __zero = 0; + int padding = (MAXALIGN(buf->len) - buf->len); + int pos; + + if (padding > 0) + appendBinaryStringInfo(buf, (char *)&__zero, padding); + pos = buf->len; + appendBinaryStringInfo(buf, data, datalen); + return pos; } -/* - * get_type_oid - */ -Oid -get_type_oid(const char *type_name, - Oid namespace_oid, - bool missing_ok) +int +__appendZeroStringInfo(StringInfo buf, int nbytes) { - Oid type_oid; - - type_oid = GetSysCacheOid2(TYPENAMENSP, -#if PG_VERSION_NUM >= 120000 - Anum_pg_type_oid, -#endif - CStringGetDatum(type_name), - ObjectIdGetDatum(namespace_oid)); - if (!missing_ok && !OidIsValid(type_oid)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("type %s is not defined", type_name))); - - return type_oid; + static 
uint64_t __zero = 0; + int padding = (MAXALIGN(buf->len) - buf->len); + int pos; + + if (padding > 0) + appendBinaryStringInfo(buf, (char *)&__zero, padding); + pos = buf->len; + enlargeStringInfo(buf, nbytes); + memset(buf->data + pos, 0, nbytes); + buf->len += nbytes; + + return pos; } /* @@ -309,102 +210,25 @@ get_type_name(Oid type_oid, bool missing_ok) } /* - * get_proc_library - */ -char * -get_proc_library(HeapTuple protup) -{ - Form_pg_proc proc = (Form_pg_proc)GETSTRUCT(protup); - - if (proc->prolang == ClanguageId) - { - Datum datum; - bool isnull; - - datum = SysCacheGetAttr(PROCOID, protup, - Anum_pg_proc_probin, - &isnull); - if (!isnull) - return TextDatumGetCString(datum); - } - else if (proc->prolang != INTERNALlanguageId && - proc->prolang != SQLlanguageId) - { - return (void *)(~0UL); - } - return NULL; -} - -/* - * get_object_extension_oid + * get_relation_am */ Oid -get_object_extension_oid(Oid class_id, - Oid object_id, - int32 objsub_id, - bool missing_ok) +get_relation_am(Oid rel_oid, bool missing_ok) { - Relation drel; - ScanKeyData skeys[3]; - SysScanDesc sscan; HeapTuple tup; - Oid ext_oid = InvalidOid; - - drel = table_open(DependRelationId, AccessShareLock); - - ScanKeyInit(&skeys[0], - Anum_pg_depend_classid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(class_id)); - ScanKeyInit(&skeys[1], - Anum_pg_depend_objid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(object_id)); - ScanKeyInit(&skeys[2], - Anum_pg_depend_objsubid, - BTEqualStrategyNumber, F_INT4EQ, - Int32GetDatum(objsub_id)); - sscan = systable_beginscan(drel, DependDependerIndexId, true, - NULL, 3, skeys); - while (HeapTupleIsValid(tup = systable_getnext(sscan))) - { - Form_pg_depend dep = (Form_pg_depend) GETSTRUCT(tup); + Oid relam; - if (dep->refclassid == ExtensionRelationId && - dep->refobjsubid == 0 && - (dep->deptype == DEPENDENCY_EXTENSION || - dep->deptype == DEPENDENCY_AUTO_EXTENSION)) - { - ext_oid = dep->refobjid; - break; - } + tup = SearchSysCache1(RELOID, ObjectIdGetDatum(rel_oid)); + if (!HeapTupleIsValid(tup)) + { + if (!missing_ok) + elog(ERROR, "cache lookup failed for relation %u", rel_oid); + return InvalidOid; } - systable_endscan(sscan); - table_close(drel, AccessShareLock); - - if (!missing_ok && !OidIsValid(ext_oid)) - elog(ERROR, "couldn't find out references (class:%u, objid:%u, subid:%d) by pg_extension at pg_depend", - class_id, object_id, objsub_id); - - return ext_oid; -} - -/* - * bms_to_cstring - human readable Bitmapset - */ -char * -bms_to_cstring(Bitmapset *bms) -{ - StringInfoData buf; - int bit = -1; - - initStringInfo(&buf); - appendStringInfo(&buf, "{"); - while ((bit = bms_next_member(bms, bit)) >= 0) - appendStringInfo(&buf, " %d", bit); - appendStringInfo(&buf, " }"); + relam = ((Form_pg_class) GETSTRUCT(tup))->relam; + ReleaseSysCache(tup); - return buf.data; + return relam; } /* @@ -438,6 +262,33 @@ bms_from_pglist(List *pglist) return bms; } +Float * +__makeFloat(double fval) +{ + return makeFloat(psprintf("%e", fval)); +} + +Const * +__makeByteaConst(bytea *data) +{ + return makeConst(BYTEAOID, + -1, + InvalidOid, + -1, + PointerGetDatum(data), + data == NULL, + false); +} + +bytea * +__getByteaConst(Const *con) +{ + Assert(IsA(con, Const) && con->consttype == BYTEAOID); + + return (con->constisnull ? 
NULL : DatumGetByteaP(con->constvalue)); +} + +#if 0 /* * pathnode_tree_walker */ @@ -517,22 +368,18 @@ pathnode_tree_walker(Path *node, if (walker(((GatherPath *)node)->subpath, context)) return true; break; -#if PG_VERSION_NUM >= 100000 case T_GatherMergePath: if (walker(((GatherMergePath *)node)->subpath, context)) return true; break; -#endif /* >= PG10 */ case T_ProjectionPath: if (walker(((ProjectionPath *)node)->subpath, context)) return true; break; -#if PG_VERSION_NUM >= 100000 case T_ProjectSetPath: if (walker(((ProjectSetPath *)node)->subpath, context)) return true; break; -#endif /* >= PG10 */ case T_SortPath: if (walker(((SortPath *)node)->subpath, context)) return true; @@ -628,6 +475,7 @@ pathtree_has_parallel_aware(Path *node) { return __pathtree_has_parallel_aware(node, NULL); } +#endif /* * pgstrom_copy_pathnode @@ -685,21 +533,17 @@ pgstrom_copy_pathnode(const Path *pathnode) return &b->path; } case T_CustomPath: - if (pgstrom_path_is_gpuscan(pathnode)) - return pgstrom_copy_gpuscan_path(pathnode); - else if (pgstrom_path_is_gpujoin(pathnode)) - return pgstrom_copy_gpujoin_path(pathnode); - else if (pgstrom_path_is_gpupreagg(pathnode)) - return pgstrom_copy_gpupreagg_path(pathnode); - else { CustomPath *a = (CustomPath *)pathnode; CustomPath *b = pmemdup(a, sizeof(CustomPath)); List *subpaths = NIL; ListCell *lc; + foreach (lc, a->custom_paths) - subpaths = lappend(subpaths, - pgstrom_copy_pathnode(lfirst(lc))); + { + Path *sp = pgstrom_copy_pathnode(lfirst(lc)); + subpaths = lappend(subpaths, sp); + } b->custom_paths = subpaths; return &b->path; } @@ -735,13 +579,8 @@ pgstrom_copy_pathnode(const Path *pathnode) b->subpaths = subpaths; return &b->path; } -#if PG_VERSION_NUM < 120000 - case T_ResultPath: - return pmemdup(pathnode, sizeof(ResultPath)); -#else case T_GroupResultPath: return pmemdup(pathnode, sizeof(GroupResultPath)); -#endif case T_MaterialPath: { MaterialPath *a = (MaterialPath *)pathnode; @@ -749,6 +588,13 @@ pgstrom_copy_pathnode(const Path *pathnode) b->subpath = pgstrom_copy_pathnode(a->subpath); return &b->path; } + case T_MemoizePath: + { + MemoizePath *a = (MemoizePath *)pathnode; + MemoizePath *b = pmemdup(a, sizeof(MemoizePath)); + b->subpath = pgstrom_copy_pathnode(a->subpath); + return &b->path; + } case T_UniquePath: { UniquePath *a = (UniquePath *)pathnode; @@ -854,16 +700,7 @@ pgstrom_copy_pathnode(const Path *pathnode) { ModifyTablePath *a = (ModifyTablePath *)pathnode; ModifyTablePath *b = pmemdup(a, sizeof(ModifyTablePath)); -#if PG_VERSION_NUM < 140000 - List *subpaths = NIL; - ListCell *lc; - foreach (lc, a->subpaths) - subpaths = lappend(subpaths, - pgstrom_copy_pathnode(lfirst(lc))); - b->subpaths = subpaths; -#else b->subpath = pgstrom_copy_pathnode(a->subpath); -#endif return &b->path; } case T_LimitPath: @@ -879,30 +716,44 @@ pgstrom_copy_pathnode(const Path *pathnode) return NULL; } +#if 0 /* - * errorText - string form of the error code + * pgstrom_define_shell_type - A wrapper for TypeShellMake with a particular OID */ -const char * -errorText(int errcode) +PG_FUNCTION_INFO_V1(pgstrom_define_shell_type); +Datum +pgstrom_define_shell_type(PG_FUNCTION_ARGS) { - static __thread char buffer[160]; - const char *error_name; - const char *error_desc; + char *type_name = text_to_cstring(PG_GETARG_TEXT_PP(0)); + Oid type_oid = PG_GETARG_OID(1); + Oid type_namespace = PG_GETARG_OID(2); + bool __IsBinaryUpgrade = IsBinaryUpgrade; + Oid __binary_upgrade_next_pg_type_oid = binary_upgrade_next_pg_type_oid; - if (errcode >= 0 && errcode <= 
CUDA_ERROR_UNKNOWN) + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to create a shell type"))); + PG_TRY(); { - if (cuGetErrorName(errcode, &error_name) == CUDA_SUCCESS && - cuGetErrorString(errcode, &error_desc) == CUDA_SUCCESS) - { - snprintf(buffer, sizeof(buffer), "%s - %s", - error_name, error_desc); - return buffer; - } + IsBinaryUpgrade = true; + binary_upgrade_next_pg_type_oid = type_oid; + + TypeShellMake(type_name, type_namespace, GetUserId()); } - snprintf(buffer, sizeof(buffer), - "%d - unknown", errcode); - return buffer; + PG_CATCH(); + { + IsBinaryUpgrade = __IsBinaryUpgrade; + binary_upgrade_next_pg_type_oid = __binary_upgrade_next_pg_type_oid; + PG_RE_THROW(); + } + PG_END_TRY(); + IsBinaryUpgrade = __IsBinaryUpgrade; + binary_upgrade_next_pg_type_oid = __binary_upgrade_next_pg_type_oid; + + PG_RETURN_OID(type_oid); } +#endif /* * ---------------------------------------------------------------- @@ -946,15 +797,15 @@ pgstrom_random_setseed(PG_FUNCTION_ARGS) } PG_FUNCTION_INFO_V1(pgstrom_random_setseed); -static cl_long +static int64_t __random(void) { if (!pgstrom_random_seed_set) { - pgstrom_random_seed = (unsigned int)MyProcPid ^ 0xdeadbeaf; + pgstrom_random_seed = (unsigned int)MyProcPid ^ 0xdeadbeafU; pgstrom_random_seed_set = true; } - return (cl_ulong)rand_r(&pgstrom_random_seed); + return (uint64_t)rand_r(&pgstrom_random_seed); } static inline double @@ -979,7 +830,7 @@ pgstrom_random_int(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); int64 lower = (!PG_ARGISNULL(1) ? PG_GETARG_INT64(1) : 0); int64 upper = (!PG_ARGISNULL(2) ? PG_GETARG_INT64(2) : INT_MAX); - cl_ulong v; + uint64_t v; if (upper < lower) elog(ERROR, "%s: lower bound is larger than upper", __FUNCTION__); @@ -1017,7 +868,7 @@ pgstrom_random_date(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); DateADT lower; DateADT upper; - cl_ulong v; + uint64_t v; if (!PG_ARGISNULL(1)) lower = PG_GETARG_DATEADT(1); @@ -1046,7 +897,7 @@ pgstrom_random_time(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); TimeADT lower = 0; TimeADT upper = HOURS_PER_DAY * USECS_PER_HOUR - 1; - cl_ulong v; + uint64_t v; if (!PG_ARGISNULL(1)) lower = PG_GETARG_TIMEADT(1); @@ -1071,7 +922,7 @@ pgstrom_random_timetz(PG_FUNCTION_ARGS) TimeADT lower = 0; TimeADT upper = HOURS_PER_DAY * USECS_PER_HOUR - 1; TimeTzADT *temp; - cl_ulong v; + uint64_t v; if (!PG_ARGISNULL(1)) lower = PG_GETARG_TIMEADT(1); @@ -1100,7 +951,7 @@ pgstrom_random_timestamp(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); Timestamp lower; Timestamp upper; - cl_ulong v; + uint64_t v; struct pg_tm tm; if (!PG_ARGISNULL(1)) @@ -1139,18 +990,18 @@ pgstrom_random_macaddr(PG_FUNCTION_ARGS) { float8 ratio = (!PG_ARGISNULL(0) ? 
PG_GETARG_FLOAT8(0) : 0.0); macaddr *temp; - cl_ulong lower; - cl_ulong upper; - cl_ulong v, x; + uint64_t lower; + uint64_t upper; + uint64_t v, x; if (PG_ARGISNULL(1)) lower = 0xabcd00000000UL; else { temp = PG_GETARG_MACADDR_P(1); - lower = (((cl_ulong)temp->a << 40) | ((cl_ulong)temp->b << 32) | - ((cl_ulong)temp->c << 24) | ((cl_ulong)temp->d << 16) | - ((cl_ulong)temp->e << 8) | ((cl_ulong)temp->f)); + lower = (((uint64_t)temp->a << 40) | ((uint64_t)temp->b << 32) | + ((uint64_t)temp->c << 24) | ((uint64_t)temp->d << 16) | + ((uint64_t)temp->e << 8) | ((uint64_t)temp->f)); } if (PG_ARGISNULL(2)) @@ -1158,9 +1009,9 @@ pgstrom_random_macaddr(PG_FUNCTION_ARGS) else { temp = PG_GETARG_MACADDR_P(2); - upper = (((cl_ulong)temp->a << 40) | ((cl_ulong)temp->b << 32) | - ((cl_ulong)temp->c << 24) | ((cl_ulong)temp->d << 16) | - ((cl_ulong)temp->e << 8) | ((cl_ulong)temp->f)); + upper = (((uint64_t)temp->a << 40) | ((uint64_t)temp->b << 32) | + ((uint64_t)temp->c << 24) | ((uint64_t)temp->d << 16) | + ((uint64_t)temp->e << 8) | ((uint64_t)temp->f)); } if (upper < lower) @@ -1191,7 +1042,7 @@ pgstrom_random_inet(PG_FUNCTION_ARGS) float8 ratio = (!PG_ARGISNULL(0) ? PG_GETARG_FLOAT8(0) : 0.0); inet *temp; int i, j, bits; - cl_ulong v; + uint64_t v; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1224,7 +1075,7 @@ pgstrom_random_inet(PG_FUNCTION_ARGS) temp->inet_data.ipaddr[i--] = (v & 0xff); else { - cl_uint mask = (1 << bits) - 1; + uint32_t mask = (1 << bits) - 1; temp->inet_data.ipaddr[i] &= ~(mask); temp->inet_data.ipaddr[i] |= (v & mask); @@ -1246,7 +1097,7 @@ pgstrom_random_text(PG_FUNCTION_ARGS) text *temp; char *pos; int i, j, n; - cl_ulong v; + uint64_t v; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1284,11 +1135,11 @@ pgstrom_random_text_length(PG_FUNCTION_ARGS) "abcdefghijklmnopqrstuvwxyz" "0123456789+/"; float8 ratio = (!PG_ARGISNULL(0) ? 
PG_GETARG_FLOAT8(0) : 0.0); - cl_int maxlen; + int32_t maxlen; text *temp; char *pos; int i, j, n; - cl_ulong v = 0; + uint64_t v = 0; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1350,7 +1201,12 @@ pgstrom_random_int4range(PG_FUNCTION_ARGS) if (generate_null(ratio)) PG_RETURN_NULL(); - type_oid = get_type_oid("int4range", PG_CATALOG_NAMESPACE, false); + type_oid = GetSysCacheOid2(TYPENAMENSP, + Anum_pg_type_oid, + CStringGetDatum("int4range"), + ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); + if (!OidIsValid(type_oid)) + elog(ERROR, "type 'int4range' is not defined"); typcache = range_get_typcache(fcinfo, type_oid); x = lower + __random() % (upper - lower); y = lower + __random() % (upper - lower); @@ -1372,7 +1228,12 @@ pgstrom_random_int8range(PG_FUNCTION_ARGS) if (generate_null(ratio)) PG_RETURN_NULL(); - type_oid = get_type_oid("int8range", PG_CATALOG_NAMESPACE, false); + type_oid = GetSysCacheOid2(TYPENAMENSP, + Anum_pg_type_oid, + CStringGetDatum("int8range"), + ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); + if (!OidIsValid(type_oid)) + elog(ERROR, "type 'int8range' is not defined"); typcache = range_get_typcache(fcinfo, type_oid); v = (__random() << 31) | __random(); x = lower + v % (upper - lower); @@ -1394,7 +1255,7 @@ pgstrom_random_tsrange(PG_FUNCTION_ARGS) TypeCacheEntry *typcache; Oid type_oid; Timestamp x, y; - cl_ulong v; + uint64_t v; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1419,8 +1280,12 @@ pgstrom_random_tsrange(PG_FUNCTION_ARGS) } if (upper < lower) elog(ERROR, "%s: lower bound is larger than upper", __FUNCTION__); - - type_oid = get_type_oid("tsrange", PG_CATALOG_NAMESPACE, false); + type_oid = GetSysCacheOid2(TYPENAMENSP, + Anum_pg_type_oid, + CStringGetDatum("tsrange"), + ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); + if (!OidIsValid(type_oid)) + elog(ERROR, "type 'tsrange' is not defined"); typcache = range_get_typcache(fcinfo, type_oid); v = (__random() << 31) | __random(); x = lower + v % (upper - lower); @@ -1442,7 +1307,7 @@ pgstrom_random_tstzrange(PG_FUNCTION_ARGS) TypeCacheEntry *typcache; Oid type_oid; Timestamp x, y; - cl_ulong v; + uint64_t v; if (generate_null(ratio)) PG_RETURN_NULL(); @@ -1467,8 +1332,12 @@ pgstrom_random_tstzrange(PG_FUNCTION_ARGS) } if (upper < lower) elog(ERROR, "%s: lower bound is larger than upper", __FUNCTION__); - - type_oid = get_type_oid("tstzrange", PG_CATALOG_NAMESPACE, false); + type_oid = GetSysCacheOid2(TYPENAMENSP, + Anum_pg_type_oid, + CStringGetDatum("tstzrange"), + ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); + if (!OidIsValid(type_oid)) + elog(ERROR, "type 'tstzrange' is not defined"); typcache = range_get_typcache(fcinfo, type_oid); v = (__random() << 31) | __random(); x = lower + v % (upper - lower); @@ -1503,7 +1372,12 @@ pgstrom_random_daterange(PG_FUNCTION_ARGS) if (upper < lower) elog(ERROR, "%s: lower bound is larger than upper", __FUNCTION__); - type_oid = get_type_oid("daterange", PG_CATALOG_NAMESPACE, false); + type_oid = GetSysCacheOid2(TYPENAMENSP, + Anum_pg_type_oid, + CStringGetDatum("daterange"), + ObjectIdGetDatum(PG_CATALOG_NAMESPACE)); + if (!OidIsValid(type_oid)) + elog(ERROR, "type 'daterange' is not defined"); typcache = range_get_typcache(fcinfo, type_oid); x = lower + __random() % (upper - lower); y = lower + __random() % (upper - lower); @@ -1609,10 +1483,21 @@ __pwriteFile(int fdesc, const void *buffer, size_t nbytes, off_t f_pos) return count; } -/* - * mmap/munmap wrapper that is automatically unmapped on regarding to - * the resource-owner. 
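
Both trackers defined in the next hunk hinge on PostgreSQL's resource-owner callbacks: every segment or mapping is recorded in a backend-local hash table together with CurrentResourceOwner, and a release callback sweeps whatever that owner left behind at (sub)transaction end. A minimal, self-contained sketch of the pattern (the names here are illustrative, not part of the patch):

    #include "postgres.h"
    #include "utils/hsearch.h"
    #include "utils/resowner.h"

    typedef struct
    {
        uint32          key;        /* hash key of the tracked resource */
        ResourceOwner   owner;      /* who is responsible for it */
    } trackerEntry;

    static HTAB *tracker_htab = NULL;

    static void
    cleanup_tracked_entries(ResourceReleasePhase phase,
                            bool isCommit, bool isTopLevel, void *arg)
    {
        HASH_SEQ_STATUS seq;
        trackerEntry   *entry;

        if (phase != RESOURCE_RELEASE_AFTER_LOCKS || !tracker_htab)
            return;
        hash_seq_init(&seq, tracker_htab);
        while ((entry = hash_seq_search(&seq)) != NULL)
        {
            if (entry->owner != CurrentResourceOwner)
                continue;           /* owned by an outer level; keep it */
            if (isCommit)
                elog(WARNING, "resource %u leaks, and still alive",
                     entry->key);
            /* release the real resource here, then forget the entry */
            hash_search(tracker_htab, &entry->key, HASH_REMOVE, NULL);
        }
    }

    /* one-time registration, e.g. on the first allocation */
    static void
    tracker_init(void)
    {
        HASHCTL     hctl;

        memset(&hctl, 0, sizeof(HASHCTL));
        hctl.keysize = sizeof(uint32);
        hctl.entrysize = sizeof(trackerEntry);
        tracker_htab = hash_create("tracker_htab", 256, &hctl,
                                   HASH_ELEM | HASH_BLOBS);
        RegisterResourceReleaseCallback(cleanup_tracked_entries, NULL);
    }
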
+/* ---------------------------------------------------------------- + * + * shared memory and mmap/munmap routines + * + * ---------------------------------------------------------------- */ +#define IS_POSIX_SHMEM 0x80000000U +typedef struct +{ + uint32_t shmem_handle; + int shmem_fdesc; + char shmem_name[MAXPGPATH]; + ResourceOwner owner; +} shmemEntry; + typedef struct { void *mmap_addr; @@ -1621,18 +1506,54 @@ typedef struct int mmap_flags; ResourceOwner owner; } mmapEntry; + +static HTAB *shmem_tracker_htab = NULL; static HTAB *mmap_tracker_htab = NULL; +static void +cleanup_shmem_chunks(ResourceReleasePhase phase, + bool isCommit, + bool isTopLevel, + void *arg) +{ + if (phase == RESOURCE_RELEASE_AFTER_LOCKS && + shmem_tracker_htab && + hash_get_num_entries(shmem_tracker_htab) > 0) + { + HASH_SEQ_STATUS seq; + shmemEntry *entry; + + hash_seq_init(&seq, shmem_tracker_htab); + while ((entry = hash_seq_search(&seq)) != NULL) + { + if (entry->owner != CurrentResourceOwner) + continue; + if (isCommit) + elog(WARNING, "shared-memory '%s' leaks, and still alive", + entry->shmem_name); + if (unlink(entry->shmem_name) != 0) + elog(WARNING, "failed on unlink('%s'): %m", entry->shmem_name); + if (close(entry->shmem_fdesc) != 0) + elog(WARNING, "failed on close('%s'): %m", entry->shmem_name); + hash_search(shmem_tracker_htab, + &entry->shmem_handle, + HASH_REMOVE, + NULL); + } + } +} + static void cleanup_mmap_chunks(ResourceReleasePhase phase, bool isCommit, bool isTopLevel, void *arg) { - if (mmap_tracker_htab && + if (phase == RESOURCE_RELEASE_AFTER_LOCKS && + mmap_tracker_htab && hash_get_num_entries(mmap_tracker_htab) > 0) { - HASH_SEQ_STATUS seq; + HASH_SEQ_STATUS seq; mmapEntry *entry; hash_seq_init(&seq, mmap_tracker_htab); @@ -1656,15 +1577,125 @@ cleanup_mmap_chunks(ResourceReleasePhase phase, } } +uint32_t +__shmemCreate(const DpuStorageEntry *ds_entry) +{ + static uint my_random_seed = 0; + const char *shmem_dir = "/dev/shm"; + int fdesc; + uint32_t handle; + char namebuf[MAXPGPATH]; + size_t off = 0; + + if (!shmem_tracker_htab) + { + HASHCTL hctl; + + my_random_seed = (uint)MyProcPid ^ 0xcafebabeU; + + memset(&hctl, 0, sizeof(HASHCTL)); + hctl.keysize = sizeof(uint32_t); + hctl.entrysize = sizeof(shmemEntry); + shmem_tracker_htab = hash_create("shmem_tracker_htab", + 256, + &hctl, + HASH_ELEM | HASH_BLOBS); + RegisterResourceReleaseCallback(cleanup_shmem_chunks, 0); + } + + if (ds_entry) + shmem_dir = DpuStorageEntryBaseDir(ds_entry); + off = snprintf(namebuf, sizeof(namebuf), "%s/", shmem_dir); + do { + handle = rand_r(&my_random_seed); + if (handle == 0) + continue; + /* to avoid hash conflict */ + if (!shmem_dir) + handle |= IS_POSIX_SHMEM; + else + handle &= ~IS_POSIX_SHMEM; + + snprintf(namebuf + off, sizeof(namebuf) - off, + ".pgstrom_shmbuf_%u_%d", + PostPortNumber, handle); + fdesc = open(namebuf, O_RDWR | O_CREAT | O_EXCL, 0600); + if (fdesc < 0 && errno != EEXIST) + elog(ERROR, "failed on open('%s'): %m", namebuf); + } while (fdesc < 0); + + PG_TRY(); + { + shmemEntry *entry; + bool found; + + entry = hash_search(shmem_tracker_htab, + &handle, + HASH_ENTER, + &found); + if (found) + elog(ERROR, "Bug? 
duplicated shmem entry"); + entry->shmem_handle = handle; + entry->shmem_fdesc = fdesc; + strcpy(entry->shmem_name, namebuf); + entry->owner = CurrentResourceOwner; + } + PG_CATCH(); + { + if (close(fdesc) != 0) + elog(WARNING, "failed on close('%s'): %m", namebuf); + if (unlink(namebuf) != 0) + elog(WARNING, "failed on unlink('%s'): %m", namebuf); + PG_RE_THROW(); + } + PG_END_TRY(); + + return handle; +} + +void +__shmemDrop(uint32_t shmem_handle) +{ + if (shmem_tracker_htab) + { + shmemEntry *entry; + + entry = hash_search(shmem_tracker_htab, + &shmem_handle, + HASH_REMOVE, + NULL); + if (entry) + { + if (unlink(entry->shmem_name) != 0) + elog(WARNING, "failed on unlink('%s'): %m", entry->shmem_name); + if (close(entry->shmem_fdesc) != 0) + elog(WARNING, "failed on close('%s'): %m", entry->shmem_name); + return; + } + } + elog(ERROR, "failed on __shmemDrop - no such segment (%u)", shmem_handle); +} + void * -__mmapFile(void *addr, size_t length, - int prot, int flags, int fdesc, off_t offset) +__mmapShmem(uint32_t shmem_handle, + size_t shmem_length, + const DpuStorageEntry *ds_entry) { - void *mmap_addr; - size_t mmap_size = TYPEALIGN(PAGE_SIZE, length); - mmapEntry *entry; + void *mmap_addr = MAP_FAILED; + size_t mmap_size = TYPEALIGN(PAGE_SIZE, shmem_length); + int mmap_prot = PROT_READ | PROT_WRITE; + int mmap_flags = MAP_SHARED; + mmapEntry *mmap_entry = NULL; + shmemEntry *shmem_entry = NULL; + int fdesc = -1; + const char *shmem_dir = "/dev/shm"; + const char *fname = NULL; + struct stat stat_buf; bool found; + char namebuf[MAXPGPATH]; + if (ds_entry) + shmem_dir = DpuStorageEntryBaseDir(ds_entry); if (!mmap_tracker_htab) { HASHCTL hctl; @@ -1672,35 +1703,83 @@ __mmapFile(void *addr, size_t length, memset(&hctl, 0, sizeof(HASHCTL)); hctl.keysize = sizeof(void *); hctl.entrysize = sizeof(mmapEntry); - hctl.hcxt = CacheMemoryContext; mmap_tracker_htab = hash_create("mmap_tracker_htab", 256, &hctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + HASH_ELEM | HASH_BLOBS); RegisterResourceReleaseCallback(cleanup_mmap_chunks, 0); } - mmap_addr = mmap(addr, mmap_size, prot, flags, fdesc, offset); - if (mmap_addr == MAP_FAILED) - return MAP_FAILED; + + if (shmem_tracker_htab) + { + shmem_entry = hash_search(shmem_tracker_htab, + &shmem_handle, + HASH_FIND, + NULL); + if (shmem_entry) + { + size_t len = strlen(shmem_dir); + + if (strncmp(shmem_entry->shmem_name, shmem_dir, len) != 0 || + shmem_entry->shmem_name[len] != '/') + elog(ERROR, "Bug? shmem_dir mismatch '%s'", shmem_dir); + fdesc = shmem_entry->shmem_fdesc; + fname = shmem_entry->shmem_name; + } + } + if (fdesc < 0) + { + snprintf(namebuf, sizeof(namebuf), + "%s/.pgstrom_shmbuf_%u_%d", + shmem_dir, PostPortNumber, shmem_handle); + fdesc = open(namebuf, O_RDWR, 0600); + if (fdesc < 0) + elog(ERROR, "failed on open('%s'): %m", namebuf); + fname = namebuf; + } + PG_TRY(); { - entry = hash_search(mmap_tracker_htab, - &mmap_addr, - HASH_ENTER, - &found); + if (fstat(fdesc, &stat_buf) != 0) + elog(ERROR, "failed on fstat('%s'): %m", fname); + if (stat_buf.st_size < mmap_size) + { + while (fallocate(fdesc, 0, 0, mmap_size) != 0) + { + if (errno != EINTR) + elog(ERROR, "failed on fallocate('%s', %lu): %m", + fname, mmap_size); + } + } + mmap_addr = mmap(NULL, mmap_size, mmap_prot, mmap_flags, fdesc, 0); + if (mmap_addr == MAP_FAILED) + elog(ERROR, "failed on mmap(2): %m"); + + mmap_entry = hash_search(mmap_tracker_htab, + &mmap_addr, + HASH_ENTER, + &found); if (found) elog(ERROR, "Bug? 
duplicated mmap entry"); - Assert(entry->mmap_addr == mmap_addr); - entry->mmap_size = mmap_size; - entry->mmap_prot = prot; - entry->mmap_flags = flags; - entry->owner = CurrentResourceOwner; + Assert(mmap_entry->mmap_addr == mmap_addr); + mmap_entry->mmap_size = mmap_size; + mmap_entry->mmap_prot = mmap_prot; + mmap_entry->mmap_flags = mmap_flags; + mmap_entry->owner = CurrentResourceOwner; + + if (!shmem_entry) + close(fdesc); } PG_CATCH(); { - if (munmap(mmap_addr, mmap_size) != 0) - elog(WARNING, "failed on munmap(%p, %zu): %m", - mmap_addr, mmap_size); + if (mmap_addr != MAP_FAILED) + { + if (munmap(mmap_addr, mmap_size) != 0) + elog(WARNING, "failed on munmap(%p, %zu) of '%s': %m", + mmap_addr, mmap_size, fname); + } + if (!shmem_entry && close(fdesc) != 0) + elog(WARNING, "failed on close('%s'): %m", fname); PG_RE_THROW(); } PG_END_TRY(); @@ -1708,213 +1787,26 @@ __mmapFile(void *addr, size_t length, return mmap_addr; } -int -__munmapFile(void *mmap_addr) +bool +__munmapShmem(void *mmap_addr) { - mmapEntry *entry; - int rv; - if (mmap_tracker_htab) { - entry = hash_search(mmap_tracker_htab, - &mmap_addr, HASH_REMOVE, NULL); + mmapEntry *entry + = hash_search(mmap_tracker_htab, + &mmap_addr, + HASH_REMOVE, + NULL); if (entry) { - rv = munmap(entry->mmap_addr, - entry->mmap_size); - if (rv != 0) - { - int errno_saved = errno; - + if (munmap(entry->mmap_addr, + entry->mmap_size) != 0) elog(WARNING, "failed on munmap(%p, %zu): %m", entry->mmap_addr, entry->mmap_size); - errno = errno_saved; - } - return rv; + return true; } } - /* mmapEntry not found */ - errno = EINVAL; - return -1; -} - -void * -__mremapFile(void *mmap_addr, size_t new_size) -{ - mmapEntry *entry = NULL; - void *addr; - - if (mmap_tracker_htab) - { - entry = hash_search(mmap_tracker_htab, - &mmap_addr, HASH_FIND, NULL); - } - if (!entry) - { - errno = EINVAL; - return MAP_FAILED; - } - /* nothing to do */ - if (new_size <= entry->mmap_size) - return entry->mmap_addr; - addr = mremap(entry->mmap_addr, - entry->mmap_size, - new_size, - MREMAP_MAYMOVE); - if (addr == MAP_FAILED) - return MAP_FAILED; - - entry->mmap_addr = addr; - entry->mmap_size = new_size; - return addr; -} - -/* - * dummy entry for deprecated functions - */ -static void -__pg_deprecated_function(PG_FUNCTION_ARGS, const char *cfunc_name) -{ - FmgrInfo *flinfo = fcinfo->flinfo; - - if (OidIsValid(flinfo->fn_oid)) - elog(ERROR, "'%s' on behalf of %s is already deprecated", - cfunc_name, format_procedure(flinfo->fn_oid)); - elog(ERROR, "'%s' is already deprecated", cfunc_name); + elog(ERROR, "it looks addr=%p not memory-mapped", mmap_addr); + return false; } - -#define PG_DEPRECATED_FUNCTION(cfunc_name) \ - Datum cfunc_name(PG_FUNCTION_ARGS); \ - Datum cfunc_name(PG_FUNCTION_ARGS) \ - { \ - __pg_deprecated_function(fcinfo, __FUNCTION__); \ - PG_RETURN_NULL(); \ - } \ - PG_FUNCTION_INFO_V1(cfunc_name) - -/* deprecated functions */ -/* - * SQL functions for GPU attributes (deprecated) - */ -PG_DEPRECATED_FUNCTION(pgstrom_gpu_device_name); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_global_memsize); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_max_blocksize); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_warp_size); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_max_shared_memory_perblock); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_num_registers_perblock); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_num_multiptocessors); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_num_cuda_cores); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_cc_major); -PG_DEPRECATED_FUNCTION(pgstrom_gpu_cc_minor); 
-PG_DEPRECATED_FUNCTION(pgstrom_gpu_pci_id); - -/* deadcode/gstore_(fdw|buf).c */ -PG_DEPRECATED_FUNCTION(pgstrom_reggstore_in); -PG_DEPRECATED_FUNCTION(pgstrom_reggstore_out); -PG_DEPRECATED_FUNCTION(pgstrom_reggstore_recv); -PG_DEPRECATED_FUNCTION(pgstrom_reggstore_send); - -PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_chunk_info); -PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_format); -PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_nitems); -PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_nattrs); -PG_DEPRECATED_FUNCTION(pgstrom_gstore_fdw_rawsize); -PG_DEPRECATED_FUNCTION(pgstrom_gstore_export_ipchandle); - -/* deadcode/largeobject.c */ -PG_DEPRECATED_FUNCTION(pgstrom_lo_import_gpu); -PG_DEPRECATED_FUNCTION(pgstrom_lo_export_gpu); - -/* deadcode/pl_cuda_v2.c */ -PG_DEPRECATED_FUNCTION(plcuda_function_validator); -PG_DEPRECATED_FUNCTION(plcuda_function_handler); -PG_DEPRECATED_FUNCTION(pgsql_table_attr_numbers_by_names); -PG_DEPRECATED_FUNCTION(pgsql_table_attr_number_by_name); -PG_DEPRECATED_FUNCTION(pgsql_table_attr_types_by_names); -PG_DEPRECATED_FUNCTION(pgsql_table_attr_type_by_name); -PG_DEPRECATED_FUNCTION(pgsql_check_attrs_of_types); -PG_DEPRECATED_FUNCTION(pgsql_check_attrs_of_type); -PG_DEPRECATED_FUNCTION(pgsql_check_attr_of_type); - -/* arrow_fdw.c */ -PG_DEPRECATED_FUNCTION(pgstrom_arrow_fdw_export_cupy); -PG_DEPRECATED_FUNCTION(pgstrom_arrow_fdw_export_cupy_pinned); -PG_DEPRECATED_FUNCTION(pgstrom_arrow_fdw_unpin_gpu_buffer); -PG_DEPRECATED_FUNCTION(pgstrom_arrow_fdw_put_gpu_buffer); - -/* deadcode/matrix.c */ -PG_DEPRECATED_FUNCTION(array_matrix_accum); -PG_DEPRECATED_FUNCTION(array_matrix_accum_varbit); -PG_DEPRECATED_FUNCTION(varbit_to_int4_array); -PG_DEPRECATED_FUNCTION(int4_array_to_varbit); -PG_DEPRECATED_FUNCTION(array_matrix_final_bool); -PG_DEPRECATED_FUNCTION(array_matrix_final_int2); -PG_DEPRECATED_FUNCTION(array_matrix_final_int4); -PG_DEPRECATED_FUNCTION(array_matrix_final_int8); -PG_DEPRECATED_FUNCTION(array_matrix_final_float4); -PG_DEPRECATED_FUNCTION(array_matrix_final_float8); -PG_DEPRECATED_FUNCTION(array_matrix_unnest); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_bool); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_int2); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_int4); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_int8); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_float4); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_float8); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_boolt); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_boolb); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int2t); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int2b); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int4t); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int4b); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int8t); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_int8b); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_float4t); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_float4b); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_float8t); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_scalar_float8b); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_bool); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_int2); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_int4); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_int8); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_float4); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_float8); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_booll); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_boolr); 
-PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int2l); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int2r); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int4l); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int4r); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int8l); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_int8r); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_float4l); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_float4r); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_float8l); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_scalar_float8r); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_accum); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_bool); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_int2); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_int4); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_int8); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_float4); -PG_DEPRECATED_FUNCTION(array_matrix_rbind_final_float8); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_accum); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_bool); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_int2); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_int4); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_int8); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_float4); -PG_DEPRECATED_FUNCTION(array_matrix_cbind_final_float8); -PG_DEPRECATED_FUNCTION(array_matrix_transpose_bool); -PG_DEPRECATED_FUNCTION(array_matrix_transpose_int2); -PG_DEPRECATED_FUNCTION(array_matrix_transpose_int4); -PG_DEPRECATED_FUNCTION(array_matrix_transpose_int8); -PG_DEPRECATED_FUNCTION(array_matrix_transpose_float4); -PG_DEPRECATED_FUNCTION(array_matrix_transpose_float8); -PG_DEPRECATED_FUNCTION(float4_as_int4); /* duplicated, see float2.c */ -PG_DEPRECATED_FUNCTION(int4_as_float4); /* duplicated, see float2.c */ -PG_DEPRECATED_FUNCTION(float8_as_int8); /* duplicated, see float2.c */ -PG_DEPRECATED_FUNCTION(int8_as_float8); /* duplicated, see float2.c */ -PG_DEPRECATED_FUNCTION(array_matrix_validation); -PG_DEPRECATED_FUNCTION(array_matrix_height); -PG_DEPRECATED_FUNCTION(array_matrix_width); diff --git a/next/multirels.c b/src/multirels.c similarity index 100% rename from next/multirels.c rename to src/multirels.c diff --git a/next/pcie.c b/src/pcie.c similarity index 100% rename from next/pcie.c rename to src/pcie.c diff --git a/next/pg_strom.control b/src/pg_strom.control similarity index 100% rename from next/pg_strom.control rename to src/pg_strom.control diff --git a/src/pg_strom.h b/src/pg_strom.h index 739bb3825..7260b8545 100644 --- a/src/pg_strom.h +++ b/src/pg_strom.h @@ -3,8 +3,8 @@ * * Header file of pg_strom module * -- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. 
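
The shared-memory helpers added to misc.c above combine into a simple handle-based lifecycle: __shmemCreate() picks a random 32bit handle and creates the backing file, __mmapShmem() grows it with fallocate(2) and maps it, and the pair __munmapShmem()/__shmemDrop() tears everything down again. A minimal usage sketch, assuming a NULL ds_entry (plain POSIX shared memory under /dev/shm):

    static void
    shmbuf_lifecycle_example(void)
    {
        /* creates /dev/shm/.pgstrom_shmbuf_<port>_<handle> */
        uint32_t    handle = __shmemCreate(NULL);
        /* fallocate + mmap of a 64MB segment */
        void       *addr = __mmapShmem(handle, 64UL << 20, NULL);

        /* ... fill the buffer, hand 'handle' over to the xPU service,
         * which can map the same file by reconstructing its name ... */

        __munmapShmem(addr);        /* unmap; the file itself survives */
        __shmemDrop(handle);        /* unlink(2) + close(2) of the segment */
    }

Both the create and mmap helpers throw ERROR on failure, so the example omits explicit error checks.
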
@@ -13,183 +13,94 @@ #define PG_STROM_H #include "postgres.h" -#if PG_VERSION_NUM < 110000 -#error Base PostgreSQL version must be v11 or later +#if PG_VERSION_NUM < 150000 +#error Base PostgreSQL version must be v15 or later #endif #define PG_MAJOR_VERSION (PG_VERSION_NUM / 100) #define PG_MINOR_VERSION (PG_VERSION_NUM % 100) #include "access/brin.h" -#include "access/brin_revmap.h" -#include "access/generic_xlog.h" -#include "access/gist.h" -#include "access/hash.h" #include "access/heapam.h" -#include "access/heapam_xlog.h" -#if PG_VERSION_NUM >= 130000 -#include "access/heaptoast.h" -#endif -#include "access/htup_details.h" +#include "access/genam.h" #include "access/reloptions.h" #include "access/relscan.h" -#if PG_VERSION_NUM >= 140000 #include "access/syncscan.h" -#endif -#include "access/sysattr.h" -#if PG_VERSION_NUM < 130000 -#include "access/tuptoaster.h" -#endif -#include "access/twophase.h" +#include "access/table.h" +#include "access/tableam.h" #include "access/visibilitymap.h" #include "access/xact.h" -#include "catalog/catalog.h" +#include "catalog/binary_upgrade.h" #include "catalog/dependency.h" -#include "catalog/heap.h" #include "catalog/indexing.h" #include "catalog/namespace.h" #include "catalog/objectaccess.h" -#include "catalog/objectaddress.h" #include "catalog/pg_aggregate.h" #include "catalog/pg_am.h" #include "catalog/pg_amop.h" -#include "catalog/pg_attribute.h" #include "catalog/pg_cast.h" -#include "catalog/pg_class.h" -#include "catalog/pg_database.h" #include "catalog/pg_depend.h" -#include "catalog/pg_extension.h" +#include "catalog/pg_foreign_table.h" #include "catalog/pg_foreign_data_wrapper.h" #include "catalog/pg_foreign_server.h" -#include "catalog/pg_foreign_table.h" -#include "catalog/pg_language.h" +#include "catalog/pg_user_mapping.h" +#include "catalog/pg_extension.h" #include "catalog/pg_namespace.h" -#include "catalog/pg_operator.h" #include "catalog/pg_proc.h" #include "catalog/pg_statistic.h" -#include "catalog/pg_tablespace.h" -#include "catalog/pg_trigger.h" +#include "catalog/pg_tablespace_d.h" #include "catalog/pg_type.h" -#if PG_VERSION_NUM < 110000 -#include "catalog/pg_type_fn.h" -#else -#include "catalog/pg_type_d.h" -#endif -#include "catalog/pg_user_mapping.h" -#include "commands/dbcommands.h" #include "commands/defrem.h" #include "commands/event_trigger.h" -#include "commands/explain.h" #include "commands/extension.h" -#include "commands/proclang.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" -#include "commands/trigger.h" #include "commands/typecmds.h" -#include "commands/variable.h" -#include "common/base64.h" -#if PG_VERSION_NUM >= 130000 #include "common/hashfn.h" -#endif #include "common/int.h" -#include "common/md5.h" -#include "executor/executor.h" -#include "executor/nodeAgg.h" -#include "executor/nodeIndexscan.h" -#include "executor/nodeCustom.h" #include "executor/nodeSubplan.h" -#include "fmgr.h" #include "foreign/fdwapi.h" #include "foreign/foreign.h" #include "funcapi.h" -#include "lib/ilist.h" -#include "lib/stringinfo.h" -#include "libpq/be-fsstubs.h" -#include "libpq/libpq-fs.h" #include "libpq/pqformat.h" -#include "libpq/pqsignal.h" +#include "lib/stringinfo.h" #include "miscadmin.h" -#include "nodes/execnodes.h" #include "nodes/extensible.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" -#include "nodes/pg_list.h" -#include "nodes/plannodes.h" -#include "nodes/primnodes.h" -#include "nodes/readfuncs.h" -#if PG_VERSION_NUM < 120000 -#include "nodes/relation.h" -#endif -#if 
PG_VERSION_NUM >= 120000 -#include "nodes/supportnodes.h" -#endif -#if PG_VERSION_NUM >= 120000 -#include "optimizer/appendinfo.h" -#endif +#include "nodes/pathnodes.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" -#if PG_VERSION_NUM >= 120000 #include "optimizer/optimizer.h" -#endif #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/plancat.h" -#include "optimizer/planmain.h" #include "optimizer/planner.h" -#include "optimizer/prep.h" +#include "optimizer/planmain.h" #include "optimizer/restrictinfo.h" #include "optimizer/tlist.h" -#if PG_VERSION_NUM < 120000 -#include "optimizer/var.h" -#endif -#include "parser/parse_coerce.h" -#include "parser/parsetree.h" #include "parser/parse_func.h" -#include "parser/parse_oper.h" -#include "parser/scansup.h" -#include "pgstat.h" -#include "port/atomics.h" #include "postmaster/bgworker.h" #include "postmaster/postmaster.h" -#include "storage/buf.h" +#include "storage/bufmgr.h" #include "storage/buf_internals.h" #include "storage/ipc.h" -#include "storage/itemptr.h" #include "storage/fd.h" -#include "storage/large_object.h" #include "storage/latch.h" -#include "storage/lmgr.h" -#include "storage/lock.h" -#include "storage/pg_shmem.h" -#include "storage/predicate.h" -#include "storage/proc.h" -#include "storage/procarray.h" +#include "storage/pmsignal.h" #include "storage/shmem.h" #include "storage/smgr.h" -#include "storage/spin.h" -#include "utils/array.h" -#include "utils/arrayaccess.h" #include "utils/builtins.h" -#include "utils/bytea.h" #include "utils/cash.h" #include "utils/catcache.h" #include "utils/date.h" #include "utils/datetime.h" -#if PG_VERSION_NUM >= 120000 #include "utils/float.h" -#endif #include "utils/fmgroids.h" #include "utils/guc.h" -#include "utils/json.h" -#include "utils/jsonb.h" #include "utils/inet.h" -#if PG_VERSION_NUM < 150000 -#include "utils/int8.h" -#endif #include "utils/inval.h" +#include "utils/jsonb.h" #include "utils/lsyscache.h" -#include "utils/memutils.h" -#include "utils/numeric.h" -#include "utils/pg_crc.h" #include "utils/pg_locale.h" #include "utils/rangetypes.h" #include "utils/regproc.h" @@ -197,1742 +108,834 @@ #include "utils/resowner.h" #include "utils/ruleutils.h" #include "utils/selfuncs.h" -#include "utils/snapmgr.h" #include "utils/spccache.h" #include "utils/syscache.h" -#if PG_VERSION_NUM < 120000 -#include "utils/tqual.h" -#endif +#include "utils/timestamp.h" +#include "utils/tuplestore.h" #include "utils/typcache.h" #include "utils/uuid.h" -#include "utils/varbit.h" -#include "utils/varlena.h" - +#include "utils/wait_event.h" +#include #define CUDA_API_PER_THREAD_DEFAULT_STREAM 1 #include -#include -#include -#include -#include +#include +#include #include #include #include -#include -#include +#include +#include +#include #include -#include -#include +#include +#include +#include #include #include +#include #include -#include +#include +#include "xpu_common.h" +#include "pg_utils.h" #include "heterodb_extra.h" -#include "arrow_defs.h" -/* - * -------------------------------------------------------------------- - * - * Configuration sections +/* ------------------------------------------------ * - * NOTE: We uses configuration of the host PostgreSQL system, instead of - * own configure script, not to mismatch prerequisites for module build. - * However, some (possible) configuration will lead unexpected behavior. - * So, we put some checks to prevent unexpected host configurations. 
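
One gotcha in the version macros kept above: PG_MAJOR_VERSION is defined as PG_VERSION_NUM / 100, so it evaluates to 1500 for PostgreSQL 15.2, not 15, and PG_MINOR_VERSION is the patch level. A worked example as compile-time checks (illustrative only):

    /* PostgreSQL 15.2 ships PG_VERSION_NUM = 150002, hence: */
    StaticAssertDecl(150002 / 100 == 1500, "PG_MAJOR_VERSION is 1500, not 15");
    StaticAssertDecl(150002 % 100 == 2, "PG_MINOR_VERSION is the patch level");
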
+ * Global Type Definitions * - * -------------------------------------------------------------------- - */ -#if SIZEOF_DATUM != 8 -#error PG-Strom expects 64bit platform -#endif -#if PG_VERSION_NUM < 130000 -/* - * At PG13, 2e4db241bfd3206bad8286f8ffc2db6bbdaefcdf removed - * '--disable-float4-byval' configure flag, thus, float32 should be - * always passed by value. + * ------------------------------------------------ */ -#ifndef USE_FLOAT4_BYVAL -#error PG-Strom expects float32 is referenced by value, not reference -#endif -#endif /* VER < PG13*/ -#ifndef USE_FLOAT8_BYVAL -#error PG-Strom expexts float64 is referenced by value, not reference -#endif -#ifndef HAVE_INT64_TIMESTAMP -#error PG-Strom expects timestamp has 64bit integer format -#endif -#include "cuda_common.h" -#include "pg_compat.h" - -#define RESTRACK_HASHSIZE 53 -typedef struct GpuContext +typedef struct GpuDevAttributes { - dlist_node chain; - pg_atomic_uint32 refcnt; - ResourceOwner resowner; - /* cuda resources per GpuContext */ - cl_int cuda_dindex; - CUdevice cuda_device; - CUcontext cuda_context; - /* resource management */ - slock_t restrack_lock; - dlist_head restrack[RESTRACK_HASHSIZE]; - /* GPU device memory management */ - pthread_rwlock_t gm_rwlock; - dlist_head gm_normal_list; /* list of device memory segments */ - dlist_head gm_iomap_list; /* list of I/O map memory segments */ - dlist_head gm_managed_list; /* list of managed memory segments */ - dlist_head gm_hostmem_list; /* list of Host memory segments */ - /* error information buffer */ - pg_atomic_uint32 error_level; - int error_code; - const char *error_filename; - int error_lineno; - const char *error_funcname; - char error_message[200]; - /* debug counter */ - pg_atomic_uint64 debug_count1; - pg_atomic_uint64 debug_count2; - pg_atomic_uint64 debug_count3; - pg_atomic_uint64 debug_count4; - /* management of the work-queue */ - bool worker_is_running; - pthread_mutex_t worker_mutex; - pthread_cond_t worker_cond; - pg_atomic_uint32 terminate_workers; - dlist_head pending_tasks; /* list of GpuTask */ - cl_int num_workers; - pg_atomic_uint32 worker_index; - pthread_t worker_threads[FLEXIBLE_ARRAY_MEMBER]; -} GpuContext; - -/* Identifier of the Gpu Programs */ -typedef cl_long ProgramId; -#define INVALID_PROGRAM_ID (-1L) - -/* - * GpuTask and related - */ -typedef enum { - GpuTaskKind_GpuScan, - GpuTaskKind_GpuJoin, - GpuTaskKind_GpuPreAgg, - GpuTaskKind_GpuSort, - GpuTaskKind_PL_CUDA, -} GpuTaskKind; - -typedef struct GpuTask GpuTask; -typedef struct GpuTaskState GpuTaskState; -typedef struct GpuTaskSharedState GpuTaskSharedState; -typedef struct ArrowFdwState ArrowFdwState; -typedef struct GpuCacheState GpuCacheState; - -/* - * GpuTaskState - * - * A common structure of the state machine of GPU related tasks. 
+ int32 NUMA_NODE_ID; + int32 DEV_ID; + char DEV_NAME[256]; + char DEV_UUID[sizeof(CUuuid)]; + size_t DEV_TOTAL_MEMSZ; + size_t DEV_BAR1_MEMSZ; + bool DEV_SUPPORT_GPUDIRECTSQL; +#define DEV_ATTR(LABEL,DESC) \ + int32 LABEL; +#include "gpu_devattrs.h" +#undef DEV_ATTR +} GpuDevAttributes; + +extern GpuDevAttributes *gpuDevAttrs; +extern int numGpuDevAttrs; +#define GPUKERNEL_MAX_SM_MULTIPLICITY 4 + +/* + * devtype/devfunc/devcast definitions + */ +struct devtype_info; +struct devfunc_info; +struct devcast_info; + +typedef uint32_t (*devtype_hashfunc_f)(bool isnull, Datum value); + +typedef struct devtype_info +{ + uint32_t hash; + TypeOpCode type_code; + Oid type_oid; + uint64_t type_flags; + int16 type_length; + int16 type_align; + bool type_byval; + bool type_is_negative; + const char *type_name; + const char *type_extension; + int type_sizeof; + int type_alignof; + devtype_hashfunc_f type_hashfunc; + /* oid of type related functions */ + Oid type_eqfunc; + Oid type_cmpfunc; + /* alias type, if any */ + struct devtype_info *type_alias; + /* element type of array, if type is array */ + struct devtype_info *type_element; + /* attribute of sub-fields, if type is composite */ + int comp_nfields; + struct devtype_info *comp_subtypes[1]; +} devtype_info; + +typedef struct devfunc_info +{ + dlist_node chain; + uint32_t hash; + FuncOpCode func_code; + const char *func_extension; + const char *func_name; + Oid func_oid; + struct devtype_info *func_rettype; + uint64_t func_flags; + int func_cost; + bool func_is_negative; + int func_nargs; + struct devtype_info *func_argtypes[1]; +} devfunc_info; + +typedef struct XpuConnection XpuConnection; +typedef struct GpuCacheState GpuCacheState; +typedef struct DpuStorageEntry DpuStorageEntry; +typedef struct ArrowFdwState ArrowFdwState; +typedef struct BrinIndexState BrinIndexState; + +/* + * pgstromPlanInfo */ -struct NVMEScanState; -struct GpuTaskSharedState; - -struct GpuTaskState +typedef struct { - CustomScanState css; - GpuContext *gcontext; - GpuTaskKind task_kind; /* one of GpuTaskKind_* */ - ProgramId program_id; /* CUDA Program (to be acquired) */ - CUmodule cuda_module; /* CUDA binary module */ - CUdeviceptr kern_params; /* Const/Param buffer */ - List *used_params; /* Const/Param expressions */ - const Bitmapset *optimal_gpus; /* GPUs preference on plan time */ - bool scan_done; /* True, if no more rows to read */ - - /* fields for outer scan */ - Cost outer_startup_cost; /* copy from the outer path node */ - Cost outer_total_cost; /* copy from the outer path node */ - double outer_plan_rows; /* copy from the outer path node */ - int outer_plan_width; /* copy from the outer path node */ - cl_uint outer_nrows_per_block; - Bitmapset *outer_refs; /* referenced outer attributes */ - Instrumentation outer_instrument; /* runtime statistics, if any */ - TupleTableSlot *scan_overflow; /* temporary buffer, if no space on PDS */ - /* BRIN index support on outer relation, if any */ - struct pgstromIndexState *outer_index_state; - Bitmapset *outer_index_map; - - IndexScanDesc outer_brin_index; /* brin index of outer scan, if any */ - long outer_brin_count; /* # of blocks skipped by index */ - - ArrowFdwState *af_state; /* for GpuTask on Arrow_Fdw */ - GpuCacheState *gc_state; /* for GpuTask on GpuCache */ - - /* - * A state object for NVMe-Strom. If not NULL, GTS prefers BLOCK format - * as source data store. Then, SSD2GPU Direct SQL Execution will be kicked. 
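
The GpuDevAttributes struct above fills the bulk of its members through an x-macro: each DEV_ATTR(LABEL,DESC) line in gpu_devattrs.h expands to an int32 field here, and the same header can be re-included with a different DEV_ATTR definition wherever a matching table of names or descriptions is needed. A minimal sketch of the technique (the attribute names are illustrative; gpu_devattrs.h itself is not shown in this patch):

    /* hypothetical contents of gpu_devattrs.h */
    DEV_ATTR(MAX_THREADS_PER_BLOCK, "Maximum number of threads per block")
    DEV_ATTR(WARP_SIZE, "Warp size in threads")

    /* expansion site #1: struct members, as in GpuDevAttributes */
    #define DEV_ATTR(LABEL,DESC)    int32 LABEL;
    #include "gpu_devattrs.h"
    #undef DEV_ATTR

    /* expansion site #2: a printable catalog of the same attributes */
    static const char *dev_attr_descs[] = {
    #define DEV_ATTR(LABEL,DESC)    DESC,
    #include "gpu_devattrs.h"
    #undef DEV_ATTR
    };

Keeping the attribute list in a single header guarantees the struct layout and any derived tables cannot drift apart.
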
- */ - struct NVMEScanState *nvme_sstate; - long nvme_count; /* # of blocks loaded by SSD2GPU */ - - /* - * fields to fetch rows from the current task - * - * NOTE: @curr_index is sufficient to point a particular row of KDS, - * if format is ROW, HASH and SLOT. However, BLOCK format has no direct - * pointer for each rows. It contains @nitems blocks and individual block - * contains uncertain number of rows. So, at BLOCK format, @curr_index - * is index of the current block, and @curr_lp_index is also index of - * the current line pointer. - * For all format, @curr_index == @nitems means no rows any more. - */ - cl_long curr_index; /* current position on the curr_task */ - cl_long curr_lp_index; /* index of LinePointer in a block */ - HeapTupleData curr_tuple; /* internal use of PDS_fetch() */ - struct GpuTask *curr_task; /* a GpuTask currently processed */ + JoinType join_type; /* one of JOIN_* */ + double join_nrows; /* estimated nrows in this depth */ + List *hash_outer_keys;/* hash-keys for outer-side */ + List *hash_outer_keys_fallback; + List *hash_inner_keys;/* hash-keys for inner-side */ + List *hash_inner_keys_fallback; + List *join_quals; /* join quals */ + List *join_quals_fallback; + List *other_quals; /* other quals */ + List *other_quals_fallback; + Oid gist_index_oid; /* GiST index oid */ + AttrNumber gist_index_col; /* GiST index column number */ + Node *gist_clause; /* GiST index clause */ + Selectivity gist_selectivity; /* GiST selectivity */ +} pgstromPlanInnerInfo; - /* callbacks used by gputasks.c */ - GpuTask *(*cb_next_task)(GpuTaskState *gts); - GpuTask *(*cb_terminator_task)(GpuTaskState *gts, - cl_bool *task_is_ready); - void (*cb_switch_task)(GpuTaskState *gts, GpuTask *gtask); - TupleTableSlot *(*cb_next_tuple)(GpuTaskState *gts); - int (*cb_process_task)(GpuTask *gtask, - CUmodule cuda_module); - void (*cb_release_task)(GpuTask *gtask); - /* list of GpuTasks (protexted with GpuContext->mutex) */ - dlist_head ready_tasks; /* list of tasks already processed */ - cl_uint num_running_tasks; /* # of running tasks */ - cl_uint num_ready_tasks; /* # of ready tasks */ - - /* misc fields */ - cl_long num_cpu_fallbacks; /* # of CPU fallback chunks */ - uint64 debug_counter0; - uint64 debug_counter1; - uint64 debug_counter2; - uint64 debug_counter3; - - /* co-operation with CPU parallel */ - GpuTaskSharedState *gtss; /* DSM segment of GTS if any */ - ParallelContext *pcxt; /* Parallel context of PostgreSQL */ -}; - -/* - * GpuTaskSharedState - */ -struct GpuTaskSharedState +typedef struct { - /* for arrow_fdw file scan */ - pg_atomic_uint32 af_rbatch_index; - pg_atomic_uint32 af_rbatch_nload; /* # of loaded record-batches */ - pg_atomic_uint32 af_rbatch_nskip; /* # of skipped record-batches */ - /* for gpu_cache file scan */ - pg_atomic_uint32 gc_fetch_count; - /* for block-based regular table scan */ - BlockNumber pbs_nblocks; /* # blocks in relation at start of scan */ - slock_t pbs_mutex; /* lock of the fields below */ - BlockNumber pbs_startblock; /* starting block number */ - BlockNumber pbs_nallocated; /* # of blocks allocated to workers */ - - /* common parallel table scan descriptor */ - ParallelTableScanDescData phscan; -}; - -/* - * GpuTaskRuntimeStat - common statistics + uint32_t task_kind; /* one of TASK_KIND__* */ + const Bitmapset *gpu_cache_devs; /* device for GpuCache, if any */ + const Bitmapset *gpu_direct_devs; /* device for GPU-Direct SQL, if any */ + const DpuStorageEntry *ds_entry; /* target DPU if DpuJoin */ + /* Plan information */ + const 
Bitmapset *outer_refs; /* referenced columns */ + List *used_params; /* param list in use */ + List *host_quals; /* host qualifiers to scan the outer */ + Index scan_relid; /* relid of the outer relation to scan */ + List *scan_quals; /* device qualifiers to scan the outer */ + List *scan_quals_fallback;/* 'scan_quals' for CPU fallback */ + double scan_tuples; /* copy of baserel->tuples */ + double scan_rows; /* copy of baserel->rows */ + double parallel_divisor; /* parallel divisor */ + Cost final_cost; /* cost for sendback and host-side tasks */ + /* BRIN-index support */ + Oid brin_index_oid; /* OID of BRIN-index, if any */ + List *brin_index_conds; /* BRIN-index key conditions */ + List *brin_index_quals; /* Original BRIN-index qualifier */ + /* XPU code for JOIN */ + bytea *kexp_scan_kvars_load; /* VarLoads at depth=0 */ + bytea *kexp_scan_quals; + bytea *kexp_join_kvars_load_packed; /* VarLoads at depth>0 */ + bytea *kexp_join_quals_packed; + bytea *kexp_hash_keys_packed; + bytea *kexp_gist_quals_packed; + bytea *kexp_projection; + bytea *kexp_groupby_keyhash; + bytea *kexp_groupby_keyload; + bytea *kexp_groupby_keycomp; + bytea *kexp_groupby_actions; + List *kvars_depth; + List *kvars_resno; + List *kvars_types; /* type-oid, if it needs extra buffer on kvars-slot */ + List *kvars_exprs; + uint32_t extra_flags; + uint32_t extra_bufsz; + /* fallback projection */ + List *fallback_tlist; /* fallback_slot -> custom_scan_tlist if JOIN/PREAGG */ + /* group-by parameters */ + List *groupby_actions; /* list of KAGG_ACTION__* on the kds_final */ + List *groupby_keys; /* resno of grouping keys, if GROUP BY exists */ + /* inner relations */ + int num_rels; + pgstromPlanInnerInfo inners[FLEXIBLE_ARRAY_MEMBER]; +} pgstromPlanInfo; + +/* + * pgstromSharedState */ typedef struct { - slock_t lock; - Instrumentation outer_instrument; - pg_atomic_uint64 source_nitems; - pg_atomic_uint64 nitems_filtered; - pg_atomic_uint64 nvme_count; - pg_atomic_uint64 brin_count; - pg_atomic_uint64 fallback_count; - /* debug counter */ - pg_atomic_uint64 debug_counter0; - pg_atomic_uint64 debug_counter1; - pg_atomic_uint64 debug_counter2; - pg_atomic_uint64 debug_counter3; -} GpuTaskRuntimeStat; + pg_atomic_uint64 inner_nitems; + pg_atomic_uint64 inner_usage; +} pgstromSharedInnerState; -static inline void -mergeGpuTaskRuntimeStatParallelWorker(GpuTaskState *gts, - GpuTaskRuntimeStat *gt_rtstat) -{ - Assert(IsParallelWorker()); - if (!gt_rtstat) - return; - SpinLockAcquire(>_rtstat->lock); - InstrAggNode(>_rtstat->outer_instrument, - >s->outer_instrument); - SpinLockRelease(>_rtstat->lock); - pg_atomic_add_fetch_u64(>_rtstat->nvme_count, gts->nvme_count); - pg_atomic_add_fetch_u64(>_rtstat->brin_count, gts->outer_brin_count); - pg_atomic_add_fetch_u64(>_rtstat->fallback_count, - gts->num_cpu_fallbacks); - /* debug counter */ - if (gts->debug_counter0 != 0) - pg_atomic_add_fetch_u64(>_rtstat->debug_counter0, gts->debug_counter0); - if (gts->debug_counter1 != 0) - pg_atomic_add_fetch_u64(>_rtstat->debug_counter1, gts->debug_counter1); - if (gts->debug_counter2 != 0) - pg_atomic_add_fetch_u64(>_rtstat->debug_counter2, gts->debug_counter2); - if (gts->debug_counter3 != 0) - pg_atomic_add_fetch_u64(>_rtstat->debug_counter3, gts->debug_counter3); -} - -static inline void -mergeGpuTaskRuntimeStat(GpuTaskState *gts, - GpuTaskRuntimeStat *gt_rtstat) +typedef struct { - InstrAggNode(>s->outer_instrument, - >_rtstat->outer_instrument); - gts->outer_instrument.tuplecount = (double) - pg_atomic_read_u64(>_rtstat->source_nitems); 
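
Worth noting the design shift in this hunk: the old GpuTaskRuntimeStat being deleted here aggregated per-worker statistics under a spinlock, whereas the new pgstromSharedState exposes plain atomic counters that any worker can bump in place. A minimal sketch of the new-style update path (the function and its counts are stand-ins, not code from the patch):

    /* called by a parallel worker after it finishes a chunk */
    static void
    update_scan_stats(pgstromSharedState *ps_state,
                      uint64 ntuples, uint64 nvalids)
    {
        /* lock-free; safe for any number of concurrent workers */
        pg_atomic_fetch_add_u64(&ps_state->source_ntuples, ntuples);
        pg_atomic_fetch_add_u64(&ps_state->source_nvalids, nvalids);
    }

    /* the leader later reads them back, e.g. for EXPLAIN ANALYZE:
     *     pg_atomic_read_u64(&ps_state->source_ntuples)
     */
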
- gts->outer_instrument.nfiltered1 = (double) - pg_atomic_read_u64(>_rtstat->nitems_filtered); - gts->nvme_count += pg_atomic_read_u64(>_rtstat->nvme_count); - gts->outer_brin_count += pg_atomic_read_u64(>_rtstat->brin_count); - gts->num_cpu_fallbacks += pg_atomic_read_u64(>_rtstat->fallback_count); - - gts->debug_counter0 += pg_atomic_read_u64(>_rtstat->debug_counter0); - gts->debug_counter1 += pg_atomic_read_u64(>_rtstat->debug_counter1); - gts->debug_counter2 += pg_atomic_read_u64(>_rtstat->debug_counter2); - gts->debug_counter3 += pg_atomic_read_u64(>_rtstat->debug_counter3); - - if (gts->css.ss.ps.instrument) - memcpy(>s->css.ss.ps.instrument->bufusage, - >s->outer_instrument.bufusage, - sizeof(BufferUsage)); -} + dsm_handle ss_handle; /* DSM handle of the SharedState */ + uint32_t ss_length; /* length of the SharedState */ + /* pg-strom's unique plan-id */ + uint64_t query_plan_id; + /* control variables to detect the last plan-node at parallel execution */ + pg_atomic_uint32 scan_task_control; + slock_t __rjoin_control_lock; + /* statistics */ + pg_atomic_uint64 source_ntuples; + pg_atomic_uint64 source_nvalids; + pg_atomic_uint32 source_nblocks; /* only KDS_FORMAT_BLOCK */ + /* for arrow_fdw */ + pg_atomic_uint32 arrow_rbatch_index; + pg_atomic_uint32 arrow_rbatch_nload; /* # of loaded record-batches */ + pg_atomic_uint32 arrow_rbatch_nskip; /* # of skipped record-batches */ + /* for gpu-cache */ + pg_atomic_uint32 gcache_fetch_count; + /* for gpu/dpu-direct */ + pg_atomic_uint32 heap_normal_nblocks; + pg_atomic_uint32 heap_direct_nblocks; + pg_atomic_uint32 heap_fallback_nblocks; + /* for brin-index */ + pg_atomic_uint32 brin_index_fetched; + pg_atomic_uint32 brin_index_skipped; + /* for join-inner-preload */ + ConditionVariable preload_cond; /* sync object */ + slock_t preload_mutex; /* mutex for inner-preloading */ + int preload_phase; /* one of INNER_PHASE__* in gpu_join.c */ + int preload_nr_scanning;/* # of scanning process */ + int preload_nr_setup; /* # of setup process */ + uint32_t preload_shmem_handle; /* host buffer handle */ + uint64_t preload_shmem_length; /* host buffer length */ + /* for join-inner relations */ + uint32_t num_rels; /* if xPU-JOIN involved */ + pgstromSharedInnerState inners[FLEXIBLE_ARRAY_MEMBER]; + /* + * MEMO: ...and ParallelBlockTableScanDescData should be allocated + * next to the inners[nmum_rels] array + */ +} pgstromSharedState; -/* - * GpuTask - * - * It is a unit of task to be sent GPU server. Thus, this object must be - * allocated on the DMA buffer area. 
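
The callback slots at the tail of pgstromTaskState above (cb_next_tuple, cb_next_chunk, cb_final_chunk, cb_cpu_fallback) are what let a single executor loop serve GpuScan, GpuJoin, the DPU variants and Arrow scans alike. A rough sketch of how a fetch loop might drive them; this is purely illustrative, not the actual executor.c logic (which is renamed unchanged elsewhere in this patch):

    /* illustrative only */
    static TupleTableSlot *
    fetch_next_tuple(pgstromTaskState *pts)
    {
        TupleTableSlot *slot;

        while ((slot = pts->cb_next_tuple(pts)) == NULL)
        {
            struct iovec    xcmd_iov[10];
            int             xcmd_iovcnt;
            XpuCommand     *xcmd;

            if (pts->scan_done)
                return NULL;    /* no more chunks to request */
            /* build the next request command for the xPU service */
            xcmd = pts->cb_next_chunk(pts, xcmd_iov, &xcmd_iovcnt);
            if (!xcmd)
                pts->scan_done = true;
            /* ... submit over pts->conn, then unpack pts->curr_resp ... */
        }
        return slot;
    }
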
- */ -struct GpuTask +typedef struct { - kern_errorbuf kerror; /* error status of the task */ - dlist_node chain; /* link to the task state list */ - GpuTaskKind task_kind; /* same with GTS's one */ - ProgramId program_id; /* same with GTS's one */ - GpuTaskState *gts; /* GTS reference in the backend */ - bool cpu_fallback; /* true, if task needs CPU fallback */ + PlanState *ps; + ExprContext *econtext; + /* + * inner preload buffer + */ + List *preload_tuples; + List *preload_hashes; /* if hash-join or gist-join */ + size_t preload_usage; + /* + * join properties (common) + */ + int depth; + JoinType join_type; + ExprState *join_quals; + ExprState *other_quals; + /* + * join properties (hash-join) + */ + List *hash_outer_keys; /* list of ExprState */ + List *hash_inner_keys; /* list of ExprState */ + List *hash_outer_dtypes; /* list of devtype_info */ + List *hash_inner_dtypes; /* list of devtype_info */ + /* + * join properties (gist-join) + */ + Relation gist_irel; + ExprState *gist_clause; +} pgstromTaskInnerState; + +struct pgstromTaskState +{ + CustomScanState css; + uint32_t task_kind; /* one of TASK_KIND__* */ + const Bitmapset *optimal_gpus; /* candidate GPUs to connect */ + const DpuStorageEntry *ds_entry; /* candidate DPUs to connect */ + XpuConnection *conn; + pgstromSharedState *ps_state; /* on the shared-memory segment */ + pgstromPlanInfo *pp_info; + GpuCacheState *gcache_state; + ArrowFdwState *arrow_state; + BrinIndexState *br_state; + kern_multirels *h_kmrels; /* host inner buffer (if JOIN) */ + const char *kds_pathname; /* pathname to be used for KDS setup */ + /* current chunk (already processed by the device) */ + XpuCommand *curr_resp; + HeapTupleData curr_htup; + kern_data_store *curr_kds; + int curr_chunk; + int64_t curr_index; + bool scan_done; + bool final_done; + /* control variables to handle right outer join */ + slock_t *rjoin_control_lock; + int *rjoin_control_array; /* per xPU device */ + /* base relation scan, if any */ + TupleTableSlot *base_slot; + ExprState *base_quals; /* equivalent to device quals */ + /* CPU fallback support */ + off_t *fallback_tuples; + size_t fallback_index; + size_t fallback_nitems; + size_t fallback_nrooms; + size_t fallback_usage; + size_t fallback_bufsz; + char *fallback_buffer; + TupleTableSlot *fallback_slot; /* host-side kvars-slot */ + ProjectionInfo *fallback_proj; /* base or fallback slot -> custom_tlist */ + /* request command buffer (+ status for table scan) */ + TBMIterateResult *curr_tbm; + Buffer curr_vm_buffer; /* for visibility-map */ + BlockNumber curr_block_num; /* for KDS_FORMAT_BLOCK */ + BlockNumber curr_block_tail; /* for KDS_FORMAT_BLOCK */ + StringInfoData xcmd_buf; + /* callbacks */ + TupleTableSlot *(*cb_next_tuple)(struct pgstromTaskState *pts); + XpuCommand *(*cb_next_chunk)(struct pgstromTaskState *pts, + struct iovec *xcmd_iov, int *xcmd_iovcnt); + XpuCommand *(*cb_final_chunk)(struct pgstromTaskState *pts, + kern_final_task *fin, + struct iovec *xcmd_iov, int *xcmd_iovcnt); + void (*cb_cpu_fallback)(struct pgstromTaskState *pts, + struct kern_data_store *kds, + HeapTuple htuple); + /* inner relations state (if JOIN) */ + int num_rels; + pgstromTaskInnerState inners[FLEXIBLE_ARRAY_MEMBER]; }; +typedef struct pgstromTaskState pgstromTaskState; /* - * State structure of NVMe-Strom per GpuTaskState - */ -typedef struct NVMEScanState -{ - cl_uint nrows_per_block; - cl_uint nblocks_per_chunk; - BlockNumber curr_segno; - Buffer curr_vmbuffer; - BlockNumber nr_segs; - GPUDirectFileDesc 
files[FLEXIBLE_ARRAY_MEMBER]; -} NVMEScanState; - -/* - * pgstrom_data_store - a data structure with various formats to exchange - * a data chunk between the host and CUDA server. + * Global variables */ -typedef struct pgstrom_data_store -{ - /* GpuContext which owns this data store */ - GpuContext *gcontext; - - /* reference counter */ - pg_atomic_uint32 refcnt; - - /* - * NOTE: Extra information for KDS_FORMAT_BLOCK. - * @nblocks_uncached is the number of PostgreSQL blocks to be processed - * by NVMe-Strom. If @nblocks_uncached > 0, the tail of PDS shall be - * filled up by an array of strom_dma_chunk. - * @filedesc is the file-descriptor of the underlying blocks. - * - * NOTE: Extra information for KDS_FORMAT_ARROW - * @iovec introduces pairs of destination offset, file offset and - * chunk length to be read (usually by SSD-to-GPU Direct SQL). - * If NULL, KDS is preliminarily loaded by CPU and filesystem, and - * PDS is also allocated on managed memory area. So, workers don't - * need to kick DMA operations explicitly. - * - * NOTE: Extra information for KDS_FORMAT_COLUMN - * @gc_sstate points to the GpuCacheShareState to reference the IPC handle - * of the main/extra buffer on the device. This IPC handle is only - * valid under the read lock. - */ - cl_uint nblocks_uncached; /* for KDS_FORMAT_BLOCK */ - GPUDirectFileDesc filedesc; - strom_io_vector *iovec; /* for KDS_FORMAT_ARROW */ - /* for KDS_FORMAT_COLUMN */ - void *gc_sstate; - CUdeviceptr m_kds_main; - CUdeviceptr m_kds_extra; - /* data chunk in kernel portion */ - kern_data_store kds __attribute__ ((aligned (STROMALIGN_LEN))); -} pgstrom_data_store; - -/* -------------------------------------------------------------------- - * - * PG-Strom GUC variables - * - * -------------------------------------------------------------------- */ -extern bool pgstrom_enabled; -extern bool pgstrom_bulkexec_enabled; -extern bool pgstrom_cpu_fallback_enabled; -extern bool pgstrom_regression_test_mode; -extern int pgstrom_max_async_tasks; -extern double pgstrom_gpu_setup_cost; -extern double pgstrom_gpu_dma_cost; -extern double pgstrom_gpu_operator_cost; -extern Size pgstrom_chunk_size(void); extern long PAGE_SIZE; extern long PAGE_MASK; extern int PAGE_SHIFT; extern long PHYS_PAGES; -#define PAGE_ALIGN(sz) TYPEALIGN(PAGE_SIZE,(sz)) - -/* -------------------------------------------------------------------- - * - * Function Declarations - * - * -------------------------------------------------------------------- */ - -/* - * gpu_device.c - */ -typedef struct DevAttributes -{ - cl_int NUMA_NODE_ID; - cl_int DEV_ID; - char DEV_NAME[256]; - char DEV_BRAND[16]; - char DEV_UUID[48]; - size_t DEV_TOTAL_MEMSZ; - size_t DEV_BAR1_MEMSZ; - bool DEV_SUPPORT_GPUDIRECTSQL; -#define DEV_ATTR(LABEL,a,b,c) \ - cl_int LABEL; -#include "device_attrs.h" -#undef DEV_ATTR -} DevAttributes; - -extern DevAttributes *devAttrs; -extern cl_int numDevAttrs; -extern cl_uint devBaselineMaxThreadsPerBlock; -#define cpu_only_mode() (numDevAttrs == 0) -extern void pgstrom_init_gpu_device(void); - -#define GPUKERNEL_MAX_SM_MULTIPLICITY 4 - -extern CUresult gpuOccupancyMaxPotentialBlockSize(int *p_min_grid_sz, - int *p_max_block_sz, - CUfunction kern_function, - size_t dyn_shmem_per_block, - size_t dyn_shmem_per_thread); -extern CUresult gpuOptimalBlockSize(int *p_grid_sz, - int *p_block_sz, - CUfunction kern_function, - CUdevice cuda_device, - size_t dyn_shmem_per_block, - size_t dyn_shmem_per_thread); -extern CUresult __gpuOptimalBlockSize(int *p_grid_sz, - int *p_block_sz, - CUfunction
kern_function, - int cuda_dindex, - size_t dyn_shmem_per_block, - size_t dyn_shmem_per_thread); -/* - * shmbuf.c - */ -extern void *shmbufAlloc(size_t sz); -extern void *shmbufAllocZero(size_t sz); -extern void shmbufFree(void *addr); -extern void pgstrom_init_shmbuf(void); -extern MemoryContext TopSharedMemoryContext; +extern long PAGES_PER_BLOCK; /* (BLCKSZ / PAGE_SIZE) */ +#define PAGE_ALIGN(x) TYPEALIGN(PAGE_SIZE,(x)) +#define PGSTROM_CHUNK_SIZE ((size_t)(65534UL << 10)) /* - * gpu_mmgr.c - */ -extern CUresult __gpuMemAllocRaw(GpuContext *gcontext, - CUdeviceptr *p_devptr, - size_t bytesize, - const char *filename, int lineno); -extern CUresult __gpuMemAllocManagedRaw(GpuContext *gcontext, - CUdeviceptr *p_devptr, - size_t bytesize, - int flags, - const char *filename, int lineno); -extern CUresult __gpuMemAllocHostRaw(GpuContext *gcontext, - void **p_hostptr, - size_t bytesize, - const char *filename, int lineno); -extern CUresult __gpuMemAllocDev(GpuContext *gcontext, - CUdeviceptr *p_deviceptr, - size_t bytesize, - CUipcMemHandle *p_mhandle, - const char *filename, int lineno); -extern CUresult __gpuMemAlloc(GpuContext *gcontext, - CUdeviceptr *p_devptr, - size_t bytesize, - const char *filename, int lineno); -extern CUresult __gpuMemAllocManaged(GpuContext *gcontext, - CUdeviceptr *p_devptr, - size_t bytesize, - int flags, - const char *filename, int lineno); -extern CUresult __gpuMemAllocIOMap(GpuContext *gcontext, - CUdeviceptr *p_devptr, - size_t bytesize, - const char *filename, int lineno); -extern size_t gpuMemAllocIOMapMaxLength(void); -extern CUresult __gpuMemAllocHost(GpuContext *gcontext, - void **p_hostptr, - size_t bytesize, - const char *filename, int lineno); -extern CUresult __gpuMemAllocPreserved(cl_int cuda_dindex, - CUipcMemHandle *ipc_mhandle, - ssize_t bytesize, - const char *filename, int lineno); -extern CUresult __gpuIpcOpenMemHandle(GpuContext *gcontext, - CUdeviceptr *p_deviceptr, - CUipcMemHandle m_handle, - unsigned int flags, - const char *filename, int lineno); -extern CUresult gpuMemFree(GpuContext *gcontext, - CUdeviceptr devptr); -extern CUresult gpuMemFreeHost(GpuContext *gcontext, - void *hostptr); -extern CUresult gpuMemFreePreserved(cl_int cuda_dindex, - CUipcMemHandle m_handle); -extern CUresult gpuIpcCloseMemHandle(GpuContext *gcontext, - CUdeviceptr m_deviceptr); - -#define gpuMemAllocRaw(a,b,c) \ - __gpuMemAllocRaw((a),(b),(c),__FILE__,__LINE__) -#define gpuMemAllocManagedRaw(a,b,c,d) \ - __gpuMemAllocManagedRaw((a),(b),(c),(d),__FILE__,__LINE__) -#define gpuMemAllocHostRaw(a,b,c) \ - __gpuMemAllocHostRaw((a),(b),(c),__FILE__,__LINE__) -#define gpuMemAllocDev(a,b,c,d) \ - __gpuMemAllocDev((a),(b),(c),(d),__FILE__,__LINE__) -#define gpuMemAlloc(a,b,c) \ - __gpuMemAlloc((a),(b),(c),__FILE__,__LINE__) -#define gpuMemAllocManaged(a,b,c,d) \ - __gpuMemAllocManaged((a),(b),(c),(d),__FILE__,__LINE__) -#define gpuMemAllocIOMap(a,b,c) \ - __gpuMemAllocIOMap((a),(b),(c),__FILE__,__LINE__) -#define gpuMemAllocHost(a,b,c) \ - __gpuMemAllocHost((a),(b),(c),__FILE__,__LINE__) -#define gpuMemAllocPreserved(a,b,c) \ - __gpuMemAllocPreserved((a),(b),(c),__FILE__,__LINE__) -#define gpuIpcOpenMemHandle(a,b,c,d) \ - __gpuIpcOpenMemHandle((a),(b),(c),(d),__FILE__,__LINE__) - -extern void gpuMemReclaimSegment(GpuContext *gcontext); - -extern void gpuMemCopyFromSSD(CUdeviceptr m_kds, pgstrom_data_store *pds); - -extern void pgstrom_gpu_mmgr_init_gpucontext(GpuContext *gcontext); -extern void pgstrom_gpu_mmgr_cleanup_gpucontext(GpuContext *gcontext); -extern void 
pgstrom_init_gpu_mmgr(void); - -/* - * gpu_context.c - */ -extern int pgstrom_max_async_tasks; /* GUC */ -extern __thread GpuContext *GpuWorkerCurrentContext; -extern __thread sigjmp_buf *GpuWorkerExceptionStack; -extern __thread int GpuWorkerIndex; -#define CU_CONTEXT_PER_THREAD \ - (GpuWorkerCurrentContext->cuda_context) -#define CU_DEVICE_PER_THREAD \ - (GpuWorkerCurrentContext->cuda_device) -#define CU_DINDEX_PER_THREAD \ - (GpuWorkerCurrentContext->cuda_dindex) - -extern __thread CUevent CU_EVENT_PER_THREAD; - -extern void GpuContextWorkerReportError(int elevel, - int errcode, - const char *__filename, int lineno, - const char *funcname, - const char *fmt, ...) - pg_attribute_printf(6,7); - -static inline void -CHECK_FOR_GPUCONTEXT(GpuContext *gcontext) -{ - uint32 error_level = pg_atomic_read_u32(&gcontext->error_level); - /* - * NOTE: The least bit of the error_level is a flag to indicate - * whether the error information is ready or not. - */ - if (error_level >= 2 * ERROR) - { - while ((error_level & 1) != 0) - { - pg_usleep(1000L); - error_level = pg_atomic_read_u32(&gcontext->error_level); - } - ereport(error_level / 2, - (errcode(gcontext->error_code), - errmsg("%s", gcontext->error_message), - (pgstrom_regression_test_mode ? 0 : - errdetail("GPU kernel location: %s:%d [%s]", - gcontext->error_filename, - gcontext->error_lineno, - gcontext->error_funcname)))); - } - CHECK_FOR_INTERRUPTS(); -} -extern CUresult gpuInit(unsigned int flags); -extern GpuContext *AllocGpuContext(const Bitmapset *optimal_gpus, - bool activate_context, - bool activate_workers); -extern void ActivateGpuContext(GpuContext *gcontext); -extern void ActivateGpuContextNoWorkers(GpuContext *gcontext); -extern GpuContext *GetGpuContext(GpuContext *gcontext); -extern void PutGpuContext(GpuContext *gcontext); -extern void SynchronizeGpuContext(GpuContext *gcontext); -extern void SynchronizeGpuContextOnDSMDetach(dsm_segment *seg, Datum arg); - -#define GPUMEM_DEVICE_RAW_EXTRA ((void *)(~0L)) -#define GPUMEM_HOST_RAW_EXTRA ((void *)(~1L)) - -extern bool trackCudaProgram(GpuContext *gcontext, ProgramId program_id, - const char *filename, int lineno); -extern void untrackCudaProgram(GpuContext *gcontext, ProgramId program_id); -extern bool trackGpuMem(GpuContext *gcontext, CUdeviceptr devptr, void *extra, - const char *filename, int lineno); -extern void *lookupGpuMem(GpuContext *gcontext, CUdeviceptr devptr); -extern void *untrackGpuMem(GpuContext *gcontext, CUdeviceptr devptr); -extern bool trackGpuMemIPC(GpuContext *gcontext, - CUdeviceptr devptr, void *extra, - const char *filename, int lineno); -extern void *untrackGpuMemIPC(GpuContext *gcontext, CUdeviceptr devptr); -extern bool trackRawFileDesc(GpuContext *gcontext, GPUDirectFileDesc *fdesc, - const char *filename, int lineno); -extern void untrackRawFileDesc(GpuContext *gcontext, GPUDirectFileDesc *fdesc); -extern CUmodule __GpuContextLookupModule(GpuContext *gcontext, - ProgramId program_id, - const char *filename, int lineno); -#define GpuContextLookupModule(a,b) \ - __GpuContextLookupModule((a),(b),__FILE__,__LINE__) - -extern void pgstrom_init_gpu_context(void); - -/* - * Exception handling for work-queue of GpuContext - */ -#define STROM_TRY() \ - do { \ - sigjmp_buf *saved_exception_stack = GpuWorkerExceptionStack; \ - sigjmp_buf local_sigjmp_buf; \ - Assert(GpuWorkerCurrentContext != NULL); \ - if (sigsetjmp(local_sigjmp_buf, 0) == 0) \ - { \ - GpuWorkerExceptionStack = &local_sigjmp_buf; - -#define STROM_CATCH() \ - } \ - else \ - { \ - 
GpuWorkerExceptionStack = saved_exception_stack - -#define STROM_END_TRY() \ - } \ - GpuWorkerExceptionStack = saved_exception_stack; \ - } while(0) - -#define STROM_RE_THROW() \ - siglongjmp(*GpuWorkerExceptionStack, 1) - -#define STROM_REPORT_ERROR(elevel,elabel,fmt,...) \ - do { \ - if (!GpuWorkerCurrentContext) \ - elog((elevel), fmt, ##__VA_ARGS__); \ - else if ((elevel) < ERROR) \ - { \ - if ((elevel) >= log_min_messages) \ - fprintf(stderr, "%s: " fmt " (%s:%d)\n", \ - (elabel), ##__VA_ARGS__, \ - __FILE__, __LINE__); \ - } \ - else \ - { \ - GpuContextWorkerReportError((elevel), \ - ERRCODE_INTERNAL_ERROR, \ - __FILE__, __LINE__, \ - PG_FUNCNAME_MACRO, \ - fmt, ##__VA_ARGS__); \ - pg_unreachable(); \ - } \ - } while(0) - -#define wlog(fmt,...) \ - STROM_REPORT_ERROR(LOG,"Log",fmt,##__VA_ARGS__) -#define wnotice(fmt,...) \ - STROM_REPORT_ERROR(NOTICE,"Notice",fmt,##__VA_ARGS__) -#define werror(fmt,...) \ - STROM_REPORT_ERROR(ERROR,"Error",fmt,##__VA_ARGS__) -#define wfatal(fmt,...) \ - STROM_REPORT_ERROR(FATAL,"Fatal",fmt,##__VA_ARGS__) -#define wpanic(fmt,...) \ - STROM_REPORT_ERROR(PANIC,"Panic",fmt,##__VA_ARGS__) - -static inline void -CHECK_WORKER_TERMINATION(void) -{ - if (pg_atomic_read_u32(&GpuWorkerCurrentContext->terminate_workers)) - werror("GpuContext worker termination"); -} - -#define GPUCONTEXT_PUSH(gcontext) \ - do { \ - CUresult ____rc; \ - \ - ____rc = cuCtxPushCurrent((gcontext)->cuda_context); \ - if (____rc != CUDA_SUCCESS) \ - wfatal("failed on cuCtxPushCurrent: %s", errorText(____rc)) - -#define GPUCONTEXT_POP(gcontext) \ - ____rc = cuCtxPopCurrent(NULL); \ - if (____rc != CUDA_SUCCESS) \ - wfatal("failed on cuCtxPopCurrent: %s", errorText(____rc)); \ - } while(0) - -/* - * gpu_tasks.c - */ -extern CUdeviceptr pgstromSetupKernParambuf(GpuTaskState *gts); -extern void pgstromInitGpuTaskState(GpuTaskState *gts, - GpuContext *gcontext, - GpuTaskKind task_kind, - List *outer_quals, - List *outer_refs, - List *used_params, - const Bitmapset *optimal_gpus, - cl_uint outer_nrows_per_block, - cl_int eflags); -extern TupleTableSlot *pgstromExecGpuTaskState(GpuTaskState *gts); -extern void pgstromRescanGpuTaskState(GpuTaskState *gts); -extern void pgstromReleaseGpuTaskState(GpuTaskState *gts, - GpuTaskRuntimeStat *gt_rtstat); -extern void pgstromExplainGpuTaskState(GpuTaskState *gts, - ExplainState *es, - List *dcontext); -extern Size pgstromEstimateDSMGpuTaskState(GpuTaskState *gts, - ParallelContext *pcxt); -extern void pgstromInitDSMGpuTaskState(GpuTaskState *gts, - ParallelContext *pcxt, - void *coordinate); -extern void pgstromInitWorkerGpuTaskState(GpuTaskState *gts, - void *coordinate); -extern void pgstromReInitializeDSMGpuTaskState(GpuTaskState *gts); -extern void pgstromShutdownDSMGpuTaskState(GpuTaskState *gts); - -extern void pgstromInitGpuTask(GpuTaskState *gts, GpuTask *gtask); -extern void pgstrom_init_gputasks(void); - -/* - * cuda_program.c + * extra.c */ -extern ProgramId __pgstrom_create_cuda_program(GpuContext *gcontext, - cl_uint extra_flags, - cl_uint varlena_bufsz, - const char *kern_source, - const char *kern_define, - bool wait_for_build, - bool explain_only, - const char *filename, - int lineno); -#define pgstrom_create_cuda_program(a,b,c,d,e,f,g) \ - __pgstrom_create_cuda_program((a),(b),(c),(d),(e),(f),(g), \ - __FILE__,__LINE__) -extern CUmodule pgstrom_load_cuda_program(ProgramId program_id); -extern void pgstrom_put_cuda_program(GpuContext *gcontext, - ProgramId program_id); -extern void pgstrom_build_session_info(StringInfo str, - 
GpuTaskState *gts, - cl_uint extra_flags); - -extern char *pgstrom_cuda_source_string(ProgramId program_id); -extern const char *pgstrom_cuda_source_file(ProgramId program_id); -extern const char *pgstrom_cuda_binary_file(ProgramId program_id); -extern void pgstrom_init_cuda_program(void); +extern void pgstrom_init_extra(void); +extern bool heterodbValidateDevice(int gpu_device_id, + const char *gpu_device_name, + const char *gpu_device_uuid); +extern bool gpuDirectOpenDriver(void); +extern void gpuDirectCloseDriver(void); +extern bool gpuDirectMapGpuMemory(CUdeviceptr m_segment, + size_t segment_sz); +extern bool gpuDirectUnmapGpuMemory(CUdeviceptr m_segment); +extern bool gpuDirectFileReadIOV(const char *pathname, + CUdeviceptr m_segment, + off_t m_offset, + const strom_io_vector *iovec); +extern char *gpuDirectGetProperty(void); +extern void gpuDirectSetProperty(const char *key, const char *value); +extern bool gpuDirectIsAvailable(void); /* * codegen.c */ -#include "cuda_codegen.h" - -typedef struct codegen_context { - StringInfoData decl; /* declarations of functions for complex expression */ - int decl_count; /* # of temporary variables in decl */ - PlannerInfo *root; //not necessary? - RelOptInfo *baserel; /* scope of Var-node, if any */ - List *used_params;/* list of Const/Param in use */ - List *used_vars; /* list of Var in use */ - List *pseudo_tlist; /* pseudo tlist expression, if any */ - uint32_t extra_flags; /* external libraries to be included */ - uint32_t extra_bufsz; /* required size of temporary varlena buffer */ - int devcost; /* relative device cost */ +typedef struct +{ + int elevel; /* ERROR or DEBUG2 */ + Expr *top_expr; + List *used_params; + uint32_t required_flags; + uint32_t extra_flags; + uint32_t extra_bufsz; + uint32_t device_cost; + uint32_t kexp_flags; + List *kvars_depth; + List *kvars_resno; + List *kvars_types; + List *kvars_exprs; + List *tlist_dev; + uint32_t kvars_nslots; + List *input_rels_tlist; } codegen_context; -extern size_t pgstrom_codegen_extra_devtypes(char *buf, size_t bufsz, - uint32 extra_flags); extern devtype_info *pgstrom_devtype_lookup(Oid type_oid); -extern devtype_info *pgstrom_devtype_lookup_and_track(Oid type_oid, - codegen_context *context); extern devfunc_info *pgstrom_devfunc_lookup(Oid func_oid, - Oid func_rettype, List *func_args, Oid func_collid); -extern devfunc_info *pgstrom_devfunc_lookup_type_equal(devtype_info *dtype, - Oid type_collid); -extern devfunc_info *pgstrom_devfunc_lookup_type_compare(devtype_info *dtype, - Oid type_collid); -extern void pgstrom_devfunc_track(codegen_context *context, - devfunc_info *dfunc); -extern devcast_info *pgstrom_devcast_lookup(Oid src_type_oid, - Oid dst_type_oid); -extern bool pgstrom_devtype_can_relabel(Oid src_type_oid, - Oid dst_type_oid); -extern devindex_info *pgstrom_devindex_lookup(Oid opcode, - Oid opfamily); -extern char *pgstrom_codegen_expression(Node *expr, codegen_context *context); -extern void pgstrom_union_type_declarations(StringInfo buf, - const char *name, - List *type_oid_list); -extern bool __pgstrom_device_expression(PlannerInfo *root, - RelOptInfo *baserel, - Expr *expr, - int *p_devcost, - int *p_extra_sz, - const char *filename, int lineno); -#define pgstrom_device_expression(a,b,c) \ - __pgstrom_device_expression((a),(b),(c),NULL,NULL, \ - __FILE__,__LINE__) -#define pgstrom_device_expression_devcost(a,b,c,d) \ - __pgstrom_device_expression((a),(b),(c),(d),NULL, \ - __FILE__,__LINE__) -#define pgstrom_device_expression_extrasz(a,b,c,d) \ -
__pgstrom_device_expression((a),(b),(c),NULL,(d), \ - __FILE__,__LINE__) - -extern void pgstrom_init_codegen_context(codegen_context *context, - PlannerInfo *root, - RelOptInfo *baserel); -extern void pgstrom_init_codegen(void); - -/* - * datastore.c - */ -#define pgstrom_chunk_size() ((Size)(65534UL << 10)) /* almost 64MB */ - -extern cl_uint estimate_num_chunks(Path *pathnode); -extern bool KDS_fetch_tuple_row(TupleTableSlot *slot, - kern_data_store *kds, - HeapTuple tuple_buf, - size_t row_index); -extern bool KDS_fetch_tuple_slot(TupleTableSlot *slot, - kern_data_store *kds, - size_t row_index); -extern bool PDS_fetch_tuple(TupleTableSlot *slot, - pgstrom_data_store *pds, - GpuTaskState *gts); -extern kern_data_store *__KDS_clone(GpuContext *gcontext, - kern_data_store *kds, - const char *filename, int lineno); -extern pgstrom_data_store *__PDS_clone(pgstrom_data_store *pds, - const char *filename, int lineno); -extern pgstrom_data_store *PDS_retain(pgstrom_data_store *pds); -extern void PDS_release(pgstrom_data_store *pds); - -extern size_t KDS_calculateHeadSize(TupleDesc tupdesc); -extern bool KDS_schemaIsCompatible(TupleDesc tupdesc, - kern_data_store *kds); -extern void init_kernel_data_store(kern_data_store *kds, - TupleDesc tupdesc, - Size length, - int format, - uint nrooms); - -extern pgstrom_data_store *__PDS_create_row(GpuContext *gcontext, - TupleDesc tupdesc, - Size length, - const char *fname, int lineno); -extern pgstrom_data_store *__PDS_create_hash(GpuContext *gcontext, - TupleDesc tupdesc, - Size length, - const char *fname, int lineno); -extern pgstrom_data_store *__PDS_create_slot(GpuContext *gcontext, - TupleDesc tupdesc, - size_t bytesize, - const char *filename, int lineno); -extern pgstrom_data_store *__PDS_create_block(GpuContext *gcontext, - TupleDesc tupdesc, - NVMEScanState *nvme_sstate, - const char *fname, int lineno); -#define PDS_create_row(a,b,c) \ - __PDS_create_row((a),(b),(c),__FILE__,__LINE__) -#define PDS_create_hash(a,b,c) \ - __PDS_create_hash((a),(b),(c),__FILE__,__LINE__) -#define PDS_create_slot(a,b,c) \ - __PDS_create_slot((a),(b),(c),__FILE__,__LINE__) -#define PDS_create_block(a,b,c) \ - __PDS_create_block((a),(b),(c),__FILE__,__LINE__) -#define KDS_clone(a,b) \ - __KDS_clone((a),(b),__FILE__,__LINE__) -#define PDS_clone(a) \ - __PDS_clone((a),__FILE__,__LINE__) - -extern void KDS_dump_schema(kern_data_store *kds); -//XXX - to be gpu_task.c? 
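The PDS_create_row() / PDS_create_hash() / PDS_create_slot() / PDS_create_block() macros just above, like the gpuMemAlloc() family earlier in this header, follow a single convention: the real entry point is a double-underscored function that receives the caller's file and line, and a thin macro injects __FILE__/__LINE__ so a leaked buffer can be traced back to its allocation site. A self-contained sketch of that pattern, using invented my_alloc()/__my_alloc() names rather than any PG-Strom API:

    /* hypothetical example -- my_alloc()/__my_alloc() are not PG-Strom APIs */
    #include <stdio.h>
    #include <stdlib.h>

    static void *
    __my_alloc(size_t sz, const char *filename, int lineno)
    {
        void   *ptr = malloc(sz);

        /* a real tracker would also record (ptr, filename, lineno),
         * much as trackGpuMem() does for device memory */
        if (!ptr)
            fprintf(stderr, "out of memory (%zu bytes) at %s:%d\n",
                    sz, filename, lineno);
        return ptr;
    }
    #define my_alloc(sz)    __my_alloc((sz), __FILE__, __LINE__)

    int main(void)
    {
        char   *buf = my_alloc(1024);   /* failures report the caller's file:line */

        free(buf);
        return 0;
    }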
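Likewise, the STROM_TRY() / STROM_CATCH() / STROM_END_TRY() macros removed a few hunks above give GpuContext worker threads a PG_TRY()-style protected block built on sigsetjmp(), with werror() and STROM_RE_THROW() acting as the throw. The standalone miniature below reproduces only the control flow; the MY_* names are invented for illustration, and the real macros additionally assert a current GpuContext and keep the exception stack in thread-local storage:

    /* hypothetical, simplified analogue of STROM_TRY/STROM_CATCH/STROM_END_TRY */
    #include <setjmp.h>
    #include <stdio.h>

    static sigjmp_buf *exception_stack = NULL;

    #define MY_TRY() \
        do { \
            sigjmp_buf *saved_stack = exception_stack; \
            sigjmp_buf  local_buf; \
            if (sigsetjmp(local_buf, 0) == 0) \
            { \
                exception_stack = &local_buf;
    #define MY_CATCH() \
            } \
            else \
            { \
                exception_stack = saved_stack;
    #define MY_END_TRY() \
            } \
            exception_stack = saved_stack; \
        } while(0)
    #define MY_THROW()      siglongjmp(*exception_stack, 1)

    int main(void)
    {
        MY_TRY();
        {
            puts("doing risky work");
            MY_THROW();             /* simulate a device error */
        }
        MY_CATCH();
        {
            puts("recovered from the error");
        }
        MY_END_TRY();
        return 0;
    }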
-extern void PDS_init_heapscan_state(GpuTaskState *gts); -extern void PDS_end_heapscan_state(GpuTaskState *gts); -extern void PDS_fillup_blocks(pgstrom_data_store *pds); -extern void __PDS_fillup_arrow(pgstrom_data_store *pds_dst, - GpuContext *gcontext, - kern_data_store *kds_head, - int fdesc, strom_io_vector *iovec); -extern pgstrom_data_store *PDS_fillup_arrow(pgstrom_data_store *pds_src); -extern pgstrom_data_store *PDS_writeback_arrow(pgstrom_data_store *pds_src, - CUdeviceptr m_kds_src); -extern bool KDS_insert_tuple(kern_data_store *kds, - TupleTableSlot *slot); -#define PDS_insert_tuple(pds,slot) KDS_insert_tuple(&(pds)->kds,slot) - -extern bool KDS_insert_hashitem(kern_data_store *kds, - TupleTableSlot *slot, - cl_uint hash_value); -extern void pgstrom_init_datastore(void); - -/* - * relscan.c - */ -extern IndexOptInfo *pgstrom_tryfind_brinindex(PlannerInfo *root, - RelOptInfo *baserel, - List **p_indexConds, - List **p_indexQuals, - cl_long *p_indexNBlocks); -#define PGSTROM_RELSCAN_SSD2GPU 0x0001 -#define PGSTROM_RELSCAN_BRIN_INDEX 0x0002 -#define PGSTROM_RELSCAN_ARROW_FDW 0x0004 -#define PGSTROM_RELSCAN_GPU_CACHE 0x0008 -extern int pgstrom_common_relscan_cost(PlannerInfo *root, - RelOptInfo *scan_rel, - List *scan_quals, - int parallel_workers, +extern devfunc_info *devtype_lookup_equal_func(devtype_info *dtype, Oid coll_id); +extern devfunc_info *devtype_lookup_compare_func(devtype_info *dtype, Oid coll_id); + +extern void codegen_context_init(codegen_context *context, + uint32_t task_kind); +extern bytea *codegen_build_qualifiers(codegen_context *context, + List *dev_quals); +extern bytea *codegen_build_scan_loadvars(codegen_context *context); +extern bytea *codegen_build_scan_quals(codegen_context *context, + List *dev_quals); +extern bytea *codegen_build_join_loadvars(codegen_context *context); +extern bytea *codegen_build_packed_joinquals(codegen_context *context, + List *stacked_join_quals, + List *stacked_other_quals); +extern bytea *codegen_build_packed_hashkeys(codegen_context *context, + List *stacked_hash_values); +extern bytea *codegen_build_projection(codegen_context *context); +extern void codegen_build_groupby_actions(codegen_context *context, + pgstromPlanInfo *pp_info); +extern void codegen_build_packed_xpucode(bytea **p_xpucode, + List *exprs_list, + bool inject_hash_value, + List *input_rels_tlist, + uint32_t *p_extra_flags, + uint32_t *p_extra_bufsz, + uint32_t *p_kvars_nslots, + List **p_used_params); +extern bool pgstrom_xpu_expression(Expr *expr, + uint32_t task_kind, + List *input_rels_tlist, + int *p_devcost); +extern bool pgstrom_gpu_expression(Expr *expr, + List *input_rels_tlist, + int *p_devcost); +extern bool pgstrom_dpu_expression(Expr *expr, + List *input_rels_tlist, + int *p_devcost); +extern void pgstrom_explain_xpucode(const CustomScanState *css, + ExplainState *es, + List *dcontext, + const char *label, + bytea *xpucode); +extern char *pgstrom_xpucode_to_string(bytea *xpu_code); +extern void pgstrom_init_codegen(void); + +/* + * brin.c + */ +extern IndexOptInfo *pgstromTryFindBrinIndex(PlannerInfo *root, + RelOptInfo *baserel, + List **p_indexConds, + List **p_indexQuals, + int64_t *p_indexNBlocks); +extern Cost cost_brin_bitmap_build(PlannerInfo *root, + RelOptInfo *baserel, IndexOptInfo *indexOpt, - List *indexQuals, - cl_long indexNBlocks, - double *p_parallel_divisor, - double *p_scan_ntuples, - double *p_scan_nchunks, - cl_uint *p_nrows_per_block, - Cost *p_startup_cost, - Cost *p_run_cost); -extern Bitmapset 
*pgstrom_pullup_outer_refs(PlannerInfo *root, - RelOptInfo *base_rel, - Bitmapset *referenced); - -extern const Bitmapset *GetOptimalGpusForRelation(PlannerInfo *root, - RelOptInfo *rel); -extern bool ScanPathWillUseNvmeStrom(PlannerInfo *root, - RelOptInfo *baserel); -extern bool RelationCanUseNvmeStrom(Relation relation); - -extern void pgstromExecInitBrinIndexMap(GpuTaskState *gts, - Oid index_oid, - List *index_conds, - List *index_quals); -extern Size pgstromSizeOfBrinIndexMap(GpuTaskState *gts); -extern void pgstromExecGetBrinIndexMap(GpuTaskState *gts); -extern void pgstromExecEndBrinIndexMap(GpuTaskState *gts); -extern void pgstromExecRewindBrinIndexMap(GpuTaskState *gts); -extern void pgstromExplainBrinIndexMap(GpuTaskState *gts, - ExplainState *es, - List *dcontext); - -extern pgstrom_data_store *pgstromExecScanChunk(GpuTaskState *gts); -extern void pgstromRewindScanChunk(GpuTaskState *gts); - -extern void pgstromExplainOuterScan(GpuTaskState *gts, - List *deparse_context, - List *ancestors, - ExplainState *es, - List *outer_quals, - Cost outer_startup_cost, - Cost outer_total_cost, - double outer_plan_rows, - int outer_plan_width); - -extern void pgstrom_init_relscan(void); + List *indexQuals); + +extern void pgstromBrinIndexExecBegin(pgstromTaskState *pts, + Oid index_oid, + List *index_conds, + List *index_quals); +extern bool pgstromBrinIndexNextChunk(pgstromTaskState *pts); +extern TBMIterateResult *pgstromBrinIndexNextBlock(pgstromTaskState *pts); +extern void pgstromBrinIndexExecEnd(pgstromTaskState *pts); +extern void pgstromBrinIndexExecReset(pgstromTaskState *pts); +extern Size pgstromBrinIndexEstimateDSM(pgstromTaskState *pts); +extern Size pgstromBrinIndexInitDSM(pgstromTaskState *pts, char *dsm_addr); +extern Size pgstromBrinIndexAttachDSM(pgstromTaskState *pts, char *dsm_addr); +extern void pgstromBrinIndexShutdownDSM(pgstromTaskState *pts); +extern void pgstromBrinIndexExplain(pgstromTaskState *pts, + List *dcontext, + ExplainState *es); +extern void pgstrom_init_brin(void); /* - * gpuscan.c + * relscan.c */ -extern bool enable_gpuscan; /* GUC */ -extern Cost cost_for_dma_receive(RelOptInfo *rel, double ntuples); -extern void codegen_gpuscan_quals(StringInfo kern, - codegen_context *context, - const char *component, - Index scanrelid, - List *dev_quals_list); -extern bool pgstrom_pullup_outer_scan(PlannerInfo *root, - const Path *outer_path, - Index *p_outer_relid, - List **p_outer_quals, - const Bitmapset **p_optimal_gpus, - IndexOptInfo **p_index_opt, - List **p_index_conds, - List **p_index_quals, - cl_long *p_index_nblocks); -extern bool pgstrom_path_is_gpuscan(const Path *path); -extern bool pgstrom_plan_is_gpuscan(const Plan *plan); -extern bool pgstrom_planstate_is_gpuscan(const PlanState *ps); -extern Path *pgstrom_copy_gpuscan_path(const Path *pathnode); -extern void assign_gpuscan_session_info(StringInfo buf, GpuTaskState *gts); -extern void pgstrom_init_gpuscan(void); +extern Bitmapset *pickup_outer_referenced(PlannerInfo *root, + RelOptInfo *base_rel, + Bitmapset *referenced); +extern size_t estimate_kern_data_store(TupleDesc tupdesc); +extern size_t setup_kern_data_store(kern_data_store *kds, + TupleDesc tupdesc, + size_t length, + char format); +extern XpuCommand *pgstromRelScanChunkDirect(pgstromTaskState *pts, + struct iovec *xcmd_iov, + int *xcmd_iovcnt); +extern XpuCommand *pgstromRelScanChunkNormal(pgstromTaskState *pts, + struct iovec *xcmd_iov, + int *xcmd_iovcnt); +extern void pgstromStoreFallbackTuple(pgstromTaskState *pts, HeapTuple tuple); 
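The declaration just above and pgstromFetchFallbackTuple() right below form the CPU-fallback spill path: tuples the device could not process are copied into a flat host buffer, remembered by offset, and later replayed through the regular executor via the fallback_* fields of pgstromTaskState shown earlier. A minimal sketch of how such an offset-array spill buffer can work; it is illustrative only (it assumes both arrays were palloc()'d at ExecInit time, and the real fetch path hands the tuple back in a TupleTableSlot):

    /* illustrative sketch -- not the actual relscan.c implementation */
    #include "pg_strom.h"

    static void
    fallback_store_tuple(pgstromTaskState *pts, HeapTuple tuple)
    {
        size_t      sz = MAXALIGN(HEAPTUPLESIZE + tuple->t_len);
        char       *pos;

        /* grow the offset array on demand */
        if (pts->fallback_nitems >= pts->fallback_nrooms)
        {
            pts->fallback_nrooms = Max(2 * pts->fallback_nrooms, 512);
            pts->fallback_tuples = repalloc(pts->fallback_tuples,
                                            sizeof(off_t) * pts->fallback_nrooms);
        }
        /* grow the flat buffer on demand */
        if (pts->fallback_usage + sz > pts->fallback_bufsz)
        {
            pts->fallback_bufsz = Max(2 * pts->fallback_bufsz,
                                      pts->fallback_usage + sz);
            pts->fallback_buffer = repalloc(pts->fallback_buffer,
                                            pts->fallback_bufsz);
        }
        /* remember the offset, then copy the tuple header and body */
        pos = pts->fallback_buffer + pts->fallback_usage;
        memcpy(pos, tuple, HEAPTUPLESIZE);
        memcpy(pos + HEAPTUPLESIZE, tuple->t_data, tuple->t_len);
        pts->fallback_tuples[pts->fallback_nitems++] = pts->fallback_usage;
        pts->fallback_usage += sz;
    }

    static HeapTuple
    fallback_fetch_tuple(pgstromTaskState *pts)
    {
        HeapTuple   htup;

        if (pts->fallback_index >= pts->fallback_nitems)
            return NULL;        /* all spilled tuples were replayed */
        htup = (HeapTuple)(pts->fallback_buffer +
                           pts->fallback_tuples[pts->fallback_index++]);
        /* re-point t_data at the copy inside the flat buffer */
        htup->t_data = (HeapTupleHeader)((char *)htup + HEAPTUPLESIZE);
        return htup;
    }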
+extern TupleTableSlot *pgstromFetchFallbackTuple(pgstromTaskState *pts); +extern void pgstrom_init_relscan(void); + +/* + * optimizer.c + */ + + + +/* + * executor.c + */ +extern void __xpuClientOpenSession(pgstromTaskState *pts, + const XpuCommand *session, + pgsocket sockfd, + const char *devname, + int dev_index); +extern int +xpuConnectReceiveCommands(pgsocket sockfd, + void *(*alloc_f)(void *priv, size_t sz), + void (*attach_f)(void *priv, XpuCommand *xcmd), + void *priv, + const char *error_label); +extern void xpuClientCloseSession(XpuConnection *conn); +extern void xpuClientSendCommand(XpuConnection *conn, const XpuCommand *xcmd); +extern void xpuClientPutResponse(XpuCommand *xcmd); +extern const XpuCommand *pgstromBuildSessionInfo(pgstromTaskState *pts, + uint32_t join_inner_handle, + TupleDesc tdesc_final); +extern void pgstromExecInitTaskState(CustomScanState *node, + EState *estate, + int eflags); +extern TupleTableSlot *pgstromExecTaskState(CustomScanState *node); +extern void pgstromExecEndTaskState(CustomScanState *node); +extern void pgstromExecResetTaskState(CustomScanState *node); +extern Size pgstromSharedStateEstimateDSM(CustomScanState *node, + ParallelContext *pcxt); +extern void pgstromSharedStateInitDSM(CustomScanState *node, + ParallelContext *pcxt, + void *coordinate); +extern void pgstromSharedStateAttachDSM(CustomScanState *node, + shm_toc *toc, + void *coordinate); +extern void pgstromSharedStateShutdownDSM(CustomScanState *node); +extern void pgstromExplainTaskState(CustomScanState *node, + List *ancestors, + ExplainState *es); +extern void pgstrom_init_executor(void); /* - * gpujoin.c + * pcie.c */ -struct GpuJoinSharedState; -struct kern_gpujoin; - -extern bool pgstrom_path_is_gpujoin(const Path *pathnode); -extern bool pgstrom_plan_is_gpujoin(const Plan *plannode); -extern bool pgstrom_planstate_is_gpujoin(const PlanState *ps); -extern Path *pgstrom_copy_gpujoin_path(const Path *pathnode); -extern const Bitmapset *gpujoin_get_optimal_gpus(const Path *pathnode); - -#if PG_VERSION_NUM >= 110000 -extern List *extract_partitionwise_pathlist(PlannerInfo *root, - Path *outer_path, - bool try_outer_parallel, - bool try_inner_parallel, - AppendPath **p_append_path, - int *p_parallel_nworkers, - Cost *p_discount_cost); -#endif -extern int gpujoin_process_task(GpuTask *gtask, CUmodule cuda_module); -extern void gpujoin_release_task(GpuTask *gtask); -extern void assign_gpujoin_session_info(StringInfo buf, - GpuTaskState *gts); -extern void pgstrom_init_gpujoin(void); - -extern Size GpuJoinSetupTask(struct kern_gpujoin *kgjoin, - GpuTaskState *gts, - pgstrom_data_store *pds_src); -extern ProgramId GpuJoinCreateCombinedProgram(PlanState *node, - GpuTaskState *gpa_gts, - cl_uint gpa_extra_flags, - cl_uint gpa_varlena_bufsz, - const char *gpa_kern_source, - bool explain_only); -extern bool GpuJoinInnerPreload(GpuTaskState *gts, CUdeviceptr *p_m_kmrels); -extern void GpuJoinInnerUnload(GpuTaskState *gts, bool is_rescan); -extern pgstrom_data_store *GpuJoinExecOuterScanChunk(GpuTaskState *gts); -extern int gpujoinNextRightOuterJoinIfAny(GpuTaskState *gts); -extern TupleTableSlot *gpujoinNextTupleFallbackUpper(GpuTaskState *gts, - struct kern_gpujoin *kgjoin, - pgstrom_data_store *pds_src, - cl_int outer_depth); -extern void gpujoinUpdateRunTimeStat(GpuTaskState *gts, - struct kern_gpujoin *kgjoin); +extern const Bitmapset *GetOptimalGpuForFile(const char *pathname); +extern const Bitmapset *GetOptimalGpuForRelation(Relation relation); +extern const Bitmapset 
*GetOptimalGpuForBaseRel(PlannerInfo *root, + RelOptInfo *baserel); +extern void pgstrom_init_pcie(void); /* - * gpupreagg.c + * gpu_device.c */ -extern int pgstrom_hll_register_bits; -extern bool pgstrom_path_is_gpupreagg(const Path *pathnode); -extern bool pgstrom_plan_is_gpupreagg(const Plan *plan); -extern bool pgstrom_planstate_is_gpupreagg(const PlanState *ps); -extern Path *pgstrom_copy_gpupreagg_path(const Path *pathnode); -extern void gpupreagg_post_planner(PlannedStmt *pstmt, CustomScan *cscan); -extern void assign_gpupreagg_session_info(StringInfo buf, - GpuTaskState *gts); -extern void pgstrom_init_gpupreagg(void); +extern double pgstrom_gpu_setup_cost; /* GUC */ +extern double pgstrom_gpu_tuple_cost; /* GUC */ +extern double pgstrom_gpu_operator_cost; /* GUC */ +extern double pgstrom_gpu_direct_seq_page_cost; /* GUC */ +extern double pgstrom_gpu_operator_ratio(void); +extern void gpuClientOpenSession(pgstromTaskState *pts, + const XpuCommand *session); +extern CUresult gpuOptimalBlockSize(int *p_grid_sz, + int *p_block_sz, + unsigned int *p_shmem_sz, + CUfunction kern_function, + size_t dynamic_shmem_per_block, + size_t dynamic_shmem_per_warp); +extern bool pgstrom_init_gpu_device(void); /* - * arrow_fdw.c and arrow_read.c + * gpu_service.c */ -extern bool baseRelIsArrowFdw(RelOptInfo *baserel); -extern bool RelationIsArrowFdw(Relation frel); -extern Bitmapset *GetOptimalGpusForArrowFdw(PlannerInfo *root, - RelOptInfo *baserel); -extern bool KDS_fetch_tuple_arrow(TupleTableSlot *slot, - kern_data_store *kds, - size_t row_index); - -extern ArrowFdwState *ExecInitArrowFdw(ScanState *ss, - GpuContext *gcontext, - List *outer_quals, - Bitmapset *outer_refs); -extern pgstrom_data_store *ExecScanChunkArrowFdw(GpuTaskState *gts); -extern void ExecReScanArrowFdw(ArrowFdwState *af_state); -extern void ExecEndArrowFdw(ArrowFdwState *af_state); +struct gpuClient +{ + struct gpuContext *gcontext;/* per-device status */ + dlist_node chain; /* gcontext->client_list */ + CUmodule cuda_module;/* preload cuda binary */ + kern_session_info *session; /* per session info (on cuda managed memory) */ + struct gpuQueryBuffer *gq_buf; /* per query join/preagg device buffer */ + pg_atomic_uint32 refcnt; /* odd number, if error status */ + pthread_mutex_t mutex; /* mutex to write the socket */ + int sockfd; /* connection to PG backend */ + pthread_t worker; /* receiver thread */ +}; +typedef struct gpuClient gpuClient; + +extern int pgstrom_max_async_gpu_tasks; /* GUC */ +extern bool pgstrom_load_gpu_debug_module; /* GUC */ +extern const char *cuStrError(CUresult rc); +extern void __gpuClientELogRaw(gpuClient *gclient, + kern_errorbuf *errorbuf); +extern void __gpuClientELog(gpuClient *gclient, + int errcode, + const char *filename, int lineno, + const char *funcname, + const char *fmt, ...); +#define gpuClientELog(gclient,fmt,...) \ + __gpuClientELog((gclient), ERRCODE_DEVICE_INTERNAL, \ + __FILE__, __LINE__, __FUNCTION__, \ + (fmt), ##__VA_ARGS__) +#define gpuClientFatal(gclient,fmt,...) 
\ + __gpuClientELog((gclient), ERRCODE_DEVICE_FATAL, \ + __FILE__, __LINE__, __FUNCTION__, \ + (fmt), ##__VA_ARGS__) + +extern __thread int CU_DINDEX_PER_THREAD; +extern __thread CUdevice CU_DEVICE_PER_THREAD; +extern __thread CUcontext CU_CONTEXT_PER_THREAD; +extern __thread CUevent CU_EVENT_PER_THREAD; -extern void ExecInitDSMArrowFdw(ArrowFdwState *af_state, - GpuTaskSharedState *gtss); -extern void ExecReInitDSMArrowFdw(ArrowFdwState *af_state); -extern void ExecInitWorkerArrowFdw(ArrowFdwState *af_state, - GpuTaskSharedState *gtss); -extern void ExecShutdownArrowFdw(ArrowFdwState *af_state); -extern void ExplainArrowFdw(ArrowFdwState *af_state, - Relation frel, - ExplainState *es, - List *dcontext); -extern void pgstrom_init_arrow_fdw(void); +typedef struct +{ + CUdeviceptr __base__; + size_t __offset__; + size_t __length__; + CUdeviceptr m_devptr; +} gpuMemChunk; + +extern const gpuMemChunk *gpuMemAlloc(size_t bytesize); +extern void gpuMemFree(const gpuMemChunk *chunk); +extern const gpuMemChunk *gpuservLoadKdsBlock(gpuClient *gclient, + kern_data_store *kds, + const char *pathname, + strom_io_vector *kds_iovec); +extern const gpuMemChunk *gpuservLoadKdsArrow(gpuClient *gclient, + kern_data_store *kds, + const char *pathname, + strom_io_vector *kds_iovec); +extern bool gpuServiceGoingTerminate(void); +extern void gpuClientWriteBack(gpuClient *gclient, + XpuCommand *resp, + size_t resp_sz, + int kds_nitems, + kern_data_store **kds_array); +extern void pgstrom_init_gpu_service(void); /* * gpu_cache.c */ -extern bool baseRelHasGpuCache(PlannerInfo *root, - RelOptInfo *baserel); -extern bool RelationHasGpuCache(Relation rel); -extern GpuCacheState *ExecInitGpuCache(ScanState *ss, int eflags, - Bitmapset *outer_refs); -extern pgstrom_data_store *ExecScanChunkGpuCache(GpuTaskState *gts); -extern void ExecReScanGpuCache(GpuCacheState *gcache_state); -extern void ExecEndGpuCache(GpuCacheState *gcache_state); - -extern void ExecInitDSMGpuCache(GpuCacheState *gcache_state, - GpuTaskSharedState *gtss); -extern void ExecReInitDSMGpuCache(GpuCacheState *gcache_state); -extern void ExecInitWorkerGpuCache(GpuCacheState *gcache_state, - GpuTaskSharedState *gtss); -extern void ExecShutdownGpuCache(GpuCacheState *gcache_state); -extern void ExplainGpuCache(GpuCacheState *gcache_state, - Relation frel, ExplainState *es); -extern CUresult gpuCacheMapDeviceMemory(GpuContext *gcontext, - pgstrom_data_store *pds); -extern void gpuCacheUnmapDeviceMemory(GpuContext *gcontext, - pgstrom_data_store *pds); -extern void gpuCacheBgWorkerBegin(int cuda_dindex); -extern bool gpuCacheBgWorkerDispatch(int cuda_dindex); -extern bool gpuCacheBgWorkerIdleTask(int cuda_dindex); -extern void gpuCacheBgWorkerEnd(int cuda_dindex); -extern void pgstrom_init_gpu_cache(void); - -/* - * misc.c - */ -extern Node *fixup_varnode_to_origin(Node *expr, List *cscan_tlist); -extern Expr *make_flat_ands_explicit(List *andclauses); -extern AppendRelInfo **find_appinfos_by_relids_nofail(PlannerInfo *root, - Relids relids, - int *nappinfos); -extern double get_parallel_divisor(Path *path); -#if PG_VERSION_NUM < 110000 -/* PG11 changed pg_proc definition */ -extern char get_func_prokind(Oid funcid); -#define PROKIND_FUNCTION 'f' -#define PROKIND_AGGREGATE 'a' -#define PROKIND_WINDOW 'w' -#define PROKIND_PROCEDURE 'p' -#endif -extern int get_relnatts(Oid relid); -extern Oid get_function_oid(const char *func_name, - oidvector *func_args, - Oid namespace_oid, - bool missing_ok); -extern Oid get_type_oid(const char *type_name, - Oid 
namespace_oid, - bool missing_ok); -extern char *get_type_name(Oid type_oid, bool missing_ok); -extern char *get_proc_library(HeapTuple protup); -extern Oid get_object_extension_oid(Oid class_id, - Oid object_id, - int32 objsub_id, - bool missing_ok); -extern char *bms_to_cstring(Bitmapset *x); -extern List *bms_to_pglist(const Bitmapset *bms); -extern Bitmapset *bms_from_pglist(List *pglist); -extern bool pathtree_has_gpupath(Path *node); -extern bool pathtree_has_parallel_aware(Path *node); -extern Path *pgstrom_copy_pathnode(const Path *pathnode); -extern const char *errorText(int errcode); - -extern ssize_t __readFile(int fdesc, void *buffer, size_t nbytes); -extern ssize_t __writeFile(int fdesc, const void *buffer, size_t nbytes); -extern ssize_t __preadFile(int fdesc, void *buffer, size_t nbytes, off_t f_pos); -extern ssize_t __pwriteFile(int fdesc, const void *buffer, size_t nbytes, off_t f_pos); -extern void *__mmapFile(void *addr, size_t length, - int prot, int flags, int fdesc, off_t offset); -extern int __munmapFile(void *mmap_addr); -extern void *__mremapFile(void *mmap_addr, size_t new_size); - -/* - * nvrtc.c - */ -extern int pgstrom_nvrtc_version(void); -extern void pgstrom_init_nvrtc(void); - -/* - * cufile.c - */ -extern bool cuFileDriverLoaded(void); -extern void pgstrom_init_cufile(void); - -/* - * extra.c - */ -extern bool pgstrom_gpudirect_enabled(void); -extern Size pgstrom_gpudirect_threshold(void); -extern void pgstrom_init_extra(void); -extern bool heterodbLicenseCheck(void); -extern int gpuDirectInitDriver(void); -extern void gpuDirectFileDescOpen(GPUDirectFileDesc *gds_fdesc, - File pg_fdesc); -extern void gpuDirectFileDescOpenByPath(GPUDirectFileDesc *gds_fdesc, - const char *pathname); -extern void gpuDirectFileDescClose(const GPUDirectFileDesc *gds_fdesc); -extern CUresult gpuDirectMapGpuMemory(CUdeviceptr m_segment, - size_t m_segment_sz, - unsigned long *p_iomap_handle); -extern CUresult gpuDirectUnmapGpuMemory(CUdeviceptr m_segment, - unsigned long iomap_handle); - -extern void gpuDirectFileReadIOV(const GPUDirectFileDesc *gds_fdesc, - CUdeviceptr m_segment, - unsigned long iomap_handle, - off_t m_offset, - strom_io_vector *iovec); -extern void extraSysfsSetupDistanceMap(const char *manual_config); -extern Bitmapset *extraSysfsLookupOptimalGpus(File filp); -extern ssize_t extraSysfsPrintNvmeInfo(int index, char *buffer, ssize_t buffer_sz); - -/* - * float2.c - */ -#ifndef FLOAT2OID -#define FLOAT2OID 421 -#endif - -/* - * tinyint.c - */ -#ifndef INT1OID -#define INT1OID 606 -#endif - -/* - * main.c - */ -extern int pgstrom_num_users_extra; -extern pgstromUsersExtraDescriptor pgstrom_users_extra_desc[]; -extern Path *pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath); -extern const Path *gpu_path_find_cheapest(PlannerInfo *root, - RelOptInfo *rel, - bool outer_parallel, - bool inner_parallel); -extern bool gpu_path_remember(PlannerInfo *root, - RelOptInfo *rel, - bool outer_parallel, - bool inner_parallel, - const Path *gpu_path); - -extern void _PG_init(void); -extern const char *pgstrom_strerror(cl_int errcode); - -extern void pgstrom_explain_expression(List *expr_list, const char *qlabel, - PlanState *planstate, - List *deparse_context, - List *ancestors, ExplainState *es, - bool force_prefix, - bool convert_to_and); -extern void show_scan_qual(List *qual, const char *qlabel, - PlanState *planstate, List *ancestors, - ExplainState *es); -extern void show_instrumentation_count(const char *qlabel, int which, - PlanState *planstate, ExplainState 
*es); - -/* ---------------------------------------------------------------- - * - * Miscellaneous static inline functions - * - * ---------------------------------------------------------------- */ -/* looong label is not friendly for indent */ -#define NumOfSystemAttrs (-(1+FirstLowInvalidHeapAttributeNumber)) -/* Max/Min macros that take 3 or more arguments */ -#define Max3(a,b,c) ((a) > (b) ? Max((a),(c)) : Max((b),(c))) -#define Max4(a,b,c,d) Max(Max((a),(b)), Max((c),(d))) - -#define Min3(a,b,c) ((a) > (b) ? Min((a),(c)) : Min((b),(c))) -#define Min4(a,b,c,d) Min(Min((a),(b)), Min((c),(d))) - -#ifndef SAMESIGN -#define SAMESIGN(a,b) (((a) < 0) == ((b) < 0)) -#endif - -/* - * trim_cstring - remove spaces from head/tail - */ -static inline char * -trim_cstring(char *str) -{ - char *end; - while (isspace(*str)) - str++; - end = str + strlen(str) - 1; - while (end >= str && isspace(*end)) - *end-- = '\0'; - return str; -} /* - * pmakeFloat - for convenience; makeFloat + psprintf + * gpu_scan.c */ -#define pmakeFloat(fval) \ - makeFloat(psprintf("%.*e", DBL_DIG+3, (double)(fval))) +extern void sort_device_qualifiers(List *dev_quals_list, + List *dev_costs_list); +extern CustomPath *buildXpuScanPath(PlannerInfo *root, + RelOptInfo *baserel, + bool parallel_path, + bool allow_host_quals, + bool allow_no_device_quals, + uint32_t task_kind); +extern CustomScan *PlanXpuScanPathCommon(PlannerInfo *root, + RelOptInfo *baserel, + CustomPath *best_path, + List *tlist, + List *clauses, + pgstromPlanInfo *pp_info, + const CustomScanMethods *methods); +extern void ExecFallbackCpuScan(pgstromTaskState *pts, + kern_data_store *kds, + HeapTuple tuple); +extern void gpuservHandleGpuScanExec(gpuClient *gclient, XpuCommand *xcmd); +extern void pgstrom_init_gpu_scan(void); + +/* + * gpu_join.c + */ +extern void form_pgstrom_plan_info(CustomScan *cscan, + pgstromPlanInfo *pp_info); +extern pgstromPlanInfo *deform_pgstrom_plan_info(CustomScan *cscan); +extern void extract_input_path_params(const Path *input_path, + const Path *inner_path, /* optional */ + pgstromPlanInfo **p_pp_info, + List **p_input_paths_tlist, + List **p_inner_paths_list); +extern void xpujoin_add_custompath(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType join_type, + JoinPathExtraData *extra, + uint32_t task_kind, + const CustomPathMethods *methods); +extern List *build_fallback_exprs_scan(Index scan_relid, List *scan_exprs); +extern List *build_fallback_exprs_join(codegen_context *context, + List *join_exprs); +extern CustomScan *PlanXpuJoinPathCommon(PlannerInfo *root, + RelOptInfo *joinrel, + CustomPath *cpath, + List *tlist, + List *custom_plans, + pgstromPlanInfo *pp_info, + const CustomScanMethods *methods); +extern uint32_t GpuJoinInnerPreload(pgstromTaskState *pts); +extern void ExecFallbackCpuJoin(pgstromTaskState *pts, + kern_data_store *kds, + HeapTuple tuple); +extern void ExecFallbackCpuJoinRightOuter(pgstromTaskState *pts); +extern void pgstrom_init_gpu_join(void); /* - * get_prev_log2 - * - * It returns N of the largest 2^N value that is smaller than or equal to - the supplied value.
+ * gpu_groupby.c */ -static inline int -get_prev_log2(Size size) -{ - int shift = 0; - - if (size == 0 || size == 1) - return 0; - size >>= 1; -#if __GNUC__ - shift = sizeof(Size) * BITS_PER_BYTE - __builtin_clzl(size); -#else -#if SIZEOF_VOID_P == 8 - if ((size & 0xffffffff00000000UL) != 0) - { - size >>= 32; - shift += 32; - } -#endif - if ((size & 0xffff0000UL) != 0) - { - size >>= 16; - shift += 16; - } - if ((size & 0x0000ff00UL) != 0) - { - size >>= 8; - shift += 8; - } - if ((size & 0x000000f0UL) != 0) - { - size >>= 4; - shift += 4; - } - if ((size & 0x0000000cUL) != 0) - { - size >>= 2; - shift += 2; - } - if ((size & 0x00000002UL) != 0) - { - size >>= 1; - shift += 1; - } - if ((size & 0x00000001UL) != 0) - shift += 1; -#endif /* !__GNUC__ */ - return shift; -} +extern int pgstrom_hll_register_bits; +extern void xpupreagg_add_custompath(PlannerInfo *root, + RelOptInfo *input_rel, + RelOptInfo *group_rel, + void *extra, + uint32_t task_kind, + const CustomPathMethods *methods); +extern void ExecFallbackCpuPreAgg(pgstromTaskState *pts, + kern_data_store *kds, + HeapTuple tuple); +extern void pgstrom_init_gpu_preagg(void); /* - * get_next_log2 - * - * It returns N of the least 2^N value that is larger than or equal to - * the supplied value. + * arrow_fdw.c and arrow_read.c */ -static inline int -get_next_log2(Size size) -{ - int shift = 0; - - if (size == 0 || size == 1) - return 0; - size--; -#ifdef __GNUC__ - shift = sizeof(Size) * BITS_PER_BYTE - __builtin_clzl(size); -#else -#if SIZEOF_VOID_P == 8 - if ((size & 0xffffffff00000000UL) != 0) - { - size >>= 32; - shift += 32; - } -#endif - if ((size & 0xffff0000UL) != 0) - { - size >>= 16; - shift += 16; - } - if ((size & 0x0000ff00UL) != 0) - { - size >>= 8; - shift += 8; - } - if ((size & 0x000000f0UL) != 0) - { - size >>= 4; - shift += 4; - } - if ((size & 0x0000000cUL) != 0) - { - size >>= 2; - shift += 2; - } - if ((size & 0x00000002UL) != 0) - { - size >>= 1; - shift += 1; - } - if ((size & 0x00000001UL) != 0) - shift += 1; -#endif /* !__GNUC__ */ - return shift; -} +extern bool baseRelIsArrowFdw(RelOptInfo *baserel); +extern bool RelationIsArrowFdw(Relation frel); +extern const Bitmapset *GetOptimalGpusForArrowFdw(PlannerInfo *root, + RelOptInfo *baserel); +extern const DpuStorageEntry *GetOptimalDpuForArrowFdw(PlannerInfo *root, + RelOptInfo *baserel); +extern bool pgstromArrowFdwExecInit(pgstromTaskState *pts, + List *outer_quals, + const Bitmapset *outer_refs); +extern XpuCommand *pgstromScanChunkArrowFdw(pgstromTaskState *pts, + struct iovec *xcmd_iov, + int *xcmd_iovcnt); +extern void pgstromArrowFdwExecEnd(ArrowFdwState *arrow_state); +extern void pgstromArrowFdwExecReset(ArrowFdwState *arrow_state); +extern void pgstromArrowFdwInitDSM(ArrowFdwState *arrow_state, + pgstromSharedState *ps_state); +extern void pgstromArrowFdwAttachDSM(ArrowFdwState *arrow_state, + pgstromSharedState *ps_state); +extern void pgstromArrowFdwShutdown(ArrowFdwState *arrow_state); +extern void pgstromArrowFdwExplain(ArrowFdwState *arrow_state, + Relation frel, + ExplainState *es, + List *dcontext); +extern bool kds_arrow_fetch_tuple(TupleTableSlot *slot, + kern_data_store *kds, + size_t index, + const Bitmapset *referenced); +extern void pgstrom_init_arrow_fdw(void); /* - * __trim - remove whitespace at the head/tail of cstring + * dpu_device.c */ -static inline char * -__trim(char *token) -{ - char *tail = token + strlen(token) - 1; +extern double pgstrom_dpu_setup_cost; +extern double pgstrom_dpu_operator_cost; +extern double 
pgstrom_dpu_seq_page_cost; +extern double pgstrom_dpu_tuple_cost; +extern bool pgstrom_dpu_handle_cached_pages; +extern double pgstrom_dpu_operator_ratio(void); - while (*token == ' ' || *token == '\t') - token++; - while (tail >= token && (*tail == ' ' || *tail == '\t')) - *tail-- = '\0'; - return token; -} +extern const DpuStorageEntry *GetOptimalDpuForFile(const char *filename, + const char **p_dpu_pathname); +extern const DpuStorageEntry *GetOptimalDpuForBaseRel(PlannerInfo *root, + RelOptInfo *baserel); +extern const DpuStorageEntry *GetOptimalDpuForRelation(Relation relation, + const char **p_dpu_pathname); +extern const char *DpuStorageEntryBaseDir(const DpuStorageEntry *ds_entry); +extern bool DpuStorageEntryIsEqual(const DpuStorageEntry *ds_entry1, + const DpuStorageEntry *ds_entry2); +extern int DpuStorageEntryGetEndpointId(const DpuStorageEntry *ds_entry); +extern const DpuStorageEntry *DpuStorageEntryByEndpointId(int endpoint_id); +extern int DpuStorageEntryCount(void); +extern void DpuClientOpenSession(pgstromTaskState *pts, + const XpuCommand *session); +extern void explainDpuStorageEntry(const DpuStorageEntry *ds_entry, + ExplainState *es); +extern bool pgstrom_init_dpu_device(void); /* - * It translates an alignment character into width + * dpu_scan.c */ -static inline int -typealign_get_width(char type_align) -{ - switch (type_align) - { - case 'c': - return 1; - case 's': - return ALIGNOF_SHORT; - case 'i': - return ALIGNOF_INT; - case 'd': - return ALIGNOF_DOUBLE; - default: - elog(ERROR, "unexpected type alignment: %c", type_align); - } - return -1; /* be compiler quiet */ -} - -#ifndef forfour -/* XXX - PG12 added forfour() macro */ -#define forfour(lc1, list1, lc2, list2, lc3, list3, lc4, list4) \ - for ((lc1) = list_head(list1), (lc2) = list_head(list2), \ - (lc3) = list_head(list3), (lc4) = list_head(list4); \ - (lc1) != NULL && (lc2) != NULL && (lc3) != NULL && \ - (lc4) != NULL; \ - (lc1) = lnext(lc1), (lc2) = lnext(lc2), (lc3) = lnext(lc3),\ - (lc4) = lnext(lc4)) -#endif - -/* XXX - PG10 added lfirst_node() and related */ -#ifndef lfirst_node -#define lfirst_node(T,x) ((T *)lfirst(x)) -#endif -#ifndef linitial_node -#define linitial_node(T,x) ((T *)linitial(x)) -#endif -#ifndef lsecond_node -#define lsecond_node(T,x) ((T *)lsecond(x)) -#endif -#ifndef lthird_node -#define lthird_node(T,x) ((T *)lthird(x)) -#endif - -/* lappend on the specified memory-context */ -static inline List * -lappend_cxt(MemoryContext memcxt, List *list, void *datum) -{ - MemoryContext oldcxt = MemoryContextSwitchTo(memcxt); - List *r; - - r = lappend(list, datum); - MemoryContextSwitchTo(oldcxt); - - return r; -} - -/* initStringInfo on a particular memory context */ -static inline void -initStringInfoContext(StringInfo str, MemoryContext memcxt) -{ - MemoryContext oldcxt = MemoryContextSwitchTo(memcxt); - initStringInfo(str); - MemoryContextSwitchTo(oldcxt); -} - -static inline char * -format_numeric(cl_long value) -{ - if (value > 8000000000000L || value < -8000000000000L) - return psprintf("%.2fT", (double)value / 1000000000000.0); - else if (value > 8000000000L || value < -8000000000L) - return psprintf("%.2fG", (double)value / 1000000000.0); - else if (value > 8000000L || value < -8000000L) - return psprintf("%.2fM", (double)value / 1000000.0); - else if (value > 8000L || value < -8000L) - return psprintf("%.2fK", (double)value / 1000.0); - else - return psprintf("%ld", value); -} - -static inline char * -format_bytesz(Size nbytes) -{ - if (nbytes > (Size)(1UL << 43)) - return
psprintf("%.2fTB", (double)nbytes / (double)(1UL << 40)); - else if (nbytes > (double)(1UL << 33)) - return psprintf("%.2fGB", (double)nbytes / (double)(1UL << 30)); - else if (nbytes > (double)(1UL << 23)) - return psprintf("%.2fMB", (double)nbytes / (double)(1UL << 20)); - else if (nbytes > (double)(1UL << 13)) - return psprintf("%.2fKB", (double)nbytes / (double)(1UL << 10)); - return psprintf("%uB", (unsigned int)nbytes); -} - -static inline char * -format_millisec(double milliseconds) -{ - if (milliseconds > 300000.0) /* more then 5min */ - return psprintf("%.2fmin", milliseconds / 60000.0); - else if (milliseconds > 8000.0) /* more than 8sec */ - return psprintf("%.2fsec", milliseconds / 1000.0); - return psprintf("%.2fms", milliseconds); -} - -static inline const char * -__basename(const char *filename) -{ - const char *pos = strrchr(filename, '/'); - - return pos ? pos + 1 : filename; -} +extern CustomPathMethods dpuscan_path_methods; +extern void pgstrom_init_dpu_scan(void); /* - * merge two dlist_head + * dpu_join.c */ -static inline void -dlist_append_tail(dlist_head *base, dlist_head *items) -{ - if (dlist_is_empty(items)) - return; - items->head.next->prev = base->head.prev; - items->head.prev->next = &base->head; - base->head.prev->next = items->head.next; - base->head.prev = items->head.prev; -} +extern bool pgstrom_enable_dpujoin; +extern bool pgstrom_enable_dpuhashjoin; +extern bool pgstrom_enable_dpugistindex; +extern void pgstrom_init_dpu_join(void); /* - * Some usuful memory allocation wrapper + * dpu_preagg.c */ -#define palloc_huge(sz) MemoryContextAllocHuge(CurrentMemoryContext,(sz)) -static inline void * -pmemdup(const void *src, Size sz) -{ - void *dst = palloc(sz); - - memcpy(dst, src, sz); - - return dst; -} +extern void pgstrom_init_dpu_preagg(void); /* - * simple wrapper for pthread_mutex_lock + * misc.c */ -static inline void -pthreadMutexInit(pthread_mutex_t *mutex, int pshared) -{ - pthread_mutexattr_t mattr; - - if ((errno = pthread_mutexattr_init(&mattr)) != 0) - wfatal("failed on pthread_mutexattr_init: %m"); - if ((errno = pthread_mutexattr_setpshared(&mattr, pshared)) != 0) - wfatal("failed on pthread_mutexattr_setpshared: %m"); - if ((errno = pthread_mutex_init(mutex, &mattr)) != 0) - wfatal("failed on pthread_mutex_init: %m"); - if ((errno = pthread_mutexattr_destroy(&mattr)) != 0) - wfatal("failed on pthread_mutexattr_destroy: %m"); -} - -static inline void -pthreadMutexLock(pthread_mutex_t *mutex) -{ - if ((errno = pthread_mutex_lock(mutex)) != 0) - wfatal("failed on pthread_mutex_lock: %m"); -} - -static inline bool -pthreadMutexLockTimeout(pthread_mutex_t *mutex, cl_ulong timeout_ms) -{ - struct timespec tm; - - if (clock_gettime(CLOCK_REALTIME, &tm) != 0) - wfatal("failed on clock_gettime: %m"); - tm.tv_sec += (timeout_ms / 1000); - tm.tv_nsec += (timeout_ms % 1000) * 1000000; - if (tm.tv_nsec >= 1000000000L) - { - tm.tv_sec += tm.tv_nsec / 1000000000L; - tm.tv_nsec = tm.tv_nsec % 1000000000L; - } - - errno = pthread_mutex_timedlock(mutex, &tm); - if (errno == ETIMEDOUT) - return false; - else if (errno != 0) - wfatal("failed on pthread_mutex_timedlock: %m"); - return true; -} - -static inline void -pthreadMutexUnlock(pthread_mutex_t *mutex) -{ - if ((errno = pthread_mutex_unlock(mutex)) != 0) - wfatal("failed on pthread_mutex_unlock: %m"); -} - -static inline void -pthreadRWLockInit(pthread_rwlock_t *rwlock) -{ - pthread_rwlockattr_t rwattr; - - if ((errno = pthread_rwlockattr_init(&rwattr)) != 0) - wfatal("failed on pthread_rwlockattr_init: 
%m"); - if ((errno = pthread_rwlockattr_setpshared(&rwattr, 1)) != 0) - wfatal("failed on pthread_rwlockattr_setpshared: %m"); - if ((errno = pthread_rwlock_init(rwlock, &rwattr)) != 0) - wfatal("failed on pthread_rwlock_init: %m"); -} - -static inline void -pthreadRWLockReadLock(pthread_rwlock_t *rwlock) -{ - if ((errno = pthread_rwlock_rdlock(rwlock)) != 0) - wfatal("failed on pthread_rwlock_rdlock: %m"); -} - -static inline void -pthreadRWLockWriteLock(pthread_rwlock_t *rwlock) -{ - if ((errno = pthread_rwlock_wrlock(rwlock)) != 0) - wfatal("failed on pthread_rwlock_wrlock: %m"); -} - -static inline bool -pthreadRWLockWriteTryLock(pthread_rwlock_t *rwlock) -{ - if ((errno = pthread_rwlock_trywrlock(rwlock)) == 0) - return true; - if (errno != EBUSY) - wfatal("failed on pthread_rwlock_trywrlock: %m"); - return false; -} - -static inline void -pthreadRWLockUnlock(pthread_rwlock_t *rwlock) -{ - if ((errno = pthread_rwlock_unlock(rwlock)) != 0) - wfatal("failed on pthread_rwlock_unlock: %m"); -} - -static inline void -pthreadCondInit(pthread_cond_t *cond, int pshared) -{ - pthread_condattr_t condattr; - - if ((errno = pthread_condattr_init(&condattr)) != 0) - wfatal("failed on pthread_condattr_init: %m"); - if ((errno = pthread_condattr_setpshared(&condattr, pshared)) != 0) - wfatal("failed on pthread_condattr_setpshared: %m"); - if ((errno = pthread_cond_init(cond, &condattr)) != 0) - wfatal("failed on pthread_cond_init: %m"); - if ((errno = pthread_condattr_destroy(&condattr)) != 0) - wfatal("failed on pthread_condattr_destroy: %m"); -} - -static inline void -pthreadCondWait(pthread_cond_t *cond, pthread_mutex_t *mutex) -{ - if ((errno = pthread_cond_wait(cond, mutex)) != 0) - wfatal("failed on pthread_cond_wait: %m"); -} - -static inline bool -pthreadCondWaitTimeout(pthread_cond_t *cond, pthread_mutex_t *mutex, - long timeout_ms) -{ - struct timespec tm; - - clock_gettime(CLOCK_REALTIME, &tm); - tm.tv_sec += timeout_ms / 1000; - tm.tv_nsec += (timeout_ms % 1000) * 1000000; - if (tm.tv_nsec > 1000000000) - { - tm.tv_sec += tm.tv_nsec / 1000000000; - tm.tv_nsec = tm.tv_nsec % 1000000000; - } - - errno = pthread_cond_timedwait(cond, mutex, &tm); - if (errno == 0) - return true; - else if (errno == ETIMEDOUT) - return false; - wfatal("failed on pthread_cond_timedwait: %m"); -} +extern Node *fixup_varnode_to_origin(Node *node, List *cscan_tlist); +extern int __appendBinaryStringInfo(StringInfo buf, + const void *data, int datalen); +extern int __appendZeroStringInfo(StringInfo buf, int nbytes); +extern char *get_type_name(Oid type_oid, bool missing_ok); +extern Oid get_relation_am(Oid rel_oid, bool missing_ok); +extern List *bms_to_pglist(const Bitmapset *bms); +extern Bitmapset *bms_from_pglist(List *pglist); +extern Float *__makeFloat(double fval); +extern Const *__makeByteaConst(bytea *data); +extern bytea *__getByteaConst(Const *con); +extern ssize_t __readFile(int fdesc, void *buffer, size_t nbytes); +extern ssize_t __preadFile(int fdesc, void *buffer, size_t nbytes, off_t f_pos); +extern ssize_t __writeFile(int fdesc, const void *buffer, size_t nbytes); +extern ssize_t __pwriteFile(int fdesc, const void *buffer, size_t nbytes, off_t f_pos); -static inline void -pthreadCondBroadcast(pthread_cond_t *cond) -{ - if ((errno = pthread_cond_broadcast(cond)) != 0) - wfatal("failed on pthread_cond_broadcast: %m"); -} +extern uint32_t __shmemCreate(const DpuStorageEntry *ds_entry); +extern void __shmemDrop(uint32_t shmem_handle); +extern void *__mmapShmem(uint32_t shmem_handle, + size_t 
shmem_length, + const DpuStorageEntry *ds_entry); +extern bool __munmapShmem(void *mmap_addr); -static inline void -pthreadCondSignal(pthread_cond_t *cond) -{ - if ((errno = pthread_cond_signal(cond)) != 0) - wfatal("failed on pthread_cond_signal: %m"); -} +extern Path *pgstrom_copy_pathnode(const Path *pathnode); /* - * utility to calculate time diff + * main.c */ -#define TV_DIFF(tv2,tv1) \ - (((double)(tv2.tv_sec - tv1.tv_sec) * 1000000.0 + \ - (double)(tv2.tv_usec - tv1.tv_usec)) / 1000.0) -#define TP_DIFF(tp2,tp1) \ - ((tp2.tv_sec - tp1.tv_sec) * 1000000000UL + (tp2.tv_nsec - tp1.tv_nsec)) +extern bool pgstrom_enabled; +extern bool pgstrom_cpu_fallback_enabled; +extern bool pgstrom_regression_test_mode; +extern int pgstrom_max_async_tasks; +extern CustomPath *custom_path_find_cheapest(PlannerInfo *root, + RelOptInfo *rel, + bool parallel_aware, + uint32_t devkind); +extern bool custom_path_remember(PlannerInfo *root, + RelOptInfo *rel, + bool parallel_aware, + uint32_t devkind, + const CustomPath *cpath); +extern Path *pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath); +extern void _PG_init(void); #endif /* PG_STROM_H */ diff --git a/next/pg_utils.h b/src/pg_utils.h similarity index 100% rename from next/pg_utils.h rename to src/pg_utils.h diff --git a/src/relscan.c b/src/relscan.c index 70219efe6..6e49914c5 100644 --- a/src/relscan.c +++ b/src/relscan.c @@ -1,638 +1,37 @@ /* * relscan.c * - * Common routines related to relation scan + * Routines related to outer relation scan * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. */ #include "pg_strom.h" -/* Data structure for collecting qual clauses that match an index */ -typedef struct -{ - bool nonempty; /* True if lists are not all empty */ - /* Lists of RestrictInfos, one per index column */ - List *indexclauses[INDEX_MAX_KEYS]; -} IndexClauseSet; - -/*--- static variables ---*/ -static bool pgstrom_enable_brin; - -/* - * simple_match_clause_to_indexcol +/* ---------------------------------------------------------------- * - * It is a simplified version of match_clause_to_indexcol. 
- * Also see optimizer/path/indxpath.c - */ -static bool -simple_match_clause_to_indexcol(IndexOptInfo *index, - int indexcol, - RestrictInfo *rinfo) -{ - Expr *clause = rinfo->clause; - Index index_relid = index->rel->relid; - Oid opfamily = index->opfamily[indexcol]; - Oid idxcollation = index->indexcollations[indexcol]; - Node *leftop; - Node *rightop; - Relids left_relids; - Relids right_relids; - Oid expr_op; - Oid expr_coll; - - /* Clause must be a binary opclause */ - if (!is_opclause(clause)) - return false; - - leftop = get_leftop(clause); - rightop = get_rightop(clause); - if (!leftop || !rightop) - return false; - left_relids = rinfo->left_relids; - right_relids = rinfo->right_relids; - expr_op = ((OpExpr *) clause)->opno; - expr_coll = ((OpExpr *) clause)->inputcollid; - - if (OidIsValid(idxcollation) && idxcollation != expr_coll) - return false; - - /* - * Check for clauses of the form: - * (indexkey operator constant) OR - * (constant operator indexkey) - */ - if (match_index_to_operand(leftop, indexcol, index) && - !bms_is_member(index_relid, right_relids) && - !contain_volatile_functions(rightop) && - op_in_opfamily(expr_op, opfamily)) - return true; - - if (match_index_to_operand(rightop, indexcol, index) && - !bms_is_member(index_relid, left_relids) && - !contain_volatile_functions(leftop) && - op_in_opfamily(get_commutator(expr_op), opfamily)) - return true; - - return false; -} - -/* - * simple_match_clause_to_index + * Routines to support optimization / path or plan construction * - * It is a simplified version of match_clause_to_index. - * Also see optimizer/path/indxpath.c - */ -static void -simple_match_clause_to_index(IndexOptInfo *index, - RestrictInfo *rinfo, - IndexClauseSet *clauseset) -{ - int indexcol; - - /* - * Never match pseudoconstants to indexes. (Normally a match could not - * happen anyway, since a pseudoconstant clause couldn't contain a Var, - * but what if someone builds an expression index on a constant? It's not - * totally unreasonable to do so with a partial index, either.) - */ - if (rinfo->pseudoconstant) - return; - - /* - * If clause can't be used as an indexqual because it must wait till after - * some lower-security-level restriction clause, reject it. - */ - if (!restriction_is_securely_promotable(rinfo, index->rel)) - return; - - /* OK, check each index column for a match */ - for (indexcol = 0; indexcol < index->ncolumns; indexcol++) - { - if (simple_match_clause_to_indexcol(index, - indexcol, - rinfo)) - { - clauseset->indexclauses[indexcol] = - list_append_unique_ptr(clauseset->indexclauses[indexcol], - rinfo); - clauseset->nonempty = true; - break; - } - } -} - -/* - * estimate_brinindex_scan_nblocks - * - * Also see brincostestimate at utils/adt/selfuncs.c - */ -static cl_long -estimate_brinindex_scan_nblocks(PlannerInfo *root, - RelOptInfo *baserel, - IndexOptInfo *index, - IndexClauseSet *clauseset, - List **p_indexQuals) -{ - Relation indexRel; - BrinStatsData statsData; - List *indexQuals = NIL; - ListCell *lc __attribute__((unused)); - int icol __attribute__((unused)); - Selectivity qualSelectivity; - Selectivity indexSelectivity; - double indexCorrelation = 0.0; - double indexRanges; - double minimalRanges; - double estimatedRanges; - - /* Obtain some data from the index itself. 
*/ - indexRel = index_open(index->indexoid, AccessShareLock); - brinGetStats(indexRel, &statsData); - index_close(indexRel, AccessShareLock); - - /* Get selectivity of the index qualifiers */ - icol = 1; - foreach (lc, index->indextlist) - { - TargetEntry *tle = lfirst(lc); - ListCell *cell; - VariableStatData vardata; - - foreach (cell, clauseset->indexclauses[icol-1]) - { - RestrictInfo *rinfo = lfirst(cell); - - indexQuals = lappend(indexQuals, rinfo); - } - - if (IsA(tle->expr, Var)) - { - Var *var = (Var *) tle->expr; - RangeTblEntry *rte; - - /* in case of BRIN index on simple column */ - rte = root->simple_rte_array[var->varno]; - if (get_relation_stats_hook && - (*get_relation_stats_hook)(root, rte, var->varattno, - &vardata)) - { - if (HeapTupleIsValid(vardata.statsTuple) && !vardata.freefunc) - elog(ERROR, "no callback to release stats variable"); - } - else - { - vardata.statsTuple = - SearchSysCache3(STATRELATTINH, - ObjectIdGetDatum(rte->relid), - Int16GetDatum(var->varattno), - BoolGetDatum(false)); - vardata.freefunc = ReleaseSysCache; - } - } - else - { - if (get_index_stats_hook && - (*get_index_stats_hook)(root, index->indexoid, icol, - &vardata)) - { - if (HeapTupleIsValid(vardata.statsTuple) && !vardata.freefunc) - elog(ERROR, "no callback to release stats variable"); - } - else - { - vardata.statsTuple - = SearchSysCache3(STATRELATTINH, - ObjectIdGetDatum(index->indexoid), - Int16GetDatum(icol), - BoolGetDatum(false)); - vardata.freefunc = ReleaseSysCache; - } - } - - if (HeapTupleIsValid(vardata.statsTuple)) - { - AttStatsSlot sslot; - - if (get_attstatsslot(&sslot, vardata.statsTuple, - STATISTIC_KIND_CORRELATION, - InvalidOid, - ATTSTATSSLOT_NUMBERS)) - { - double varCorrelation = 0.0; - - if (sslot.nnumbers > 0) - varCorrelation = Abs(sslot.numbers[0]); - - if (varCorrelation > indexCorrelation) - indexCorrelation = varCorrelation; - - free_attstatsslot(&sslot); - } - } - ReleaseVariableStats(vardata); - - icol++; - } - qualSelectivity = clauselist_selectivity(root, - indexQuals, - baserel->relid, - JOIN_INNER, - NULL); - - /* estimate number of blocks to read */ - indexRanges = ceil((double) baserel->pages / statsData.pagesPerRange); - if (indexRanges < 1.0) - indexRanges = 1.0; - minimalRanges = ceil(indexRanges * qualSelectivity); - - //elog(INFO, "strom: qualSelectivity=%.6f indexRanges=%.6f minimalRanges=%.6f indexCorrelation=%.6f", qualSelectivity, indexRanges, minimalRanges, indexCorrelation); - - if (indexCorrelation < 1.0e-10) - estimatedRanges = indexRanges; - else - estimatedRanges = Min(minimalRanges / indexCorrelation, indexRanges); - - indexSelectivity = estimatedRanges / indexRanges; - if (indexSelectivity < 0.0) - indexSelectivity = 0.0; - if (indexSelectivity > 1.0) - indexSelectivity = 1.0; - - /* index quals, if any */ - if (p_indexQuals) - *p_indexQuals = indexQuals; - /* estimated number of blocks to read */ - return (cl_long)(indexSelectivity * (double) baserel->pages); -} - -/* - * extract_index_conditions - */ -static Node * -__fixup_indexqual_operand(Node *node, IndexOptInfo *indexOpt) -{ - ListCell *lc; - - if (!node) - return NULL; - - if (IsA(node, RelabelType)) - { - RelabelType *relabel = (RelabelType *) node; - - return __fixup_indexqual_operand((Node *)relabel->arg, indexOpt); - } - - foreach (lc, indexOpt->indextlist) - { - TargetEntry *tle = lfirst(lc); - - if (equal(node, tle->expr)) - { - return (Node *)makeVar(INDEX_VAR, - tle->resno, - exprType((Node *)tle->expr), - exprTypmod((Node *) tle->expr), - exprCollation((Node *) 
tle->expr), - 0); - } - } - if (IsA(node, Var)) - elog(ERROR, "Bug? variable is not found at index tlist"); - return expression_tree_mutator(node, __fixup_indexqual_operand, indexOpt); -} - -static List * -extract_index_conditions(List *index_quals, IndexOptInfo *indexOpt) -{ - List *result = NIL; - ListCell *lc; - - foreach (lc, index_quals) - { - RestrictInfo *rinfo = lfirst(lc); - OpExpr *op = (OpExpr *) rinfo->clause; - - if (!IsA(rinfo->clause, OpExpr)) - elog(ERROR, "Bug? unexpected index clause: %s", - nodeToString(rinfo->clause)); - if (list_length(((OpExpr *)rinfo->clause)->args) != 2) - elog(ERROR, "indexqual clause must be binary opclause"); - op = (OpExpr *)copyObject(rinfo->clause); - if (!bms_equal(rinfo->left_relids, indexOpt->rel->relids)) - CommuteOpExpr(op); - /* replace the indexkey expression with an index Var */ - linitial(op->args) = __fixup_indexqual_operand(linitial(op->args), - indexOpt); - result = lappend(result, op); - } - return result; -} - -/* - * pgstrom_tryfind_brinindex - */ -IndexOptInfo * -pgstrom_tryfind_brinindex(PlannerInfo *root, - RelOptInfo *baserel, - List **p_indexConds, - List **p_indexQuals, - cl_long *p_indexNBlocks) -{ - cl_long indexNBlocks = LONG_MAX; - IndexOptInfo *indexOpt = NULL; - List *indexQuals = NIL; - ListCell *cell; - - /* skip if GUC disables BRIN-index */ - if (!pgstrom_enable_brin) - return NULL; - - /* skip if no indexes */ - if (baserel->indexlist == NIL) - return NULL; - - foreach (cell, baserel->indexlist) - { - IndexOptInfo *index = (IndexOptInfo *) lfirst(cell); - List *temp = NIL; - ListCell *lc; - cl_long nblocks; - IndexClauseSet clauseset; - - /* Protect limited-size array in IndexClauseSets */ - Assert(index->ncolumns <= INDEX_MAX_KEYS); - - /* Ignore partial indexes that do not match the query. */ - if (index->indpred != NIL && !index->predOK) - continue; - - /* Only BRIN-indexes are now supported */ - if (index->relam != BRIN_AM_OID) - continue; - - /* see match_clauses_to_index */ - memset(&clauseset, 0, sizeof(IndexClauseSet)); - foreach (lc, index->indrestrictinfo) - { - RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc); - - simple_match_clause_to_index(index, rinfo, &clauseset); - } - if (!clauseset.nonempty) - continue; - - /* - * In case when multiple BRIN-indexes are configured, - * the one with minimal selectivity is the best choice. 
- */ - nblocks = estimate_brinindex_scan_nblocks(root, baserel, - index, - &clauseset, - &temp); - if (indexNBlocks > nblocks) - { - indexOpt = index; - indexQuals = temp; - indexNBlocks = nblocks; - } - } - - if (indexOpt) - { - if (p_indexConds) - *p_indexConds = extract_index_conditions(indexQuals, indexOpt); - if (p_indexQuals) - *p_indexQuals = indexQuals; - if (p_indexNBlocks) - *p_indexNBlocks = indexNBlocks; - } - return indexOpt; -} - -/* - * pgstrom_common_relscan_cost - */ -int -pgstrom_common_relscan_cost(PlannerInfo *root, - RelOptInfo *scan_rel, - List *scan_quals, - int parallel_workers, - IndexOptInfo *indexOpt, - List *indexQuals, - cl_long indexNBlocks, - double *p_parallel_divisor, - double *p_scan_ntuples, - double *p_scan_nchunks, - cl_uint *p_nrows_per_block, - Cost *p_startup_cost, - Cost *p_run_cost) -{ - int scan_mode = 0; - Cost startup_cost = 0.0; - Cost run_cost = 0.0; - Cost index_scan_cost = 0.0; - Cost disk_scan_cost = 0.0; - double gpu_ratio = pgstrom_gpu_operator_cost / cpu_operator_cost; - double parallel_divisor; - double ntuples = scan_rel->tuples; - double nblocks = scan_rel->pages; - double nchunks; - double selectivity; - double spc_seq_page_cost; - double spc_rand_page_cost; - cl_uint nrows_per_block = 0; - Size heap_size; - Size htup_size; - QualCost qcost; - ListCell *lc; - - Assert((scan_rel->reloptkind == RELOPT_BASEREL || - scan_rel->reloptkind == RELOPT_OTHER_MEMBER_REL) && - scan_rel->relid > 0 && - scan_rel->relid < root->simple_rel_array_size); - /* mark if special storage layer */ - if (baseRelIsArrowFdw(scan_rel)) - scan_mode |= PGSTROM_RELSCAN_ARROW_FDW; - if (baseRelHasGpuCache(root, scan_rel)) - scan_mode |= PGSTROM_RELSCAN_GPU_CACHE; - - /* selectivity of device executable qualifiers */ - selectivity = clauselist_selectivity(root, - scan_quals, - scan_rel->relid, - JOIN_INNER, - NULL); - /* cost of full-table scan, if not gpu memory store */ - if ((scan_mode & PGSTROM_RELSCAN_GPU_CACHE) == 0) - { - get_tablespace_page_costs(scan_rel->reltablespace, - &spc_rand_page_cost, - &spc_seq_page_cost); - disk_scan_cost = spc_seq_page_cost * nblocks; - } - - /* consideration for BRIN-index, if any */ - if (indexOpt) - { - BrinStatsData statsData; - Relation index_rel; - Cost x; - - index_rel = index_open(indexOpt->indexoid, AccessShareLock); - brinGetStats(index_rel, &statsData); - index_close(index_rel, AccessShareLock); - - get_tablespace_page_costs(indexOpt->reltablespace, - &spc_rand_page_cost, - &spc_seq_page_cost); - index_scan_cost = spc_seq_page_cost * statsData.revmapNumPages; - foreach (lc, indexQuals) - { - cost_qual_eval_node(&qcost, (Node *)lfirst(lc), root); - index_scan_cost += qcost.startup + qcost.per_tuple; - } - - x = index_scan_cost + spc_rand_page_cost * (double)indexNBlocks; - if (disk_scan_cost > x) - { - disk_scan_cost = x; - ntuples = scan_rel->tuples * ((double) indexNBlocks / nblocks); - nblocks = indexNBlocks; - scan_mode |= PGSTROM_RELSCAN_BRIN_INDEX; - } - } - - /* check whether NVMe-Strom is capable */ - if (ScanPathWillUseNvmeStrom(root, scan_rel)) - scan_mode |= PGSTROM_RELSCAN_SSD2GPU; - - /* - * Cost adjustment by CPU parallelism, if used. 
- * (overall logic is equivalent to cost_seqscan()) - */ - if (parallel_workers > 0) - { - parallel_divisor = (double) parallel_workers; -#if PG_VERSION_NUM >= 110000 - if (parallel_leader_participation) -#endif - { - double leader_contribution; - - leader_contribution = 1.0 - (0.3 * (double) parallel_workers); - if (leader_contribution > 0) - parallel_divisor += leader_contribution; - } - /* number of tuples to be actually processed */ - ntuples = clamp_row_est(ntuples / parallel_divisor); - - /* - * After the v2.0, pg_strom.gpu_setup_cost represents the cost for - * run-time code build by NVRTC. Once binary is constructed, it can - * be shared with all the worker process, so we can discount the - * cost by parallel_divisor. - */ - startup_cost += pgstrom_gpu_setup_cost / 2 - + (pgstrom_gpu_setup_cost / (2 * parallel_divisor)); - } - else - { - parallel_divisor = 1.0; - startup_cost += pgstrom_gpu_setup_cost; - } - /* - * Cost discount for more efficient I/O with multiplexing. - * PG background workers can issue read request to filesystem - * concurrently. It enables to work I/O subsystem during blocking- - * time for other workers, then, it pulls up usage ratio of the - * storage system. - */ - disk_scan_cost /= Min(2.0, sqrt(parallel_divisor)); - - /* more disk i/o discount if NVMe-Strom is available */ - if ((scan_mode & PGSTROM_RELSCAN_SSD2GPU) != 0) - disk_scan_cost /= 1.5; - run_cost += disk_scan_cost; - - /* - * Rough estimation for number of chunks if KDS_FORMAT_ROW. - * Also note that we roughly assume KDS_HeadSz is BLCKSZ to - * reduce estimation cycle. - */ - heap_size = (double)(BLCKSZ - SizeOfPageHeaderData) * nblocks; - htup_size = (MAXALIGN(offsetof(HeapTupleHeaderData, - t_bits[BITMAPLEN(scan_rel->max_attr)])) + - MAXALIGN(heap_size / Max(scan_rel->tuples, 1.0) - - sizeof(ItemIdData) - SizeofHeapTupleHeader)); - nchunks = (((double)(offsetof(kern_tupitem, htup) + htup_size + - sizeof(cl_uint)) * Max(ntuples, 1.0)) / - ((double)(pgstrom_chunk_size() - BLCKSZ))); - nchunks = Max(nchunks, 1); - - /* - * estimation of the tuple density per block - this logic follows - * the manner in estimate_rel_size() - */ - if (scan_rel->pages > 0) - nrows_per_block = ceil(scan_rel->tuples / (double)scan_rel->pages); - else - { - RangeTblEntry *rte = root->simple_rte_array[scan_rel->relid]; - size_t tuple_width = get_relation_data_width(rte->relid, NULL); - - tuple_width += MAXALIGN(SizeofHeapTupleHeader); - tuple_width += sizeof(ItemIdData); - /* note: integer division is intentional here */ - nrows_per_block = (BLCKSZ - SizeOfPageHeaderData) / tuple_width; - } - - /* Cost for GPU qualifiers */ - cost_qual_eval_node(&qcost, (Node *)scan_quals, root); - startup_cost += qcost.startup; - run_cost += qcost.per_tuple * gpu_ratio * ntuples; - ntuples *= selectivity; - - /* Cost for DMA transfer (host/storage --> GPU) */ - run_cost += pgstrom_gpu_dma_cost * nchunks; - - *p_parallel_divisor = parallel_divisor; - *p_scan_ntuples = ntuples / parallel_divisor; - *p_scan_nchunks = nchunks / parallel_divisor; - *p_nrows_per_block = - ((scan_mode & PGSTROM_RELSCAN_SSD2GPU) != 0 ? nrows_per_block : 0); - *p_startup_cost = startup_cost; - *p_run_cost = run_cost; - - return scan_mode; -} - -/* - * pgstrom_pullup_outer_refs - * - * setup outer_refs bitmap according to the attr_needed of RelOptInfo. - * If base_rel is a partition leaf, we have to look at parent relation - * instead. 
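Both the old pgstrom_pullup_outer_refs() and its successor pickup_outer_referenced() below lean on the standard PostgreSQL idiom for keeping possibly-negative attribute numbers in a Bitmapset: every attnum is shifted by FirstLowInvalidHeapAttributeNumber, so that system columns such as ctid (attnum -1) become small non-negative members. A minimal sketch of that mapping, for reference only (remember_column() is a hypothetical helper, not part of the patch):

    #include "postgres.h"
    #include "access/sysattr.h"     /* FirstLowInvalidHeapAttributeNumber */
    #include "nodes/bitmapset.h"

    static Bitmapset *
    remember_column(Bitmapset *referenced, AttrNumber anum)
    {
        /* e.g. anum = -1 (ctid) maps to a small non-negative member */
        int     member = anum - FirstLowInvalidHeapAttributeNumber;

        return bms_add_member(referenced, member);
    }
    /* the attribute number is recovered later by the inverse:
     *   anum = member + FirstLowInvalidHeapAttributeNumber
     */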
+ * ---------------------------------------------------------------- */ Bitmapset * -pgstrom_pullup_outer_refs(PlannerInfo *root, - RelOptInfo *base_rel, - Bitmapset *referenced) +pickup_outer_referenced(PlannerInfo *root, + RelOptInfo *base_rel, + Bitmapset *referenced) { ListCell *lc; - int i, j, k; + int j, k; if (base_rel->reloptkind == RELOPT_BASEREL) { - for (i=base_rel->min_attr, j=0; i <= base_rel->max_attr; i++, j++) + for (j=base_rel->min_attr; j <= base_rel->max_attr; j++) { - if (i < 0 || base_rel->attr_needed[j] == NULL) + if (j <= 0 || !base_rel->attr_needed[j - base_rel->min_attr]) continue; - k = i - FirstLowInvalidHeapAttributeNumber; + k = j - FirstLowInvalidHeapAttributeNumber; referenced = bms_add_member(referenced, k); } } @@ -649,1574 +48,871 @@ pgstrom_pullup_outer_refs(PlannerInfo *root, continue; Assert(apinfo->parent_relid < root->simple_rel_array_size); parent_rel = root->simple_rel_array[apinfo->parent_relid]; - parent_refs = pgstrom_pullup_outer_refs(root, parent_rel, NULL); + parent_refs = pickup_outer_referenced(root, parent_rel, NULL); for (k = bms_next_member(parent_refs, -1); k >= 0; k = bms_next_member(parent_refs, k)) { - i = k + FirstLowInvalidHeapAttributeNumber; - if (i <= 0) + j = k + FirstLowInvalidHeapAttributeNumber; + if (j <= 0) bms_add_member(referenced, k); - if (i > list_length(apinfo->translated_vars)) + else if (j > list_length(apinfo->translated_vars)) elog(ERROR, "Bug? column reference out of range"); - var = list_nth(apinfo->translated_vars, i-1); - Assert(IsA(var, Var)); - j = var->varattno - FirstLowInvalidHeapAttributeNumber; - referenced = bms_add_member(referenced, j); + else + { + var = list_nth(apinfo->translated_vars, j-1); + Assert(IsA(var, Var)); + j = var->varattno - FirstLowInvalidHeapAttributeNumber; + referenced = bms_add_member(referenced, j); + } } break; } - if (lc == NULL) + if (!lc) elog(ERROR, "Bug? AppendRelInfo not found (relid=%u)", base_rel->relid); } else { - elog(ERROR, "Bug? outer is not a simple relation"); + elog(ERROR, "Bug? 
outer relation is not a simple relation"); } return referenced; } /* ---------------------------------------------------------------- * - * GPUDirectSQL related routines + * Routines to setup kern_data_store * * ---------------------------------------------------------------- */ -typedef struct +static int +count_num_of_subfields(Oid type_oid) { - Oid tablespace_oid; - bool is_valid; - Bitmapset optimal_gpus; -} tablespace_optimal_gpu_hentry; - -static HTAB *tablespace_optimal_gpu_htable = NULL; + TypeCacheEntry *tcache; + int j, count = 0; -static void -tablespace_optimal_gpu_cache_callback(Datum arg, int cacheid, uint32 hashvalue) -{ - /* invalidate all the cached status */ - if (tablespace_optimal_gpu_htable) + tcache = lookup_type_cache(type_oid, TYPECACHE_TUPDESC); + if (OidIsValid(tcache->typelem) && tcache->typlen == -1) { - hash_destroy(tablespace_optimal_gpu_htable); - tablespace_optimal_gpu_htable = NULL; + /* array type */ + count = 1 + count_num_of_subfields(tcache->typelem); } -} - -/* - * GetOptimalGpusForTablespace - */ -static const Bitmapset * -GetOptimalGpusForTablespace(Oid tablespace_oid) -{ - tablespace_optimal_gpu_hentry *hentry; - bool found; - - if (!pgstrom_gpudirect_enabled()) - return NULL; - - if (!OidIsValid(tablespace_oid)) - tablespace_oid = MyDatabaseTableSpace; - - if (!tablespace_optimal_gpu_htable) + else if (tcache->tupDesc) { - HASHCTL hctl; - int nwords = (numDevAttrs / BITS_PER_BITMAPWORD) + 1; - - memset(&hctl, 0, sizeof(HASHCTL)); - hctl.keysize = sizeof(Oid); - hctl.entrysize = MAXALIGN(offsetof(tablespace_optimal_gpu_hentry, - optimal_gpus.words[nwords])); - tablespace_optimal_gpu_htable - = hash_create("TablespaceOptimalGpu", 128, - &hctl, HASH_ELEM | HASH_BLOBS); - } - hentry = (tablespace_optimal_gpu_hentry *) - hash_search(tablespace_optimal_gpu_htable, - &tablespace_oid, - HASH_ENTER, - &found); - if (!found || !hentry->is_valid) - { - char *pathname; - File filp; - Bitmapset *optimal_gpus; - - Assert(hentry->tablespace_oid == tablespace_oid); + /* composite type */ + TupleDesc tupdesc = tcache->tupDesc; - pathname = GetDatabasePath(MyDatabaseId, tablespace_oid); - filp = PathNameOpenFile(pathname, O_RDONLY); - if (filp < 0) + for (j=0; j < tupdesc->natts; j++) { - elog(WARNING, "failed on open('%s') of tablespace %u: %m", - pathname, tablespace_oid); - return NULL; - } - optimal_gpus = extraSysfsLookupOptimalGpus(filp); - if (!optimal_gpus) - hentry->optimal_gpus.nwords = 0; - else - { - Assert(optimal_gpus->nwords <= (numDevAttrs / BITS_PER_BITMAPWORD) + 1); - memcpy(&hentry->optimal_gpus, optimal_gpus, - offsetof(Bitmapset, words[optimal_gpus->nwords])); - bms_free(optimal_gpus); + Form_pg_attribute attr = TupleDescAttr(tupdesc, j); + + count += count_num_of_subfields(attr->atttypid); } - FileClose(filp); - hentry->is_valid = true; } - Assert(hentry->is_valid); - return (hentry->optimal_gpus.nwords > 0 ? 
&hentry->optimal_gpus : NULL); + return count; } -const Bitmapset * -GetOptimalGpusForRelation(PlannerInfo *root, RelOptInfo *rel) -{ - RangeTblEntry *rte; - HeapTuple tup; - char relpersistence; - const Bitmapset *optimal_gpus; - - if (baseRelIsArrowFdw(rel)) +static void +__setup_kern_colmeta(kern_data_store *kds, + int column_index, + const char *attname, + int attnum, + bool attbyval, + char attalign, + int16 attlen, + Oid atttypid, + int atttypmod, + int *p_attcacheoff) +{ + kern_colmeta *cmeta = &kds->colmeta[column_index]; + TypeCacheEntry *tcache; + + memset(cmeta, 0, sizeof(kern_colmeta)); + cmeta->attbyval = attbyval; + cmeta->attalign = typealign_get_width(attalign); + cmeta->attlen = attlen; + if (attlen == 0 || attlen < -1) + elog(ERROR, "attribute %s has unexpected length (%d)", attname, attlen); + else if (attlen == -1) + kds->has_varlena = true; + cmeta->attnum = attnum; + + if (!p_attcacheoff || *p_attcacheoff < 0) + cmeta->attcacheoff = -1; + else if (attlen > 0) + { + cmeta->attcacheoff = att_align_nominal(*p_attcacheoff, attalign); + *p_attcacheoff = cmeta->attcacheoff + attlen; + } + else if (attlen == -1) { - if (pgstrom_gpudirect_enabled()) - return GetOptimalGpusForArrowFdw(root, rel); - return NULL; - } + /* + * Note that attcacheoff is also available on varlena datum + * only if it appeared at the first, and its offset is aligned. + * Elsewhere, we cannot utilize the attcacheoff for varlena + */ + uint32_t __off = att_align_nominal(*p_attcacheoff, attalign); - optimal_gpus = GetOptimalGpusForTablespace(rel->reltablespace); - if (!bms_is_empty(optimal_gpus)) - { - /* only permanent / unlogged table can use NVMe-Strom */ - rte = root->simple_rte_array[rel->relid]; - tup = SearchSysCache1(RELOID, ObjectIdGetDatum(rte->relid)); - if (!HeapTupleIsValid(tup)) - elog(ERROR, "cache lookup failed for relation %u", rte->relid); - relpersistence = ((Form_pg_class) GETSTRUCT(tup))->relpersistence; - ReleaseSysCache(tup); - - if (relpersistence == RELPERSISTENCE_PERMANENT || - relpersistence == RELPERSISTENCE_UNLOGGED) - return optimal_gpus; + if (*p_attcacheoff == __off) + cmeta->attcacheoff = __off; + else + cmeta->attcacheoff = -1; + *p_attcacheoff = -1; } - return NULL; -} - -bool -RelationCanUseNvmeStrom(Relation relation) -{ - Oid tablespace_oid = RelationGetForm(relation)->reltablespace; - - /* SSD2GPU on temp relation is not supported */ - if (RelationUsesLocalBuffers(relation)) - return false; - return !bms_is_empty(GetOptimalGpusForTablespace(tablespace_oid)); -} - -/* - * ScanPathWillUseNvmeStrom - Optimizer Hint - */ -bool -ScanPathWillUseNvmeStrom(PlannerInfo *root, RelOptInfo *baserel) -{ - size_t num_scan_pages = 0; - - if (!pgstrom_gpudirect_enabled()) - return false; - - /* - * Check expected amount of the scan i/o. - * If 'baserel' is children of partition table, threshold shall be - * checked towards the entire partition size, because the range of - * child tables fully depend on scan qualifiers thus variable time - * by time. Once user focus on a particular range, but he wants to - * focus on other area. It leads potential thrashing on i/o. - */ - if (baserel->reloptkind == RELOPT_BASEREL) + else { - if (!bms_is_empty(GetOptimalGpusForRelation(root, baserel))) - num_scan_pages = baserel->pages; + cmeta->attcacheoff = *p_attcacheoff = -1; + } + cmeta->atttypid = atttypid; + cmeta->atttypmod = atttypmod; + strncpy(cmeta->attname, attname, NAMEDATALEN); + + /* array? composite type? 
*/ + tcache = lookup_type_cache(atttypid, TYPECACHE_TUPDESC); + if (OidIsValid(tcache->typelem) && tcache->typlen == -1) + { + char elem_name[NAMEDATALEN+10]; + int16 elem_len; + bool elem_byval; + char elem_align; + + cmeta->atttypkind = TYPE_KIND__ARRAY; + cmeta->idx_subattrs = kds->nr_colmeta++; + cmeta->num_subattrs = 1; + + snprintf(elem_name, sizeof(elem_name), "__%s", attname); + get_typlenbyvalalign(tcache->typelem, + &elem_len, + &elem_byval, + &elem_align); + __setup_kern_colmeta(kds, + cmeta->idx_subattrs, + elem_name, /* attname */ + 1, /* attnum */ + elem_byval, /* attbyval */ + elem_align, /* attalign */ + elem_len, /* attlen */ + tcache->typelem, /* atttypid */ + -1, /* atttypmod */ + NULL); /* attcacheoff */ + } + else if (tcache->tupDesc) + { + TupleDesc tupdesc = tcache->tupDesc; + int j, attcacheoff = -1; + + cmeta->atttypkind = TYPE_KIND__COMPOSITE; + cmeta->idx_subattrs = kds->nr_colmeta; + cmeta->num_subattrs = tupdesc->natts; + kds->nr_colmeta += tupdesc->natts; + + for (j=0; j < tupdesc->natts; j++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, j); + + __setup_kern_colmeta(kds, + cmeta->idx_subattrs + j, + NameStr(attr->attname), + attr->attnum, + attr->attbyval, + attr->attalign, + attr->attlen, + attr->atttypid, + attr->atttypmod, + &attcacheoff); + } } - else if (baserel->reloptkind == RELOPT_OTHER_MEMBER_REL) + else { - ListCell *lc; - Index parent_relid = 0; - - foreach (lc, root->append_rel_list) + switch (tcache->typtype) { - AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(lc); - - if (appinfo->child_relid == baserel->relid) - { - parent_relid = appinfo->parent_relid; + case TYPTYPE_BASE: + cmeta->atttypkind = TYPE_KIND__BASE; + break; + case TYPTYPE_DOMAIN: + cmeta->atttypkind = TYPE_KIND__DOMAIN; + break; + case TYPTYPE_ENUM: + cmeta->atttypkind = TYPE_KIND__ENUM; + break; + case TYPTYPE_PSEUDO: + cmeta->atttypkind = TYPE_KIND__PSEUDO; + break; + case TYPTYPE_RANGE: + cmeta->atttypkind = TYPE_KIND__RANGE; + break; + default: + elog(ERROR, "Unexpected typtype ('%c')", tcache->typtype); break; - } - } - if (!lc) - { - elog(NOTICE, "Bug? child table (%d) not found in append_rel_list", - baserel->relid); - return false; - } - - foreach (lc, root->append_rel_list) - { - AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(lc); - RelOptInfo *rel; - - if (appinfo->parent_relid != parent_relid) - continue; - rel = root->simple_rel_array[appinfo->child_relid]; - if (!bms_is_empty(GetOptimalGpusForRelation(root, rel))) - num_scan_pages += rel->pages; } } - else - elog(ERROR, "Bug? 
unexpected reloptkind of base relation: %d", - (int)baserel->reloptkind); - - if (num_scan_pages < pgstrom_gpudirect_threshold() / BLCKSZ) - return false; - /* ok, this table scan can use nvme-strom */ - return true; + /* + * for the reverse references to KDS + */ + cmeta->kds_format = kds->format; + cmeta->kds_offset = (char *)cmeta - (char *)kds; } -/* - * pgstromIndexState - runtime status of BRIN-index for relation scan - */ -typedef struct pgstromIndexState -{ - Oid index_oid; - Relation index_rel; - Node *index_quals; /* for EXPLAIN */ - BlockNumber nblocks; - BlockNumber range_sz; - BrinRevmap *brin_revmap; - BrinDesc *brin_desc; - ScanKey scan_keys; - int num_scan_keys; - IndexRuntimeKeyInfo *runtime_keys_info; - int num_runtime_keys; - bool runtime_key_ready; - ExprContext *runtime_econtext; -} pgstromIndexState; +size_t +setup_kern_data_store(kern_data_store *kds, + TupleDesc tupdesc, + size_t length, + char format) +{ + int j, attcacheoff = -1; + + memset(kds, 0, offsetof(kern_data_store, colmeta)); + kds->length = length; + kds->nitems = 0; + kds->usage = 0; + kds->ncols = tupdesc->natts; + kds->format = format; + kds->tdhasoid = false; /* PG12 removed 'oid' system column */ + kds->tdtypeid = tupdesc->tdtypeid; + kds->tdtypmod = tupdesc->tdtypmod; + kds->table_oid = InvalidOid; /* to be set by the caller */ + kds->hash_nslots = 0; /* to be set by the caller, if any */ + kds->nr_colmeta = tupdesc->natts; + + if (format == KDS_FORMAT_ROW || + format == KDS_FORMAT_HASH || + format == KDS_FORMAT_BLOCK) + attcacheoff = 0; + + for (j=0; j < tupdesc->natts; j++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, j); + + __setup_kern_colmeta(kds, j, + NameStr(attr->attname), + attr->attnum, + attr->attbyval, + attr->attalign, + attr->attlen, + attr->atttypid, + attr->atttypmod, + &attcacheoff); + } + /* internal system attribute */ + if (format == KDS_FORMAT_COLUMN) + { + kern_colmeta *cmeta = &kds->colmeta[kds->nr_colmeta++]; + + memset(cmeta, 0, sizeof(kern_colmeta)); + cmeta->attbyval = true; + cmeta->attalign = sizeof(int32_t); + cmeta->attlen = sizeof(GpuCacheSysattr); + cmeta->attnum = -1; + cmeta->attcacheoff = -1; + cmeta->atttypid = InvalidOid; + cmeta->atttypmod = -1; + cmeta->atttypkind = TYPE_KIND__BASE; + strcpy(cmeta->attname, "__gcache_sysattr__"); + } + return MAXALIGN(offsetof(kern_data_store, colmeta[kds->nr_colmeta])); +} -/* - * pgstromExecInitBrinIndexMap - */ -void -pgstromExecInitBrinIndexMap(GpuTaskState *gts, - Oid index_oid, - List *index_conds, - List *index_quals) +size_t +estimate_kern_data_store(TupleDesc tupdesc) { - pgstromIndexState *pi_state = NULL; - Relation relation = gts->css.ss.ss_currentRelation; - EState *estate = gts->css.ss.ps.state; - Index scanrelid; - LOCKMODE lockmode = NoLock; + int j, nr_colmeta = tupdesc->natts; - if (!OidIsValid(index_oid)) + for (j=0; j < tupdesc->natts; j++) { - Assert(index_conds == NIL); - gts->outer_index_state = NULL; - return; - } - Assert(relation != NULL); - scanrelid = ((Scan *) gts->css.ss.ps.plan)->scanrelid; - if (!ExecRelationIsTargetRelation(estate, scanrelid)) - lockmode = AccessShareLock; - - pi_state = palloc0(sizeof(pgstromIndexState)); - pi_state->index_oid = index_oid; - pi_state->index_rel = index_open(index_oid, lockmode); - pi_state->index_quals = (Node *)make_ands_explicit(index_quals); - ExecIndexBuildScanKeys(&gts->css.ss.ps, - pi_state->index_rel, - index_conds, - false, - &pi_state->scan_keys, - &pi_state->num_scan_keys, - &pi_state->runtime_keys_info, - &pi_state->num_runtime_keys, - 
NULL, - NULL); - - /* ExprContext to evaluate runtime keys, if any */ - if (pi_state->num_runtime_keys != 0) - pi_state->runtime_econtext = CreateExprContext(estate); - else - pi_state->runtime_econtext = NULL; - - /* BRIN index specific initialization */ - pi_state->nblocks = RelationGetNumberOfBlocks(relation); - pi_state->brin_revmap = brinRevmapInitialize(pi_state->index_rel, - &pi_state->range_sz, - estate->es_snapshot); - pi_state->brin_desc = brin_build_desc(pi_state->index_rel); + Form_pg_attribute attr = TupleDescAttr(tupdesc, j); - /* save the state */ - gts->outer_index_state = pi_state; + nr_colmeta += count_num_of_subfields(attr->atttypid); + } + /* internal system attribute if KDS_FORMAT_COLUMN */ + nr_colmeta++; + return MAXALIGN(offsetof(kern_data_store, colmeta[nr_colmeta])); } /* - * pgstromSizeOfBrinIndexMap + * Routines to store/fetch fallback tuples */ -Size -pgstromSizeOfBrinIndexMap(GpuTaskState *gts) +void +pgstromStoreFallbackTuple(pgstromTaskState *pts, HeapTuple htuple) { - pgstromIndexState *pi_state = gts->outer_index_state; - int nwords; - - if (!pi_state) - return 0; + MemoryContext memcxt = pts->css.ss.ps.state->es_query_cxt; + kern_tupitem *titem; + size_t sz; - nwords = (pi_state->nblocks + - pi_state->range_sz - 1) / pi_state->range_sz; - return STROMALIGN(offsetof(Bitmapset, words) + - sizeof(bitmapword) * nwords); - -} - -/* - * pgstromExecGetBrinIndexMap - * - * Also see bringetbitmap - */ -static void -__pgstromExecGetBrinIndexMap(pgstromIndexState *pi_state, - Bitmapset *brin_map, - Snapshot snapshot) -{ - BrinDesc *bdesc = pi_state->brin_desc; - TupleDesc bd_tupdesc = bdesc->bd_tupdesc; - BlockNumber nblocks = pi_state->nblocks; - BlockNumber range_sz = pi_state->range_sz; - BlockNumber heapBlk; - BlockNumber index; - Buffer buf = InvalidBuffer; - FmgrInfo *consistentFn; - BrinMemTuple *dtup; - BrinTuple *btup __attribute__((unused)) = NULL; - Size btupsz __attribute__((unused)) = 0; - int nranges; - int nwords; - MemoryContext oldcxt; - MemoryContext perRangeCxt; - - /* rooms for the consistent support procedures of indexed columns */ - consistentFn = palloc0(sizeof(FmgrInfo) * bd_tupdesc->natts); - /* allocate an initial in-memory tuple */ - dtup = brin_new_memtuple(bdesc); - - /* moves to the working memory context per range */ - perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, - "PG-Strom BRIN-index temporary", - ALLOCSET_DEFAULT_SIZES); - oldcxt = MemoryContextSwitchTo(perRangeCxt); - - nranges = (pi_state->nblocks + - pi_state->range_sz - 1) / pi_state->range_sz; - nwords = (nranges + BITS_PER_BITMAPWORD - 1) / BITS_PER_BITMAPWORD; - Assert(brin_map->nwords < 0); - memset(brin_map->words, 0, sizeof(bitmapword) * nwords); - /* - * Now scan the revmap. We start by querying for heap page 0, - * incrementing by the number of pages per range; this gives us a full - * view of the table. 
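pgstromStoreFallbackTuple() above appends each CPU-fallback tuple to a flat buffer and records its offset in a separate array; both grow geometrically (new size = old size * 2 plus a little slack), so the amortized repalloc cost per stored tuple stays constant, as the growth loops later in this hunk show. A minimal sketch of the same policy against plain realloc(), for illustration only (the patch itself uses MemoryContextAlloc()/repalloc_huge()):

    #include <stdlib.h>
    #include <string.h>

    /* double the capacity, plus one block of slack, until the new
     * entry fits; error handling omitted for brevity */
    static void
    append_bytes(char **buf, size_t *bufsz, size_t *usage,
                 const void *data, size_t len)
    {
        size_t  newsz = (*bufsz > 0 ? *bufsz : 8 * 8192);

        while (*usage + len > newsz)
            newsz = newsz * 2 + 8192;   /* x2 + BLCKSZ-like slack */
        if (newsz != *bufsz)
        {
            *buf = realloc(*buf, newsz);
            *bufsz = newsz;
        }
        memcpy(*buf + *usage, data, len);
        *usage += len;
    }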
- */ - for (heapBlk = 0, index = 0; - heapBlk < nblocks; - heapBlk += range_sz, index++) + if (!pts->fallback_tuples) { - BrinTuple *tup; - OffsetNumber off; - Size size; - int keyno; - - CHECK_FOR_INTERRUPTS(); - - MemoryContextResetAndDeleteChildren(perRangeCxt); - - tup = brinGetTupleForHeapBlock(pi_state->brin_revmap, heapBlk, - &buf, &off, &size, - BUFFER_LOCK_SHARE, - snapshot); - if (tup) - { - btup = brin_copy_tuple(tup, size, btup, &btupsz); - - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - dtup = brin_deform_tuple(bdesc, btup, dtup); - if (!dtup->bt_placeholder) - { - for (keyno = 0; keyno < pi_state->num_scan_keys; keyno++) - { - ScanKey key = &pi_state->scan_keys[keyno]; - AttrNumber keyattno = key->sk_attno; - BrinValues *bval = &dtup->bt_columns[keyattno - 1]; - Datum rv; - Form_pg_attribute keyattr __attribute__((unused)); - - keyattr = tupleDescAttr(bd_tupdesc, keyattno-1); - Assert((key->sk_flags & SK_ISNULL) || - (key->sk_collation == keyattr->attcollation)); - /* First time this column? look up consistent function */ - if (consistentFn[keyattno - 1].fn_oid == InvalidOid) - { - FmgrInfo *tmp; - - tmp = index_getprocinfo(pi_state->index_rel, keyattno, - BRIN_PROCNUM_CONSISTENT); - fmgr_info_copy(&consistentFn[keyattno - 1], tmp, - CurrentMemoryContext); - } - - /* - * Check whether the scan key is consistent with the page - * range values; if so, pages in the range shall be - * skipped on the scan. - */ - rv = FunctionCall3Coll(&consistentFn[keyattno - 1], - key->sk_collation, - PointerGetDatum(bdesc), - PointerGetDatum(bval), - PointerGetDatum(key)); - if (!DatumGetBool(rv)) - { - if (index / BITS_PER_BITMAPWORD < nwords) - brin_map->words[index / BITS_PER_BITMAPWORD] - |= (1U << (index % BITS_PER_BITMAPWORD)); - break; - } - } - } - } + pts->fallback_index = 0; + pts->fallback_nitems = 0; + pts->fallback_nrooms = 1000; + pts->fallback_tuples = + MemoryContextAlloc(memcxt, sizeof(off_t) * pts->fallback_nrooms); } - MemoryContextSwitchTo(oldcxt); - MemoryContextDelete(perRangeCxt); - - if (buf != InvalidBuffer) - ReleaseBuffer(buf); - /* mark this bitmapset is ready */ - pg_memory_barrier(); - brin_map->nwords = nwords; -} - -void -pgstromExecGetBrinIndexMap(GpuTaskState *gts) -{ - pgstromIndexState *pi_state = gts->outer_index_state; - - if (!gts->outer_index_map || gts->outer_index_map->nwords < 0) + if (!pts->fallback_buffer) { - EState *estate = gts->css.ss.ps.state; - - if (!gts->outer_index_map) - { - Assert(!IsParallelWorker()); - gts->outer_index_map - = MemoryContextAlloc(estate->es_query_cxt, - pgstromSizeOfBrinIndexMap(gts)); - gts->outer_index_map->nwords = -1; - } - - ResetLatch(MyLatch); - while (gts->outer_index_map->nwords < 0) - { - if (!IsParallelWorker()) - { - __pgstromExecGetBrinIndexMap(pi_state, - gts->outer_index_map, - estate->es_snapshot); - /* wake up parallel workers if any */ - if (gts->pcxt) - { - ParallelContext *pcxt = gts->pcxt; - pid_t pid; - int i; - - for (i=0; i < pcxt->nworkers_launched; i++) - { - if (GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, - &pid) == BGWH_STARTED) - ProcSendSignal(pid); - } - } -#if 0 - { - Bitmapset *map = gts->outer_index_map; - int i; - - elog(INFO, "BRIN-index (%s) range_sz = %d", - RelationGetRelationName(pi_state->index_rel), - pi_state->range_sz); - for (i=0; i < map->nwords; i += 4) - { - elog(INFO, "% 6d: %08x %08x %08x %08x", - i * BITS_PER_BITMAPWORD, - i+3 < map->nwords ? map->words[i+3] : 0, - i+2 < map->nwords ? map->words[i+2] : 0, - i+1 < map->nwords ? map->words[i+1] : 0, - i < map->nwords ? 
map->words[i] : 0); - } - } -#endif - } - else - { - int ev; - - /* wait for completion of BRIN-index preload */ - CHECK_FOR_INTERRUPTS(); - - ev = WaitLatch(MyLatch, - WL_LATCH_SET | - WL_POSTMASTER_DEATH, - -1, - PG_WAIT_EXTENSION); - if (ev & WL_POSTMASTER_DEATH) - elog(FATAL, "unexpected postmaster dead"); - ResetLatch(MyLatch); - } - } + pts->fallback_usage = 0; + pts->fallback_bufsz = 8 * BLCKSZ; + pts->fallback_buffer = + MemoryContextAlloc(memcxt, pts->fallback_bufsz); } -} - -void -pgstromExecEndBrinIndexMap(GpuTaskState *gts) -{ - pgstromIndexState *pi_state = gts->outer_index_state; + sz = MAXALIGN(offsetof(kern_tupitem, htup) + htuple->t_len); + while (pts->fallback_usage + sz > pts->fallback_bufsz) + { + pts->fallback_bufsz = pts->fallback_bufsz * 2 + BLCKSZ; + pts->fallback_buffer = repalloc_huge(pts->fallback_buffer, + pts->fallback_bufsz); + } + while (pts->fallback_nitems >= pts->fallback_nrooms) + { + pts->fallback_nrooms = pts->fallback_nrooms * 2 + 100; + pts->fallback_tuples = repalloc_huge(pts->fallback_tuples, + sizeof(off_t) * pts->fallback_nrooms); + } + titem = (kern_tupitem *)(pts->fallback_buffer + + pts->fallback_usage); + titem->t_len = htuple->t_len; + titem->rowid = pts->fallback_nitems++; + memcpy(&titem->htup, htuple->t_data, htuple->t_len); - if (!pi_state) - return; - brinRevmapTerminate(pi_state->brin_revmap); - index_close(pi_state->index_rel, NoLock); + pts->fallback_tuples[titem->rowid] = pts->fallback_usage; + pts->fallback_usage += sz; } -void -pgstromExecRewindBrinIndexMap(GpuTaskState *gts) -{} - -/* - * pgstromExplainBrinIndexMap - */ -void -pgstromExplainBrinIndexMap(GpuTaskState *gts, - ExplainState *es, - List *dcontext) +TupleTableSlot * +pgstromFetchFallbackTuple(pgstromTaskState *pts) { - pgstromIndexState *pi_state = gts->outer_index_state; - char *conds_str; - char temp[128]; - - if (!pi_state) - return; - - conds_str = deparse_expression(pi_state->index_quals, - dcontext, es->verbose, false); - ExplainPropertyText("BRIN cond", conds_str, es); - if (es->analyze) + if (pts->fallback_tuples && + pts->fallback_buffer && + pts->fallback_index < pts->fallback_nitems) { - if (es->format == EXPLAIN_FORMAT_TEXT) - { - snprintf(temp, sizeof(temp), "%ld of %ld (%.2f%%)", - gts->outer_brin_count, - (long)pi_state->nblocks, - 100.0 * ((double) gts->outer_brin_count / - (double) pi_state->nblocks)); - ExplainPropertyText("BRIN skipped", temp, es); - } - else + TupleTableSlot *slot = pts->css.ss.ss_ScanTupleSlot; + HeapTuple htuple = palloc0(sizeof(HeapTupleData)); + kern_tupitem *titem; + + titem = (kern_tupitem *)(pts->fallback_buffer + + pts->fallback_tuples[pts->fallback_index++]); + htuple->t_len = titem->t_len; + htuple->t_data = &titem->htup; + ExecForceStoreHeapTuple(htuple, slot, true); + /* reset the buffer if last one */ + if (pts->fallback_index == pts->fallback_nitems) { - ExplainPropertyInteger("BRIN fetched", NULL, - pi_state->nblocks - - gts->outer_brin_count, es); - ExplainPropertyInteger("BRIN skipped", NULL, - gts->outer_brin_count, es); + pts->fallback_index = 0; + pts->fallback_nitems = 0; + pts->fallback_usage = 0; } + return slot; } + return NULL; } -/* - * PDS_exec_heapscan_block - PDS scan for KDS_FORMAT_BLOCK format +/* ---------------------------------------------------------------- + * + * Routines to load chunks from storage + * + * ---------------------------------------------------------------- */ -typedef struct { - strom_io_vector *iovec; - BlockNumber *blknum; -} PDSHeapScanBlockState; - -#define initPDSHeapScanBlockState(pds, bstate) \ - do{ \ - 
(bstate).iovec = alloca(offsetof(strom_io_vector, \ - ioc[(pds)->kds.nrooms])); \ - (bstate).iovec->nr_chunks = 0; \ - (bstate).blknum = alloca(sizeof(BlockNumber) * (pds)->kds.nrooms); \ - }while(0) - -static inline void -updatePDSHeapScanBlockState(pgstrom_data_store *pds, - PDSHeapScanBlockState *bstate, - BlockNumber blknum) -{ - strom_io_vector *iovec = bstate->iovec; - strom_io_chunk *iochunk; - cl_uint pages_per_block = (BLCKSZ / PAGE_SIZE); - cl_uint fchunk_id = (blknum % RELSEG_SIZE) * pages_per_block; +#define __XCMD_KDS_SRC_OFFSET(buf) \ + (((XpuCommand *)((buf)->data))->u.task.kds_src_offset) +#define __XCMD_GET_KDS_SRC(buf) \ + ((kern_data_store *)((buf)->data + __XCMD_KDS_SRC_OFFSET(buf))) - if (iovec->nr_chunks > 0) +static void +__relScanDirectFallbackBlock(pgstromTaskState *pts, + kern_data_store *kds, + BlockNumber block_num) +{ + pgstromSharedState *ps_state = pts->ps_state; + Relation relation = pts->css.ss.ss_currentRelation; + HeapScanDesc h_scan = (HeapScanDesc)pts->css.ss.ss_currentScanDesc; + Snapshot snapshot = pts->css.ss.ps.state->es_snapshot; + Buffer buffer; + Page page; + int lines; + OffsetNumber lineoff; + ItemId lpp; + + buffer = ReadBufferExtended(relation, + MAIN_FORKNUM, + block_num, + RBM_NORMAL, + h_scan->rs_strategy); + /* just like heapgetpage() */ + heap_page_prune_opt(relation, buffer); + /* pick up valid tuples from the target page */ + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + lines = PageGetMaxOffsetNumber(page); + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(page, lineoff); + lineoff <= lines; + lineoff++, lpp++) { - iochunk = &iovec->ioc[iovec->nr_chunks - 1]; - if (iochunk->fchunk_id + iochunk->nr_pages == fchunk_id) - { - /* continuous region - expand the last chunk */ - iochunk->nr_pages += pages_per_block; - goto out; - } + HeapTupleData htup; + bool valid; + + if (!ItemIdIsNormal(lpp)) + continue; + + htup.t_tableOid = RelationGetRelid(relation); + htup.t_data = (HeapTupleHeader) PageGetItem((Page)page, lpp); + htup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&htup.t_self, block_num, lineoff); + + valid = HeapTupleSatisfiesVisibility(&htup, snapshot, buffer); + HeapCheckForSerializableConflictOut(valid, relation, &htup, + buffer, snapshot); + if (valid) + pts->cb_cpu_fallback(pts, kds, &htup); } - /* discontinuous region - add a new chunk */ - iochunk = &iovec->ioc[iovec->nr_chunks++]; - iochunk->m_offset = BLCKSZ * pds->nblocks_uncached; - iochunk->fchunk_id = fchunk_id; - iochunk->nr_pages = pages_per_block; -out: - bstate->blknum[pds->nblocks_uncached++] = blknum; + UnlockReleaseBuffer(buffer); + pg_atomic_fetch_add_u32(&ps_state->heap_fallback_nblocks, 1); } static void -mergePDSHeapScanBlockState(pgstrom_data_store *pds, - PDSHeapScanBlockState *bstate) -{ - strom_io_vector *iovec = bstate->iovec; - cl_uint nr_uncached = pds->nblocks_uncached; - cl_uint nr_loaded = pds->kds.nitems - nr_uncached; - BlockNumber *block_nums = (BlockNumber *)KERN_DATA_STORE_BODY(&pds->kds); - - Assert(pds->nblocks_uncached > 0); - Assert(iovec != NULL); - - /* copy BlockNumber array */ - memcpy(block_nums + nr_loaded, bstate->blknum, - sizeof(BlockNumber) * nr_uncached); - /* copy iovec */ - memcpy(pds->iovec, iovec, offsetof(strom_io_vector, - ioc[iovec->nr_chunks])); -} - -static bool -PDS_exec_heapscan_block(GpuTaskState *gts, - pgstrom_data_store *pds, - PDSHeapScanBlockState *bstate) +__relScanDirectCachedBlock(pgstromTaskState *pts, BlockNumber block_num) { - Relation relation = 
gts->css.ss.ss_currentRelation; - HeapScanDesc hscan = (HeapScanDesc)gts->css.ss.ss_currentScanDesc; - NVMEScanState *nvme_sstate = gts->nvme_sstate; - BlockNumber blknum = hscan->rs_cblock; - BlockNumber *block_nums; - Snapshot snapshot = ((TableScanDesc)hscan)->rs_snapshot; - BufferAccessStrategy strategy = hscan->rs_strategy; - SMgrRelation smgr = relation->rd_smgr; - Buffer buffer; - Page spage; - Page dpage; - cl_uint nr_loaded; - bool all_visible; - - /* PDS cannot eat any blocks more, obviously */ - if (pds->kds.nitems >= pds->kds.nrooms) - return false; - - /* array of block numbers */ - block_nums = (BlockNumber *)KERN_DATA_STORE_BODY(&pds->kds); - - /* - * NVMe-Strom can be applied only when filesystem supports the feature, - * and the current source block is all-visible. - * Elsewhere, we will go fallback with synchronized buffer scan. - */ - if (RelationCanUseNvmeStrom(relation) && - VM_ALL_VISIBLE(relation, blknum, - &nvme_sstate->curr_vmbuffer)) - { - BufferTag newTag; - uint32 newHash; - LWLock *newPartitionLock = NULL; - bool retval; - int buf_id; - - /* create a tag so we can lookup the buffer */ - INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, MAIN_FORKNUM, blknum); - /* determine its hash code and partition lock ID */ - newHash = BufTableHashCode(&newTag); - newPartitionLock = BufMappingPartitionLock(newHash); - - /* check whether the block exists on the shared buffer? */ - LWLockAcquire(newPartitionLock, LW_SHARED); - buf_id = BufTableLookup(&newTag, newHash); - if (buf_id < 0) - { - BlockNumber segno = blknum / RELSEG_SIZE; - GPUDirectFileDesc *dfile; + Relation relation = pts->css.ss.ss_currentRelation; + HeapScanDesc h_scan = (HeapScanDesc)pts->css.ss.ss_currentScanDesc; + Snapshot snapshot = pts->css.ss.ps.state->es_snapshot; + kern_data_store *kds; + Buffer buffer; + Page spage; + Page dpage; + bool has_valid_tuples = false; - Assert(segno < nvme_sstate->nr_segs); - /* - * We cannot mix up multiple source files in a single PDS chunk. - * If heapscan_block comes across segment boundary, rest of the - * blocks must be read on the next PDS chunk. - */ - dfile = &nvme_sstate->files[segno]; - if (pds->filedesc.rawfd >= 0 && - pds->filedesc.rawfd != dfile->rawfd) - retval = false; - else - { - if (pds->filedesc.rawfd < 0) - memcpy(&pds->filedesc, dfile, sizeof(GPUDirectFileDesc)); - updatePDSHeapScanBlockState(pds, bstate, blknum); - pds->kds.nitems++; - retval = true; - } - LWLockRelease(newPartitionLock); - return retval; - } - LWLockRelease(newPartitionLock); - } /* * Load the source buffer with synchronous read */ - buffer = ReadBufferExtended(relation, MAIN_FORKNUM, blknum, - RBM_NORMAL, strategy); -#if 1 - /* Just like heapgetpage(), however, jobs we focus on is OLAP - * workload, so it's uncertain whether we should vacuum the page - * here. 
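__relScanDirectCachedBlock() below copies the whole page image into the command buffer first, and simply rewinds the buffer afterwards when the visibility checks find no tuple worth sending. A compact sketch of that append-then-revert pattern on a StringInfo, for illustration only (page_has_visible_tuples() is a hypothetical stand-in for the per-tuple visibility loop):

    #include "postgres.h"
    #include "lib/stringinfo.h"

    static bool page_has_visible_tuples(const char *page);  /* hypothetical */

    static bool
    append_page_if_useful(StringInfo xcmd_buf, const char *page_image)
    {
        int     saved_len = xcmd_buf->len;

        appendBinaryStringInfo(xcmd_buf, page_image, BLCKSZ);
        if (!page_has_visible_tuples(xcmd_buf->data + saved_len))
        {
            xcmd_buf->len = saved_len;  /* revert: nothing visible here */
            return false;
        }
        return true;
    }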
- */ + buffer = ReadBufferExtended(relation, + MAIN_FORKNUM, + block_num, + RBM_NORMAL, + h_scan->rs_strategy); + /* prune the old items, if any */ heap_page_prune_opt(relation, buffer); -#endif - /* we will check tuple's visibility under the shared lock */ + /* let's check tuples visibility for each */ LockBuffer(buffer, BUFFER_LOCK_SHARE); - nr_loaded = pds->kds.nitems - pds->nblocks_uncached; spage = (Page) BufferGetPage(buffer); - dpage = (Page) KERN_DATA_STORE_BLOCK_PGPAGE(&pds->kds, nr_loaded); - memcpy(dpage, spage, BLCKSZ); - block_nums[nr_loaded] = blknum; + appendBinaryStringInfo(&pts->xcmd_buf, (const char *)spage, BLCKSZ); + UnlockReleaseBuffer(buffer); + kds = __XCMD_GET_KDS_SRC(&pts->xcmd_buf); + dpage = (Page) KDS_BLOCK_PGPAGE(kds, kds->block_nloaded); + Assert(dpage >= pts->xcmd_buf.data && + dpage + BLCKSZ <= pts->xcmd_buf.data + pts->xcmd_buf.len); + KDS_BLOCK_BLCKNR(kds, kds->block_nloaded) = block_num; /* - * Logic is almost same as heapgetpage() doing. We have to invalidate - * invisible tuples prior to GPU kernel execution, if not all-visible. + * Logic is almost equivalent to what heapgetpage() does. + * We have to invalidate tuples prior to GPU kernel + * execution, if not all-visible. */ - all_visible = PageIsAllVisible(dpage) && !snapshot->takenDuringRecovery; - if (!all_visible) + if (!PageIsAllVisible(dpage) || snapshot->takenDuringRecovery) { - int lines = PageGetMaxOffsetNumber(dpage); - OffsetNumber lineoff; - ItemId lpp; + int lines = PageGetMaxOffsetNumber(dpage); + ItemId lpp; + OffsetNumber lineoff; + for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dpage, lineoff); lineoff <= lines; lineoff++, lpp++) { - HeapTupleData tup; - bool valid; + HeapTupleData htup; + bool valid; if (!ItemIdIsNormal(lpp)) continue; - - tup.t_tableOid = RelationGetRelid(relation); - tup.t_data = (HeapTupleHeader) PageGetItem((Page) dpage, lpp); - tup.t_len = ItemIdGetLength(lpp); - ItemPointerSet(&tup.t_self, blknum, lineoff); - - valid = HeapTupleSatisfiesVisibility(&tup, snapshot, buffer); - HeapCheckForSerializableConflictOut(valid, relation, &tup, + htup.t_tableOid = RelationGetRelid(relation); + htup.t_data = (HeapTupleHeader) PageGetItem((Page) dpage, lpp); + Assert((((uintptr_t)htup.t_data - (uintptr_t)dpage) & 7) == 0); + htup.t_len = ItemIdGetLength(lpp); + ItemPointerSet(&htup.t_self, block_num, lineoff); + + valid = HeapTupleSatisfiesVisibility(&htup, snapshot, buffer); + HeapCheckForSerializableConflictOut(valid, relation, &htup, buffer, snapshot); - if (!valid) + if (valid) + has_valid_tuples = true; + else ItemIdSetUnused(lpp); } } - UnlockReleaseBuffer(buffer); - /* dpage became all-visible also */ - PageSetAllVisible(dpage); - pds->kds.nitems++; - - return true; -} - -/* - * PDS_exec_heapscan_row - PDS scan for KDS_FORMAT_ROW format - */ -static bool -PDS_exec_heapscan_row(GpuTaskState *gts, pgstrom_data_store *pds) -{ - Relation relation = gts->css.ss.ss_currentRelation; - HeapScanDesc hscan = (HeapScanDesc)gts->css.ss.ss_currentScanDesc; - BlockNumber blknum = hscan->rs_cblock; - Snapshot snapshot = ((TableScanDesc)hscan)->rs_snapshot; - BufferAccessStrategy strategy = hscan->rs_strategy; - kern_data_store *kds = &pds->kds; - Buffer buffer; - Page page; - int lines; - int ntup; - OffsetNumber lineoff; - ItemId lpp; - uint *tup_index; - kern_tupitem *tup_item; - bool all_visible; - Size max_consume; - - /* Load the target buffer */ - buffer = ReadBufferExtended(relation, MAIN_FORKNUM, blknum, - RBM_NORMAL, strategy); -#if 1 - /* Just like heapgetpage(), however, 
jobs we focus on is OLAP - * workload, so it's uncertain whether we should vacuum the page - * here. - */ - heap_page_prune_opt(relation, buffer); -#endif - /* we will check tuple's visibility under the shared lock */ - LockBuffer(buffer, BUFFER_LOCK_SHARE); - page = (Page) BufferGetPage(buffer); - lines = PageGetMaxOffsetNumber(page); - ntup = 0; - - /* - * Check whether we have enough rooms to store expected number of - * tuples on the remaining space. If it is hopeless to load all - * the items in a block, we inform the caller this block shall be - * loaded on the next data store. - */ - max_consume = KERN_DATA_STORE_HEAD_LENGTH(kds) + - STROMALIGN(sizeof(cl_uint) * (kds->nitems + lines)) + - offsetof(kern_tupitem, htup) * lines + BLCKSZ + - __kds_unpack(kds->usage); - if (max_consume > kds->length) + else { - UnlockReleaseBuffer(buffer); - return false; + has_valid_tuples = true; } /* - * Logic is almost same as heapgetpage() doing. + * If no tuples in this block are visible, we don't need to load + * them to the xPU device (just a waste of memory and bandwidth), + * so it shall be reverted from the xcmd-buffer. - */ - all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; - - /* TODO: make SerializationNeededForRead() an external function - * on the core side. It kills necessity of setting up HeapTupleData - * when all_visible and non-serialized transaction. - */ - tup_index = KERN_DATA_STORE_ROWINDEX(kds) + kds->nitems; - for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(page, lineoff); - lineoff <= lines; - lineoff++, lpp++) + if (has_valid_tuples) { - HeapTupleData tup; - size_t curr_usage; - bool valid; - - if (!ItemIdIsNormal(lpp)) - continue; - - tup.t_tableOid = RelationGetRelid(relation); - tup.t_data = (HeapTupleHeader) PageGetItem((Page) page, lpp); - tup.t_len = ItemIdGetLength(lpp); - ItemPointerSet(&tup.t_self, blknum, lineoff); - - if (all_visible) - valid = true; - else - valid = HeapTupleSatisfiesVisibility(&tup, snapshot, buffer); - - HeapCheckForSerializableConflictOut(valid, relation, - &tup, buffer, snapshot); - if (!valid) - continue; - - /* put tuple */ - curr_usage = (__kds_unpack(kds->usage) + - MAXALIGN(offsetof(kern_tupitem, htup) + tup.t_len)); - tup_item = (kern_tupitem *)((char *)kds + kds->length - curr_usage); - tup_item->rowid = kds->nitems + ntup; - tup_item->t_len = tup.t_len; - memcpy(&tup_item->htup, tup.t_data, tup.t_len); - memcpy(&tup_item->htup.t_ctid, &tup.t_self, sizeof(ItemPointerData)); - - tup_index[ntup++] = __kds_packed((uintptr_t)tup_item - (uintptr_t)kds); - kds->usage = __kds_packed(curr_usage); + pts->xcmd_buf.len -= BLCKSZ; + return; } - UnlockReleaseBuffer(buffer); - Assert(ntup <= MaxHeapTuplesPerPage); - Assert(kds->nitems + ntup <= kds->nrooms); - kds->nitems += ntup; - - return true; -} - -/* - * heapscan_report_location - */ -static inline void -heapscan_report_location(HeapScanDesc hscan) -{ -#if PG_VERSION_NUM < 120000 - if (hscan->rs_syncscan) - ss_report_location(hscan->rs_rd, hscan->rs_cblock); -#else - if (hscan->rs_base.rs_flags & SO_ALLOW_SYNC) - ss_report_location(hscan->rs_base.rs_rd, hscan->rs_cblock); -#endif + /* dpage became all-visible also */ + PageSetAllVisible(dpage); + kds->nitems++; + kds->block_nloaded++; } -/* - * pgstromExecHeapScanChunkParallel - read the heap relation by parallel scan - */ -static pgstrom_data_store * -pgstromExecHeapScanChunkParallel(GpuTaskState *gts, - Bitmapset *brin_map, - cl_long brin_range_sz) -{ - GpuTaskSharedState *gtss = gts->gtss; - Relation relation = 
gts->css.ss.ss_currentRelation; - HeapScanDesc hscan = (HeapScanDesc)gts->css.ss.ss_currentScanDesc; - pgstrom_data_store *pds = NULL; - PDSHeapScanBlockState bstate; - - Assert(gts->css.ss.ss_currentScanDesc->rs_parallel); - memset(&bstate, 0, sizeof(PDSHeapScanBlockState)); - for (;;) - { - CHECK_FOR_INTERRUPTS(); - - if (!hscan->rs_inited) - { - if (hscan->rs_nblocks == 0) - { - /* no blocks to read */ - break; - } - hscan->rs_cblock = InvalidBlockNumber; - hscan->rs_numblocks = 0; /* force to get next blocks */ - hscan->rs_inited = true; - } - else if (hscan->rs_cblock == InvalidBlockNumber) - { - /* end of the scan */ - break; - } - - if (hscan->rs_numblocks == 0) - { - NVMEScanState *nvme_sstate = gts->nvme_sstate; - BlockNumber sync_startpage = InvalidBlockNumber; - cl_long nr_allocated; - cl_long startblock; - cl_long nr_blocks; - cl_long page; - - /* - * MEMO: A key of i/o performance is consolidation of continuous - * block reads with a small number of system-call invocation. - * The default one-by-one block read logic tend to generate i/o - * request fragmentation under CPU parallel execution, thus it - * leads larger number of read commands submit and performance - * slow-down. - * So, in case of NVMe-Strom under CPU parallel, we make the - * @scan->rs_cblock pointer advanced by multiple blocks at once. - * It ensures the block numbers to read are continuous, thus, - * i/o stack will be able to load storage blocks with minimum - * number of DMA requests. - */ - if (!nvme_sstate) - nr_blocks = 8; - else if (pds) - { - if (pds->kds.nitems >= pds->kds.nrooms) - break; /* no more rooms in this PDS */ - nr_blocks = pds->kds.nrooms - pds->kds.nitems; - } - else - nr_blocks = nvme_sstate->nblocks_per_chunk; - - retry_lock: - SpinLockAcquire(&gtss->pbs_mutex); +XpuCommand * +pgstromRelScanChunkDirect(pgstromTaskState *pts, + struct iovec *xcmd_iov, int *xcmd_iovcnt) +{ + pgstromSharedState *ps_state = pts->ps_state; + Relation relation = pts->css.ss.ss_currentRelation; + HeapScanDesc h_scan = (HeapScanDesc)pts->css.ss.ss_currentScanDesc; + /* NOTE: 'smgr_rnode' always locates on the head of SMgrRelationData */ + RelFileNodeBackend *smgr_rnode = (RelFileNodeBackend *)RelationGetSmgr(relation); + XpuCommand *xcmd; + kern_data_store *kds; + unsigned long m_offset = 0UL; + BlockNumber segment_id = InvalidBlockNumber; + strom_io_vector *strom_iovec; + strom_io_chunk *strom_ioc = NULL; + BlockNumber *strom_blknums; + uint32_t strom_nblocks = 0; + uint32_t kds_src_pathname = 0; + uint32_t kds_src_iovec = 0; + uint32_t kds_nrooms; + + kds = __XCMD_GET_KDS_SRC(&pts->xcmd_buf); + kds_nrooms = (PGSTROM_CHUNK_SIZE - + KDS_HEAD_LENGTH(kds)) / (sizeof(BlockNumber) + BLCKSZ); + kds->nitems = 0; + kds->usage = 0; + kds->block_offset = (KDS_HEAD_LENGTH(kds) + + MAXALIGN(sizeof(BlockNumber) * kds_nrooms)); + kds->block_nloaded = 0; + pts->xcmd_buf.len = __XCMD_KDS_SRC_OFFSET(&pts->xcmd_buf) + kds->block_offset; + Assert(pts->xcmd_buf.len == MAXALIGN(pts->xcmd_buf.len)); + enlargeStringInfo(&pts->xcmd_buf, 0); + kds = __XCMD_GET_KDS_SRC(&pts->xcmd_buf); + + strom_iovec = alloca(offsetof(strom_io_vector, ioc[kds_nrooms])); + strom_iovec->nr_chunks = 0; + strom_blknums = alloca(sizeof(BlockNumber) * kds_nrooms); + strom_nblocks = 0; + while (!pts->scan_done) + { + while (pts->curr_block_num < pts->curr_block_tail && + kds->nitems < kds_nrooms) + { + BlockNumber block_num + = (pts->curr_block_num + h_scan->rs_startblock) % h_scan->rs_nblocks; /* - * If the scan's startblock has not yet been initialized, we 
must - * do it now. If this is not a synchronized scan, we just start - * at block 0, but if it is a synchronized scan, we must get - * the starting position from the synchronized scan facility. - * We can't hold the spinlock while doing that, though, so release - * the spinlock once, get the information we need, and retry. - * If nobody else has initialized the scan in the meantime, - * we'll fill in the value we fetched on the second time through. + * MEMO: Usually, CPU is (much) more powerful than DPUs. + * In case when the source cache is already on the shared- + * buffer, it makes no sense to handle this page on the + * DPU device. */ - if (gtss->pbs_startblock == InvalidBlockNumber) + if (pts->ds_entry && !pgstrom_dpu_handle_cached_pages) { - ParallelTableScanDesc ptscan - = gts->css.ss.ss_currentScanDesc->rs_parallel; - - if (!ptscan->phs_syncscan) - gtss->pbs_startblock = 0; - else if (sync_startpage != InvalidBlockNumber) - gtss->pbs_startblock = sync_startpage; - else + BufferTag bufTag; + uint32 bufHash; + LWLock *bufLock; + int buf_id; + + INIT_BUFFERTAG(bufTag, smgr_rnode->node, MAIN_FORKNUM, block_num); + bufHash = BufTableHashCode(&bufTag); + bufLock = BufMappingPartitionLock(bufHash); + + /* check whether the block exists on the shared buffer? */ + LWLockAcquire(bufLock, LW_SHARED); + buf_id = BufTableLookup(&bufTag, bufHash); + if (buf_id >= 0) { - SpinLockRelease(>ss->pbs_mutex); - sync_startpage = ss_get_location(relation, - hscan->rs_nblocks); - goto retry_lock; + LWLockRelease(bufLock); + __relScanDirectFallbackBlock(pts, kds, block_num); + pts->curr_block_num++; + continue; } + LWLockRelease(bufLock); } - hscan->rs_startblock = startblock = gtss->pbs_startblock; - nr_allocated = gtss->pbs_nallocated; - - if (nr_allocated >= (cl_long)hscan->rs_nblocks) - { - SpinLockRelease(>ss->pbs_mutex); - hscan->rs_cblock = InvalidBlockNumber; /* end of the scan */ - break; - } - if (nr_allocated + nr_blocks >= (cl_long)hscan->rs_nblocks) - nr_blocks = (cl_long)hscan->rs_nblocks - nr_allocated; - page = (startblock + nr_allocated) % (cl_long)hscan->rs_nblocks; - if (page + nr_blocks >= (cl_long)hscan->rs_nblocks) - nr_blocks = (cl_long)hscan->rs_nblocks - page; - - /* should never read the blocks across segment boundary */ - Assert(nr_blocks > 0 && nr_blocks <= RELSEG_SIZE); - if ((page / RELSEG_SIZE) != (page + nr_blocks - 1) / RELSEG_SIZE) - nr_blocks = RELSEG_SIZE - (page % RELSEG_SIZE); - Assert(nr_blocks > 0); - - if (brin_map) + + /* + * MEMO: right now, we allow GPU Direct SQL for the all-visible + * pages only, due to the restrictions about MVCC checks. + * However, it is too strict for the purpose. If we would have + * a mechanism to perform MVCC checks without commit logs. + * In other words, if all the tuples in a certain page have + * HEAP_XMIN_* or HEAP_XMAX_* flags correctly, we can have MVCC + * logic in the device code. + */ + if (VM_ALL_VISIBLE(relation, block_num, &pts->curr_vm_buffer)) { - long pos = page / brin_range_sz; - long end = (page + nr_blocks - 1) / brin_range_sz; - long s_page = -1; - long e_page = page + nr_blocks; - - /* find the first valid range */ - while (pos <= end) + /* + * We don't allow xPU Direct SQL across multiple heap + * segments (for the code simplification). So, once + * relation scan is broken out, then restart with new + * KDS buffer. 
+ */ + unsigned int fchunk_id; + + if (segment_id == InvalidBlockNumber) + segment_id = block_num / RELSEG_SIZE; + else if (segment_id != block_num / RELSEG_SIZE) + goto out; + + fchunk_id = (block_num % RELSEG_SIZE) * PAGES_PER_BLOCK; + if (strom_ioc != NULL && (strom_ioc->fchunk_id + + strom_ioc->nr_pages) == fchunk_id) { - if (!bms_is_member(pos, brin_map)) - { - s_page = Max(page, pos * brin_range_sz); - break; - } - pos++; - } - - if (s_page < 0) - { - /* Oops, here is no valid range, so just skip it */ - gts->outer_brin_count += nr_blocks; - nr_allocated += nr_blocks; - nr_blocks = 0; + /* expand the iovec entry */ + strom_ioc->nr_pages += PAGES_PER_BLOCK; } else { - long prev = page; - /* find the continuous valid ranges */ - Assert(pos <= end); - Assert(!bms_is_member(pos, brin_map)); - while (pos <= end) - { - if (bms_is_member(pos, brin_map)) - { - e_page = Min(e_page, pos * brin_range_sz); - break; - } - pos++; - } - nr_allocated += (e_page - page); - nr_blocks = e_page - s_page; - page = s_page; - gts->outer_brin_count += page - prev; + /* add the next iovec entry */ + strom_ioc = &strom_iovec->ioc[strom_iovec->nr_chunks++]; + strom_ioc->m_offset = m_offset; + strom_ioc->fchunk_id = fchunk_id; + strom_ioc->nr_pages = PAGES_PER_BLOCK; } + kds->nitems++; + strom_blknums[strom_nblocks++] = block_num; + m_offset += BLCKSZ; } - else - { - /* elsewhere, just walk on the following blocks */ - nr_allocated += nr_blocks; - } - /* update # of blocks already allocated to workers */ - gtss->pbs_nallocated = nr_allocated; - SpinLockRelease(>ss->pbs_mutex); - - hscan->rs_cblock = page; - hscan->rs_numblocks = nr_blocks; - continue; - } - /* scan next block */ - if (gts->nvme_sstate) - { - /* KDS_FORMAT_BLOCK */ - if (!pds) + else if (pts->ds_entry) { - pds = PDS_create_block(gts->gcontext, - RelationGetDescr(relation), - gts->nvme_sstate); - pds->kds.table_oid = RelationGetRelid(relation); - initPDSHeapScanBlockState(pds, bstate); + /* + * For DPU devices, it makes no sense to move the data blocks + * to the (relatively) poor performance devices instead of CPUs. + * So, we run CPU fallback for the tuples in dirty pages. + */ + __relScanDirectFallbackBlock(pts, kds, block_num); } - if (!PDS_exec_heapscan_block(gts, pds, &bstate)) - break; - } - else - { - /* KDS_FORMAT_ROW */ - if (!pds) + else { - pds = PDS_create_row(gts->gcontext, - RelationGetDescr(relation), - pgstrom_chunk_size()); - pds->kds.table_oid = RelationGetRelid(relation); + __relScanDirectCachedBlock(pts, block_num); } - if (!PDS_exec_heapscan_row(gts, pds)) - break; + pts->curr_block_num++; } - /* move to the next block */ - hscan->rs_numblocks--; - hscan->rs_cblock++; - if (hscan->rs_cblock >= hscan->rs_nblocks) - hscan->rs_cblock = 0; - heapscan_report_location(hscan); - /* end of the scan? 
*/ - if (hscan->rs_cblock == hscan->rs_startblock) - hscan->rs_cblock = InvalidBlockNumber; - } - /* merge strom_io_vector to the PDS, if KDS_FORMAT_BLOCK */ - if (pds && pds->nblocks_uncached > 0) - mergePDSHeapScanBlockState(pds, &bstate); - - return pds; -} - -/* - * pgstromExecHeapScanChunk - */ -static pgstrom_data_store * -pgstromExecHeapScanChunk(GpuTaskState *gts, - Bitmapset *brin_map, cl_long brin_range_sz) -{ - Relation rel = gts->css.ss.ss_currentRelation; - HeapScanDesc hscan = (HeapScanDesc)gts->css.ss.ss_currentScanDesc; - pgstrom_data_store *pds = NULL; - PDSHeapScanBlockState bstate; - - memset(&bstate, 0, sizeof(PDSHeapScanBlockState)); - for (;;) - { - cl_long page; - - CHECK_FOR_INTERRUPTS(); - if (!hscan->rs_inited) - { - /* no blocks to read? */ - if (hscan->rs_nblocks == 0) - break; - hscan->rs_cblock = hscan->rs_startblock; - Assert(hscan->rs_numblocks == InvalidBlockNumber); - hscan->rs_inited = true; - } - else if (hscan->rs_cblock == InvalidBlockNumber) + if (kds->nitems >= kds_nrooms) { - /* no more blocks to read */ + /* ok, we cannot load more pages in this chunk */ break; } - page = hscan->rs_cblock; - - /* - * If any, check BRIN-index bitmap, then moves to the next range - * boundary if no tuple can match in this range. - */ - if (brin_map) + else if (pts->br_state) { - long pos = page / brin_range_sz; - - if (bms_is_member(pos, brin_map)) - { - long prev = page; - - page = (pos + 1) * brin_range_sz; - if (page <= (cl_long)MaxBlockNumber) - hscan->rs_cblock = (BlockNumber)page; - else - hscan->rs_cblock = 0; - gts->outer_brin_count += (page - prev); - goto skip; - } + if (!pgstromBrinIndexNextChunk(pts)) + pts->scan_done = true; } - /* scan the next block */ - if (gts->nvme_sstate) + else if (!h_scan->rs_base.rs_parallel) { - if (!pds) + /* single process scan */ + BlockNumber num_blocks = kds_nrooms - kds->nitems; + + if (!h_scan->rs_inited) { - pds = PDS_create_block(gts->gcontext, - RelationGetDescr(rel), - gts->nvme_sstate); - pds->kds.table_oid = RelationGetRelid(rel); - initPDSHeapScanBlockState(pds, bstate); + h_scan->rs_cblock = 0; + h_scan->rs_inited = true; } - if (!PDS_exec_heapscan_block(gts, pds, &bstate)) - break; + pts->curr_block_num = h_scan->rs_cblock; + if (pts->curr_block_num >= h_scan->rs_nblocks) + pts->scan_done = true; + else if (pts->curr_block_num + num_blocks > h_scan->rs_nblocks) + num_blocks = h_scan->rs_nblocks - pts->curr_block_num; + h_scan->rs_cblock += num_blocks; + pts->curr_block_tail = pts->curr_block_num + num_blocks; } else { - if (!pds) + /* parallel processes scan */ + ParallelBlockTableScanDesc pb_scan = + (ParallelBlockTableScanDesc)h_scan->rs_base.rs_parallel; + BlockNumber num_blocks = kds_nrooms - kds->nitems; + + if (!h_scan->rs_inited) { - pds = PDS_create_row(gts->gcontext, - RelationGetDescr(rel), - pgstrom_chunk_size()); - pds->kds.table_oid = RelationGetRelid(rel); + /* see table_block_parallelscan_startblock_init */ + BlockNumber start_block = InvalidBlockNumber; + + retry_parallel_init: + SpinLockAcquire(&pb_scan->phs_mutex); + if (pb_scan->phs_startblock == InvalidBlockNumber) + { + if (!pb_scan->base.phs_syncscan) + pb_scan->phs_startblock = 0; + else if (start_block != InvalidBlockNumber) + pb_scan->phs_startblock = start_block; + else + { + SpinLockRelease(&pb_scan->phs_mutex); + start_block = ss_get_location(relation, pb_scan->phs_nblocks); + goto retry_parallel_init; + } + } + h_scan->rs_nblocks = pb_scan->phs_nblocks; + h_scan->rs_startblock = pb_scan->phs_startblock; + 
SpinLockRelease(&pb_scan->phs_mutex); + h_scan->rs_inited = true; } - if (!PDS_exec_heapscan_row(gts, pds)) - break; + pts->curr_block_num = pg_atomic_fetch_add_u64(&pb_scan->phs_nallocated, + num_blocks); + if (pts->curr_block_num >= h_scan->rs_nblocks) + pts->scan_done = true; + else if (pts->curr_block_num + num_blocks > h_scan->rs_nblocks) + num_blocks = h_scan->rs_nblocks - pts->curr_block_num; + pts->curr_block_tail = pts->curr_block_num + num_blocks; } - /* move to the next block */ - hscan->rs_cblock++; - skip: - if (hscan->rs_cblock >= hscan->rs_nblocks) - hscan->rs_cblock = 0; - Assert(hscan->rs_numblocks == InvalidBlockNumber); - heapscan_report_location(hscan); - /* end of the scan? */ - if (hscan->rs_cblock == hscan->rs_startblock) - hscan->rs_cblock = InvalidBlockNumber; } - /* merge strom_io_vector to the PDS, if any */ - if (pds && pds->nblocks_uncached > 0) - mergePDSHeapScanBlockState(pds, &bstate); - - /* PDS is valid, or end of the relation */ - Assert(pds || !BlockNumberIsValid(hscan->rs_cblock)); - - return pds; -} - -/* - * pgstromExecScanChunk - read the relation by one chunk - */ -pgstrom_data_store * -pgstromExecScanChunk(GpuTaskState *gts) -{ - Relation rel = gts->css.ss.ss_currentRelation; - TableScanDesc tscan = gts->css.ss.ss_currentScanDesc; - Bitmapset *brin_map; - cl_long brin_range_sz = 0; - pgstrom_data_store *pds = NULL; - - /* - * Setup scan-descriptor, if the scan is not parallel, of if we're - * executing a scan that was intended to be parallel serially. - */ - if (!tscan) +out: + Assert(kds->nitems == kds->block_nloaded + strom_nblocks); + pg_atomic_fetch_add_u32(&ps_state->heap_normal_nblocks, kds->block_nloaded); + pg_atomic_fetch_add_u32(&ps_state->heap_direct_nblocks, strom_nblocks); + kds->length = kds->block_offset + BLCKSZ * kds->nitems; + if (kds->nitems == 0) + return NULL; + if (strom_iovec->nr_chunks > 0) { - EState *estate = gts->css.ss.ps.state; - - if (!gts->gtss) - tscan = table_beginscan(rel, estate->es_snapshot, 0, NULL); - else - tscan = table_beginscan_parallel(rel, &gts->gtss->phscan); + size_t sz; - gts->css.ss.ss_currentScanDesc = tscan; - /* - * Try to choose NVMe-Strom, if relation is deployed on the supported - * tablespace and expected total i/o size is enough large than cache- - * only scan. - */ - PDS_init_heapscan_state(gts); - } - InstrStartNode(&gts->outer_instrument); - /* Load the BRIN-index bitmap, if any */ - if (gts->outer_index_state) - pgstromExecGetBrinIndexMap(gts); - brin_map = gts->outer_index_map; - if (brin_map) - brin_range_sz = gts->outer_index_state->range_sz; - - if (gts->gtss) - pds = pgstromExecHeapScanChunkParallel(gts, brin_map, brin_range_sz); - else - pds = pgstromExecHeapScanChunk(gts, brin_map, brin_range_sz); - - if (pds) - { - if (pds->kds.nitems == 0) - { - /* empty result */ - PDS_release(pds); - pds = NULL; - } - else if (pds->kds.format == KDS_FORMAT_BLOCK && - pds->kds.nitems < pds->kds.nrooms && - pds->nblocks_uncached > 0) - { - /* - * MEMO: Special case handling if KDS_FORMAT_BLOCK was not filled - * up entirely. KDS_FORMAT_BLOCK has an array of block-number to - * support "ctid" system column, located on next to the KDS-head. - * Block-numbers of pre-loaded blocks (hit on shared buffer) are - * used from the head, and others (to be read from the file) are - * used from the tail. If nitems < nrooms, this array has a hole - * on the middle of array. - * So, we have to move later half of the array to close the hole - * and make a flat array.
- */ - BlockNumber *block_nums - = (BlockNumber *)KERN_DATA_STORE_BODY(&pds->kds); + kds_src_pathname = pts->xcmd_buf.len; + appendStringInfoString(&pts->xcmd_buf, pts->kds_pathname); + if (segment_id > 0) + appendStringInfo(&pts->xcmd_buf, ".%u", segment_id); + appendStringInfoChar(&pts->xcmd_buf, '\0'); - memmove(block_nums + (pds->kds.nitems - pds->nblocks_uncached), - block_nums + (pds->kds.nrooms - pds->nblocks_uncached), - sizeof(BlockNumber) * pds->nblocks_uncached); - } - } - /* update statistics */ - if (pds) - { - if (pds->kds.format == KDS_FORMAT_BLOCK) - gts->nvme_count += pds->nblocks_uncached; - InstrStopNode(&gts->outer_instrument, (double)pds->kds.nitems); + sz = offsetof(strom_io_vector, ioc[strom_iovec->nr_chunks]); + kds_src_iovec = __appendBinaryStringInfo(&pts->xcmd_buf, + (const char *)strom_iovec, sz); } else { - InstrStopNode(&gts->outer_instrument, 0.0); + Assert(segment_id == InvalidBlockNumber); } - return pds; -} + xcmd = (XpuCommand *)pts->xcmd_buf.data; + xcmd->u.task.kds_src_pathname = kds_src_pathname; + xcmd->u.task.kds_src_iovec = kds_src_iovec; + xcmd->length = pts->xcmd_buf.len; -/* - * pgstromRewindScanChunk - */ -void -pgstromRewindScanChunk(GpuTaskState *gts) -{ - TableScanDesc tscan = gts->css.ss.ss_currentScanDesc; + xcmd_iov[0].iov_base = xcmd; + xcmd_iov[0].iov_len = xcmd->length; + *xcmd_iovcnt = 1; - InstrEndLoop(&gts->outer_instrument); - if (tscan) - { - table_rescan(tscan, NULL); - ExecScanReScan(&gts->css.ss); - } + return xcmd; } -/* - * pgstromExplainOuterScan - */ -void -pgstromExplainOuterScan(GpuTaskState *gts, - List *deparse_context, - List *ancestors, - ExplainState *es, - List *outer_quals, - Cost outer_startup_cost, - Cost outer_total_cost, - double outer_plan_rows, - int outer_plan_width) -{ - Plan *plannode = gts->css.ss.ps.plan; - Index scanrelid = ((Scan *) plannode)->scanrelid; - Instrumentation *instrument = &gts->outer_instrument; - RangeTblEntry *rte; - const char *refname; - const char *relname; - const char *nspname = NULL; - StringInfoData str; - - /* Does this GpuTaskState has outer simple scan? */ - if (scanrelid == 0) - return; +static bool +__kds_row_insert_tuple(kern_data_store *kds, TupleTableSlot *slot) +{ + uint32_t *rowindex = KDS_GET_ROWINDEX(kds); + HeapTuple tuple; + size_t sz, __usage; + bool should_free; + kern_tupitem *titem; + + Assert(kds->format == KDS_FORMAT_ROW && kds->hash_nslots == 0); + tuple = ExecFetchSlotHeapTuple(slot, false, &should_free); + + __usage = (__kds_unpack(kds->usage) + + MAXALIGN(offsetof(kern_tupitem, htup) + tuple->t_len)); + sz = KDS_HEAD_LENGTH(kds) + sizeof(uint32_t) * (kds->nitems + 1) + __usage; + if (sz > kds->length) + return false; /* no more items!
*/ + titem = (kern_tupitem *)((char *)kds + kds->length - __usage); + titem->t_len = tuple->t_len; + titem->rowid = kds->nitems; + memcpy(&titem->htup, tuple->t_data, tuple->t_len); + kds->usage = rowindex[kds->nitems++] = __kds_packed(__usage); + + if (should_free) + heap_freetuple(tuple); + ExecClearTuple(slot); - /* - * See the logic in ExplainTargetRel() - */ - rte = rt_fetch(scanrelid, es->rtable); - Assert(rte->rtekind == RTE_RELATION); - refname = (char *) list_nth(es->rtable_names, scanrelid - 1); - if (!refname) - refname = rte->eref->aliasname; - relname = get_rel_name(rte->relid); - if (es->verbose) - nspname = get_namespace_name(get_rel_namespace(rte->relid)); - - initStringInfo(&str); - if (es->format == EXPLAIN_FORMAT_TEXT) - { - if (nspname != NULL) - appendStringInfo(&str, "%s.%s", - quote_identifier(nspname), - quote_identifier(relname)); - else if (relname) - appendStringInfo(&str, "%s", - quote_identifier(relname)); - if (!relname || strcmp(refname, relname) != 0) - { - if (str.len > 0) - appendStringInfoChar(&str, ' '); - appendStringInfo(&str, "%s", refname); - } - } - else - { - ExplainPropertyText("Outer Scan Relation", relname, es); - if (nspname) - ExplainPropertyText("Outer Scan Schema", nspname, es); - ExplainPropertyText("Outer Scan Alias", refname, es); - } + return true; +} - if (es->costs) - { - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfo(&str, " (cost=%.2f..%.2f rows=%.0f width=%d)", - outer_startup_cost, - outer_total_cost, - outer_plan_rows, - outer_plan_width); - else - { - ExplainPropertyFloat("Outer Startup Cost", - NULL, outer_startup_cost, 2, es); - ExplainPropertyFloat("Outer Total Cost", - NULL, outer_total_cost, 2, es); - ExplainPropertyFloat("Outer Plan Rows", - NULL, outer_plan_rows, 0, es); - ExplainPropertyFloat("Outer Plan Width", - NULL, outer_plan_width, 0, es); - } - } +XpuCommand * +pgstromRelScanChunkNormal(pgstromTaskState *pts, + struct iovec *xcmd_iov, int *xcmd_iovcnt) +{ + EState *estate = pts->css.ss.ps.state; + TableScanDesc scan = pts->css.ss.ss_currentScanDesc; + TupleTableSlot *slot = pts->base_slot; + kern_data_store *kds; + XpuCommand *xcmd; + size_t sz1, sz2; - /* - * We have to forcibly clean up the instrumentation state because we - * haven't done ExecutorEnd yet. This is pretty grotty ... 
- * See the comment in ExplainNode() - */ - InstrEndLoop(instrument); + pts->xcmd_buf.len = __XCMD_KDS_SRC_OFFSET(&pts->xcmd_buf) + PGSTROM_CHUNK_SIZE; + enlargeStringInfo(&pts->xcmd_buf, 0); + kds = __XCMD_GET_KDS_SRC(&pts->xcmd_buf); + kds->nitems = 0; + kds->usage = 0; + kds->length = PGSTROM_CHUNK_SIZE; - if (es->analyze && instrument->nloops > 0) + if (pts->br_state) { - double nloops = instrument->nloops; - double startup_sec = 1000.0 * instrument->startup / nloops; - double total_sec = 1000.0 * instrument->total / nloops; - double rows = instrument->ntuples / nloops; - - if (es->format == EXPLAIN_FORMAT_TEXT) - { - if (es->timing) - appendStringInfo( - &str, - " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", - startup_sec, total_sec, rows, nloops); - else - appendStringInfo( - &str, - " (actual rows=%.0f loops=%.0f)", - rows, nloops); - } - else + /* scan by BRIN index */ + while (!pts->scan_done) { - if (es->timing) + if (!pts->curr_tbm) { - ExplainPropertyFloat("Outer Actual Startup Time", - NULL, startup_sec, 3, es); - ExplainPropertyFloat("Outer Actual Total Time", - NULL, total_sec, 3, es); + TBMIterateResult *next_tbm = pgstromBrinIndexNextBlock(pts); + + if (!next_tbm) + { + pts->scan_done = true; + break; + } + if (!table_scan_bitmap_next_block(scan, next_tbm)) + elog(ERROR, "failed on table_scan_bitmap_next_block"); + pts->curr_tbm = next_tbm; } - ExplainPropertyFloat("Outer Actual Rows", NULL, rows, 0, es); - ExplainPropertyFloat("Outer Actual Loops", NULL, nloops, 0, es); + if (!TTS_EMPTY(slot) && + !__kds_row_insert_tuple(kds, slot)) + break; + if (!table_scan_bitmap_next_tuple(scan, pts->curr_tbm, slot)) + pts->curr_tbm = NULL; + else if (!__kds_row_insert_tuple(kds, slot)) + break; } } - else if (es->analyze) + else { - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfoString(&str, " (never executed)"); - else + /* full table scan */ + while (!pts->scan_done) { - if (es->timing) + if (!TTS_EMPTY(slot) && + !__kds_row_insert_tuple(kds, slot)) + break; + if (!table_scan_getnextslot(scan, estate->es_direction, slot)) { - ExplainPropertyFloat("Outer Actual Startup Time", - NULL, 0.0, 3, es); - ExplainPropertyFloat("Outer Actual Total Time", - NULL, 0.0, 3, es); + pts->scan_done = true; + break; } - ExplainPropertyFloat("Outer Actual Rows", - NULL, 0.0, 0, es); - ExplainPropertyFloat("Outer Actual Loops", - NULL, 0.0, 0, es); + if (!__kds_row_insert_tuple(kds, slot)) + break; } } - if (es->format == EXPLAIN_FORMAT_TEXT) - ExplainPropertyText("Outer Scan", str.data, es); - if (outer_quals) - { - Expr *quals_expr; - char *temp; - - quals_expr = make_ands_explicit(outer_quals); - temp = deparse_expression((Node *)quals_expr, - deparse_context, - es->verbose, false); - ExplainPropertyText("Outer Scan Filter", temp, es); - - if (gts->outer_instrument.nfiltered1 > 0.0) - ExplainPropertyFloat("Rows Removed by Outer Scan Filter", - NULL, - gts->outer_instrument.nfiltered1 / - gts->outer_instrument.nloops, - 0, es); - } - /* properties of BRIN-index */ - pgstromExplainBrinIndexMap(gts, es, deparse_context); + if (kds->nitems == 0) + return NULL; + + /* setup iovec that may skip the hole between row-index and tuples-buffer */ + sz1 = ((KDS_BODY_ADDR(kds) - pts->xcmd_buf.data) + + MAXALIGN(sizeof(uint32_t) * kds->nitems)); + sz2 = __kds_unpack(kds->usage); + Assert(sz1 + sz2 <= pts->xcmd_buf.len); + kds->length = (KDS_HEAD_LENGTH(kds) + + MAXALIGN(sizeof(uint32_t) * kds->nitems) + sz2); + xcmd = (XpuCommand *)pts->xcmd_buf.data; + xcmd->length = sz1 + sz2; + 
xcmd_iov[0].iov_base = xcmd; + xcmd_iov[0].iov_len = sz1; + xcmd_iov[1].iov_base = (pts->xcmd_buf.data + pts->xcmd_buf.len - sz2); + xcmd_iov[1].iov_len = sz2; + *xcmd_iovcnt = 2; + + return xcmd; } -/* - * pgstrom_init_relscan - */ void pgstrom_init_relscan(void) { - static char *nvme_manual_distance_map = NULL; - char buffer[1280]; - int index = 0; - - /* pg_strom.enable_brin */ - DefineCustomBoolVariable("pg_strom.enable_brin", - "Enables to use BRIN-index", - NULL, - &pgstrom_enable_brin, - true, - PGC_USERSET, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - /* - * pg_strom.nvme_distance_map - * - * config := [,...] - * token := nvmeXX:gpuXX - * - * eg) nvme0:gpu0,nvme1:gpu1 - */ - DefineCustomStringVariable("pg_strom.nvme_distance_map", - "Manual configuration of optimal GPU for each NVME", - NULL, - &nvme_manual_distance_map, - NULL, - PGC_POSTMASTER, - GUC_NOT_IN_SAMPLE, - NULL, NULL, NULL); - extraSysfsSetupDistanceMap(nvme_manual_distance_map); - while (extraSysfsPrintNvmeInfo(index, buffer, sizeof(buffer)) >= 0) - { - elog(LOG, "- %s", buffer); - index++; - } - - /* hash table for tablespace <-> optimal GPU */ - tablespace_optimal_gpu_htable = NULL; - CacheRegisterSyscacheCallback(TABLESPACEOID, - tablespace_optimal_gpu_cache_callback, - (Datum) 0); + /* nothing to do */ } diff --git a/next/pg_strom--5.0.sql b/src/sql/pg_strom--5.0.sql similarity index 100% rename from next/pg_strom--5.0.sql rename to src/sql/pg_strom--5.0.sql diff --git a/src/tinyint.c b/src/tinyint.c index 3df806f51..6fcc47211 100644 --- a/src/tinyint.c +++ b/src/tinyint.c @@ -3,8 +3,8 @@ * * 8bit-width integer data type support * ---- - * Copyright 2011-2021 (C) KaiGai Kohei - * Copyright 2014-2021 (C) PG-Strom Developers Team + * Copyright 2011-2023 (C) KaiGai Kohei + * Copyright 2014-2023 (C) PG-Strom Developers Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the PostgreSQL License. 
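The new pgstromRelScanChunkNormal() above fills a KDS_FORMAT_ROW chunk from both ends: __kds_row_insert_tuple() grows the uint32 row-index forward from the KDS head while tuple bodies grow backward from the tail, the chunk is full once the two regions would collide, and the final two-element iovec ships the head and tail separately, skipping the unused hole in the middle. The following stand-alone sketch models that two-ended packing with plain offsets; toy_kds and toy_insert are illustrative stand-ins, not the real kern_data_store / __kds_packed() machinery, and the KDS header is reduced to nothing:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* toy KDS_FORMAT_ROW chunk: index grows from the head, tuples from the tail */
typedef struct
{
    char       *buf;        /* whole chunk */
    size_t      length;     /* chunk size */
    uint32_t    nitems;     /* number of tuples stored */
    size_t      usage;      /* bytes consumed from the tail */
} toy_kds;

static bool
toy_insert(toy_kds *kds, const void *tuple, size_t t_len)
{
    uint32_t   *rowindex = (uint32_t *)kds->buf;
    size_t      usage = kds->usage + ((t_len + 7) & ~(size_t)7);  /* MAXALIGN-like */
    size_t      head = sizeof(uint32_t) * (kds->nitems + 1);      /* row-index array */

    if (head + usage > kds->length)
        return false;               /* chunk is full - caller ships it as-is */
    memcpy(kds->buf + kds->length - usage, tuple, t_len);
    rowindex[kds->nitems++] = (uint32_t)usage;   /* offset back from the tail */
    kds->usage = usage;
    return true;
}

int main(void)
{
    toy_kds     kds = { malloc(64), 64, 0, 0 };
    const char *rows[] = { "alpha", "beta", "gamma" };

    for (int i = 0; i < 3; i++)
        if (!toy_insert(&kds, rows[i], strlen(rows[i]) + 1))
            break;
    for (uint32_t i = 0; i < kds.nitems; i++)    /* tuple i begins at tail - rowindex[i] */
        printf("%s\n", kds.buf + kds.length - ((uint32_t *)kds.buf)[i]);
    free(kds.buf);
    return 0;
}

Packing from both ends keeps each insertion O(1) and lets the caller detect a full chunk with a single length comparison, which is why the scan loops above simply break and send the current chunk whenever __kds_row_insert_tuple() returns false.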
diff --git a/next/xpu_basetype.cu b/src/xpu_basetype.cu similarity index 100% rename from next/xpu_basetype.cu rename to src/xpu_basetype.cu diff --git a/next/xpu_basetype.h b/src/xpu_basetype.h similarity index 100% rename from next/xpu_basetype.h rename to src/xpu_basetype.h diff --git a/next/xpu_common.cu b/src/xpu_common.cu similarity index 100% rename from next/xpu_common.cu rename to src/xpu_common.cu diff --git a/next/xpu_common.h b/src/xpu_common.h similarity index 100% rename from next/xpu_common.h rename to src/xpu_common.h diff --git a/next/xpu_misclib.cu b/src/xpu_misclib.cu similarity index 100% rename from next/xpu_misclib.cu rename to src/xpu_misclib.cu diff --git a/next/xpu_misclib.h b/src/xpu_misclib.h similarity index 100% rename from next/xpu_misclib.h rename to src/xpu_misclib.h diff --git a/next/xpu_numeric.cu b/src/xpu_numeric.cu similarity index 100% rename from next/xpu_numeric.cu rename to src/xpu_numeric.cu diff --git a/next/xpu_numeric.h b/src/xpu_numeric.h similarity index 100% rename from next/xpu_numeric.h rename to src/xpu_numeric.h diff --git a/next/xpu_opcodes.h b/src/xpu_opcodes.h similarity index 100% rename from next/xpu_opcodes.h rename to src/xpu_opcodes.h diff --git a/next/xpu_textlib.cu b/src/xpu_textlib.cu similarity index 100% rename from next/xpu_textlib.cu rename to src/xpu_textlib.cu diff --git a/next/xpu_textlib.h b/src/xpu_textlib.h similarity index 100% rename from next/xpu_textlib.h rename to src/xpu_textlib.h diff --git a/next/xpu_timelib.cu b/src/xpu_timelib.cu similarity index 100% rename from next/xpu_timelib.cu rename to src/xpu_timelib.cu diff --git a/next/xpu_timelib.h b/src/xpu_timelib.h similarity index 100% rename from next/xpu_timelib.h rename to src/xpu_timelib.h diff --git a/utils/Makefile b/utils/Makefile deleted file mode 100644 index e7b608198..000000000 --- a/utils/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -ifndef STROM_BUILD_ROOT -STROM_BUILD_ROOT = .. -endif -include $(STROM_BUILD_ROOT)/Makefile diff --git a/utils/ssbm/bcd2.c b/utils/ssbm/bcd2.c deleted file mode 100644 index 30038ba6f..000000000 --- a/utils/ssbm/bcd2.c +++ /dev/null @@ -1,237 +0,0 @@ -/* @(#)bcd2.c 2.1.8.1 */ -/* - * bcd.c: conversion routines for multi-byte arithmetic - * - * defined routines: - * bin_bcd2(long binary, long *low_res, long *high_res) - * bcd2_bin(long *dest, long bcd) - * bcd2_add(long *bcd_low, long *bcd_high, long addend) - * bcd2_sub(long *bcd_low, long *bcd_high, long subend) - * bcd2_mul(long *bcd_low, long *bcd_high, long multiplier) - * bcd2_div(long *bcd_low, long *bcd_high, long divisor) - * long bcd2_mod(long *bcd_low, long *bcd_high, long modulo) - * long bcd2_cmp(long *bcd_low, long *bcd_high, long compare) - */ -#include -#include "bcd2.h" /* for function prototypes */ - -#define DIGITS_PER_LONG 7 -#define WORD_DIVISOR 10000000 -#define GET_DIGIT(num, low, high) \ - ((num) >= DIGITS_PER_LONG)? 
\ - (high & (0xF << (4 * ((num) - DIGITS_PER_LONG)))) \ - >> (((num) - DIGITS_PER_LONG) * 4): \ - (low & (0xF << (4 * (num)))) >> ((num) * 4) -#define SET_DIGIT(value, num, low, high) \ - if ((num) >= DIGITS_PER_LONG) \ - { \ - *high &= \ - (0xFFFFFFF ^ (0xF << (4 * ((num) - DIGITS_PER_LONG)))); \ - *high |= (value << (4 * ((num) - DIGITS_PER_LONG))); \ - } \ - else \ - { \ - *low = (*low & (0xFFFFFFF ^ (0xF << (4 * (num))))); \ - *low |= (value << (4 * (num))); \ - } -int -bin_bcd2(long binary, long *low_res, long *high_res) -{ - char number[15], - *current; - int count; - long *dest; - - *low_res = *high_res = 0; - sprintf(number, "%014ld", binary); - for (current = number, count=13; *current; current++, count--) - { - dest = (count < DIGITS_PER_LONG)?low_res:high_res; - *dest = *dest << 4; - *dest |= *current - '0'; - } - return(0); -} - -int -bcd2_bin(long *dest, long bcd) -{ - int count; - long mask; - - count = DIGITS_PER_LONG - 1; - mask = 0xF000000; - *dest = 0; - while (mask) - { - *dest *= 10; - *dest += (bcd & mask) >> (4 * count); - mask = mask >> 4; - count -= 1; - } - return(0); -} - -int -bcd2_add(long *bcd_low, long *bcd_high, long addend) -{ - long tmp_lo, tmp_hi, carry, res; - int digit; - - bin_bcd2(addend, &tmp_lo, &tmp_hi); - carry = 0; - for (digit=0; digit < 14; digit++) - { - res = GET_DIGIT(digit, *bcd_low, *bcd_high); - res += GET_DIGIT(digit, tmp_lo, tmp_hi); - res += carry; - carry = res / 10; - res %= 10; - SET_DIGIT(res, digit, bcd_low, bcd_high); - } - return(carry); -} - -int -bcd2_sub(long *bcd_low, long *bcd_high, long subend) -{ - long tmp_lo, tmp_hi, carry, res; - int digit; - - bin_bcd2(subend, &tmp_lo, &tmp_hi); - carry = 0; - for (digit=0; digit < 14; digit++) - { - res = GET_DIGIT(digit, *bcd_low, *bcd_high); - res -= GET_DIGIT(digit, tmp_lo, tmp_hi); - res -= carry; - if (res < 0) - { - res += 10; - carry = 1; - } - SET_DIGIT(res, digit, bcd_low, bcd_high); - } - return(carry); -} - -int -bcd2_mul(long *bcd_low, long *bcd_high, long multiplier) -{ - long tmp_lo, tmp_hi, carry, m_lo, m_hi, m1, m2; - int udigit, ldigit, res; - - tmp_lo = *bcd_low; - tmp_hi = *bcd_high; - bin_bcd2(multiplier, &m_lo, &m_hi); - *bcd_low = 0; - *bcd_high = 0; - carry = 0; - for (ldigit=0; ldigit < 14; ldigit++) - { - m1 = GET_DIGIT(ldigit, m_lo, m_hi); - carry = 0; - for (udigit=0; udigit < 14; udigit++) - { - m2 = GET_DIGIT(udigit, tmp_lo, tmp_hi); - res = m1 * m2; - res += carry; - if (udigit + ldigit < 14) - { - carry = GET_DIGIT(udigit + ldigit, *bcd_low, *bcd_high); - res += carry; - } - carry = res / 10; - res %= 10; - if (udigit + ldigit < 14) - SET_DIGIT(res, udigit + ldigit, bcd_low, bcd_high); - } - } - return(carry); -} - -int -bcd2_div(long *bcd_low, long *bcd_high, long divisor) -{ - long tmp_lo, tmp_hi, carry, d1, res, digit; - - - carry = 0; - tmp_lo = *bcd_low; - tmp_hi = *bcd_high; - *bcd_low = *bcd_high = 0; - for (digit=13; digit >= 0; digit--) - { - d1 = GET_DIGIT(digit, tmp_lo, tmp_hi); - d1 += 10 * carry; - res = d1 / divisor; - carry = d1 % divisor; - SET_DIGIT(res, digit, bcd_low, bcd_high); - } - return(carry); -} - -long -bcd2_mod(long *bcd_low, long *bcd_high, long modulo) -{ - long tmp_low, tmp_high; - - tmp_low = *bcd_low; - tmp_high = *bcd_high; - while (tmp_high || tmp_low > modulo) - bcd2_sub(&tmp_low, &tmp_high, modulo); - return(tmp_low); -} - -long -bcd2_cmp(long *low1, long *high1, long comp) -{ - long temp = 0; - - bcd2_bin(&temp, *high1); - if (temp > 214) - return(1); - bcd2_bin(&temp, *low1); - return(temp - comp); -} - -#ifdef 
TEST_BCD -#include - -main() -{ -long bin, low_bcd, high_bcd; -int i; - -bin = MAXINT; -printf("%ld\n", bin); -bin_bcd2(bin, &low_bcd, &high_bcd); -printf("%ld %ld\n", high_bcd, low_bcd); -bin = 0; -bcd2_bin(&bin, high_bcd); -bcd2_bin(&bin, low_bcd); -printf( "%ld\n", bin); -for (i=9; i >= 0; i--) - printf("%dth digit in %d is %d\n", - i, bin, GET_DIGIT(i, low_bcd, high_bcd)); -bcd2_add(&low_bcd, &high_bcd, MAXINT); -bin = 0; -bcd2_bin(&bin, high_bcd); -high_bcd = bin; -bin = 0; -bcd2_bin(&bin, low_bcd); -low_bcd = bin; -printf( "%ld%07ld\n", high_bcd, low_bcd); -bin_bcd2(14, &low_bcd, &high_bcd); -bcd2_mul(&low_bcd, &high_bcd, 23L); -bin = 0; -bcd2_bin(&bin, high_bcd); -bcd2_bin(&bin, low_bcd); -printf( "%ld\n", bin); -bcd2_div(&low_bcd, &high_bcd, 10L); -bin = 0; -bcd2_bin(&bin, high_bcd); -bcd2_bin(&bin, low_bcd); -printf( "%ld\n", bin); -} -#endif /* TEST */ diff --git a/utils/ssbm/bcd2.h b/utils/ssbm/bcd2.h deleted file mode 100644 index 6ea92a130..000000000 --- a/utils/ssbm/bcd2.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * Sccsid: @(#)bcd2.h 2.1.8.1 - */ -int bin_bcd2(long binary, long *low_res, long *high_res); -int bcd2_bin(long *dest, long bcd); -int bcd2_add(long *bcd_low, long *bcd_high, long addend); -int bcd2_sub(long *bcd_low, long *bcd_high, long subend); -int bcd2_mul(long *bcd_low, long *bcd_high, long multiplier); -int bcd2_div(long *bcd_low, long *bcd_high, long divisor); -long bcd2_mod(long *bcd_low, long *bcd_high, long modulo); -long bcd2_cmp(long *bcd_low, long *bcd_high, long compare); diff --git a/utils/ssbm/bm_utils.c b/utils/ssbm/bm_utils.c deleted file mode 100644 index c537ee8eb..000000000 --- a/utils/ssbm/bm_utils.c +++ /dev/null @@ -1,638 +0,0 @@ -/* @(#)bm_utils.c 2.1.8.2 */ -/* - * - * Various routines that handle distributions, value selections and - * seed value management for the DSS benchmark. 
Current functions: - * env_config -- set config vars with optional environment override - * yes_no -- ask simple yes/no question and return boolean result - * a_rnd(min, max) -- random alphanumeric within length range - * pick_str(size, set) -- select a string from the set of size - * read_dist(file, name, distribution *) -- read named dist from file - * tbl_open(path, mode) -- std fopen with lifenoise - * julian(date) -- julian date correction - * rowcnt(tbl) -- proper scaling of given table - * e_str(set, min, max) -- build an embedded str - * agg_str() -- build a string from the named set - * dsscasecmp() -- version of strcasecmp() - * dssncasecmp() -- version of strncasecmp() - * getopt() - * set_state() -- initialize the RNG - */ - -/*this has to be put on top...*/ -#ifdef LINUX -/* turn on GNU extensions, incl O_DIRECT */ -/* O_LARGEFILE is defined in fcntl.h*/ -#define _GNU_SOURCE -#endif - -#include "dss.h" -#include -#include -#include -#include - -#ifdef HP -#include -#endif /* HP */ -#include -#include -#ifndef _POSIX_SOURCE -#include -#endif /* POSIX_SOURCE */ - -#include - -#ifdef IBM -#include -#endif /* IBM */ -#include -#include -/* Lines added by Chuck McDevitt for WIN32 support */ -#if (defined(WIN32)||defined(DOS)) -#ifndef _POSIX_ -#include -#ifndef S_ISREG - -#define S_ISREG(m) ( ((m) & _S_IFMT) == _S_IFREG ) -#define S_ISFIFO(m) ( ((m) & _S_IFMT) == _S_IFIFO ) - -#endif -#endif -#ifndef stat -#define stat _stat -#endif -#ifndef fdopen -#define fdopen _fdopen -#endif -#ifndef open -#define open _open -#endif -#ifndef O_RDONLY -#define O_RDONLY _O_RDONLY -#endif -#ifndef O_WRONLY -#define O_WRONLY _O_WRONLY -#endif -#ifndef O_CREAT -#define O_CREAT _O_CREAT -#endif -#endif -/* End of lines added by Chuck McDevitt for WIN32 support */ -#include "dsstypes.h" - - -static char alpha_num[65] = -"0123456789abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ,"; - -#if defined(__STDC__) || defined(__cplusplus) -#define PROTO(s) s -#else -#define PROTO(s) () -#endif - -char *getenv PROTO((const char *name)); -void usage(); -long *permute_dist(distribution *d, long stream); -extern long Seed[]; - -/* - * env_config: look for a environmental variable setting and return its - * value; otherwise return the default supplied - */ -char * -env_config(char *var, char *dflt) -{ - static char *evar; - - if ((evar = getenv(var)) != NULL) - return (evar); - else - return (dflt); -} - -/* - * return the answer to a yes/no question as a boolean - */ -long -yes_no(char *prompt) -{ - char reply[128]; - -#ifdef WIN32 -/* Disable warning about conditional expression is constant */ -#pragma warning(disable:4127) -#endif - - while (1) - { -#ifdef WIN32 -#pragma warning(default:4127) -#endif - printf("%s [Y/N]: ", prompt); - if (!fgets(reply, sizeof(reply), stdin)) - reply[0] = '\0'; - switch (*reply) - { - case 'y': - case 'Y': - return (1); - case 'n': - case 'N': - return (0); - default: - printf("Please answer 'yes' or 'no'.\n"); - } - } -} - -/* - * generate a random string with length randomly selected in [min, max] - * and using the characters in alphanum (currently includes a space - * and comma) - */ -int -a_rnd(int min, int max, int column, char *dest) -{ - long i, - len, - char_int; - - RANDOM(len, min, max, column); - for (i = 0; i < len; i++) - { - if (i % 5 == 0) - RANDOM(char_int, 0, MAX_LONG, column); - *(dest + i) = alpha_num[char_int & 077]; - char_int >>= 6; - } - *(dest + len) = '\0'; - return (len); -} - -/* - * embed a randomly selected member of distribution d in 
alpha-numeric - * noise of a length rendomly selected between min and max at a random - * position - */ -void -e_str(distribution *d, int min, int max, int stream, char *dest) -{ - char strtmp[MAXAGG_LEN + 1]; - long loc; - int len; - - a_rnd(min, max, stream, dest); - pick_str(d, stream, strtmp); - len = strlen(strtmp); - RANDOM(loc, 0, (strlen(dest) - 1 - len), stream); - strncpy(dest + loc, strtmp, len); - - return; -} - - -/* - * return the string associate with the LSB of a uniformly selected - * long in [1, max] where max is determined by the distribution - * being queried - */ -int -pick_str(distribution *s, int c, char *target) -{ - long i = 0; - long j; - - RANDOM(j, 1, s->list[s->count - 1].weight, c); - while (s->list[i].weight < j) - i++; - strcpy(target, s->list[i].text); - return(i); -} - -/* - * unjulian (long date) -- return(date - STARTDATE) - */ -long -unjulian(long date) -{ - int i; - long res = 0; - - for (i = STARTDATE / 1000; i < date / 1000; i++) - res += 365 + LEAP(i); - res += date % 1000 - 1; - - return(res); -} - -long -julian(long date) -{ - long offset; - long result; - long yr; - long yend; - - offset = date - STARTDATE; - result = STARTDATE; - -#ifdef WIN32 -/* Disable warning about conditional expression is constant */ -#pragma warning(disable:4127) -#endif - - while (1) - { -#ifdef WIN32 -#pragma warning(default:4127) -#endif - yr = result / 1000; - yend = yr * 1000 + 365 + LEAP(yr); - if (result + offset > yend) /* overflow into next year */ - { - offset -= yend - result + 1; - result += 1000; - continue; - } - else - break; - } - return (result + offset); -} - -/* - * static dists_dss - */ -#define STATIC_DISTS 1 -#ifdef STATIC_DISTS -#include "dists.dss.h" -static char * -static_dist_fgets(char *d, int len, size_t *pos) -{ - static size_t limit = 0; - size_t cur = *pos; - const char *s = static_dists_dss + cur; - int i = 0; - - if (limit == 0) - limit = strlen(static_dists_dss); - if (cur >= limit || *s == '\0') - return NULL; - while (i < len - 1) - { - int c = s[i]; - - if (c == '\0') - break; - d[i++] = c; - if (c == '\n') - break; - } - d[i] = '\0'; - *pos += i; - - return d; -} -#endif - -/* -* load a distribution from a flat file into the target structure; -* should be rewritten to allow multiple dists in a file -*/ -void -read_dist(char *path, char *name, distribution *target) -{ -#ifndef STATIC_DISTS - FILE *fp; -#else - size_t pos = 0; -#endif -char line[256], - token[256], - *c; -long weight, - count = 0, - name_set = 0; -#ifndef STATIC_DISTS - if (d_path == NULL) - { - sprintf(line, "%s%c%s", - env_config(CONFIG_TAG, CONFIG_DFLT), PATH_SEP, path); - fp = fopen(line, "r"); - OPEN_CHECK(fp, line); - } - else - { - fp = fopen(d_path, "r"); - OPEN_CHECK(fp, d_path); - } - while (fgets(line, sizeof(line), fp) != NULL) -#else - while (static_dist_fgets(line, sizeof(line), &pos) != NULL) -#endif - { - if ((c = strchr(line, '\n')) != NULL) - *c = '\0'; - if ((c = strchr(line, '#')) != NULL) - *c = '\0'; - if (*line == '\0') - continue; - - if (!name_set) - { - if (dsscasecmp(strtok(line, "\n\t "), "BEGIN")) - continue; - if (dsscasecmp(strtok(NULL, "\n\t "), name)) - continue; - name_set = 1; - continue; - } - else - { - if (!dssncasecmp(line, "END", 3)) - { -#ifndef STATIC_DISTS - fclose(fp); -#endif - return; - } - } - - if (sscanf(line, "%[^|]|%ld", token, &weight) != 2) - continue; - - if (!dsscasecmp(token, "count")) - { - target->count = weight; - target->list = - (set_member *) - malloc((size_t)(weight * sizeof(set_member))); - 
MALLOC_CHECK(target->list); - target->max = 0; - continue; - } - target->list[count].text = - (char *) malloc((size_t)(strlen(token) + 1)); - MALLOC_CHECK(target->list[count].text); - strcpy(target->list[count].text, token); - target->max += weight; - target->list[count].weight = target->max; - - count += 1; - } /* while fgets() */ - - if (count != target->count) - { - fprintf(stderr, "Read error on dist '%s'\n", name); -#ifndef STATIC_DISTS - fclose(fp); -#endif - exit(1); - } - target->permute = (long *)NULL; -#ifndef STATIC_DISTS - fclose(fp); -#endif - return; -} - -/* - * standard file open with life noise - */ - -FILE * -tbl_open(int tbl, char *mode) -{ - char prompt[256]; - char fullpath[256]; - FILE *f; - struct stat fstats; - int retcode; - - - if (*tdefs[tbl].name == PATH_SEP) - strcpy(fullpath, tdefs[tbl].name); - else - sprintf(fullpath, "%s%c%s", - env_config(PATH_TAG, PATH_DFLT), PATH_SEP, tdefs[tbl].name); - - retcode = stat(fullpath, &fstats); - if (retcode && (errno != ENOENT)) - { - fprintf(stderr, "stat(%s) failed.\n", fullpath); - exit(-1); - } - if (S_ISREG(fstats.st_mode) && !force && *mode != 'r' ) - { - sprintf(prompt, "Do you want to overwrite %s ?", fullpath); - if (!yes_no(prompt)) - exit(0); - } - - if (S_ISFIFO(fstats.st_mode)) - { - retcode = - open(fullpath, ((*mode == 'r')?O_RDONLY:O_WRONLY)|O_CREAT); - f = fdopen(retcode, mode); - } - else{ - -#ifdef LINUX - /* allow large files on Linux */ - /*use open to first to get the in fd and apply regular fdopen*/ - - /*cheng: Betty mentioned about write mode problem here, added 066*/ - retcode = - open(fullpath, ((*mode == 'r')?O_RDONLY:O_WRONLY)|O_CREAT|O_LARGEFILE,0644); - f = fdopen(retcode, mode); -#else - f = fopen(fullpath, mode); -#endif - - } - OPEN_CHECK(f, fullpath); - if (header && columnar && tdefs[tbl].header != NULL) - tdefs[tbl].header(f); - - return (f); -} - - -/* - * agg_str(set, count) build an aggregated string from count unique - * selections taken from set - */ -void -agg_str(distribution *set, long count, long col, char *dest) -{ - distribution *d; - int i; - - - d = set; - *dest = '\0'; - for (i=0; i < count; i++) - { - strcat(dest, DIST_MEMBER(set,*permute_dist(d, col))); - - strcat(dest, " "); - d = (distribution *)NULL; - } - *(dest + strlen(dest) - 1) = '\0'; - return; -} - - -long -dssncasecmp(char *s1, char *s2, int n) -{ - for (; n > 0; ++s1, ++s2, --n) - if (tolower(*s1) != tolower(*s2)) - return ((tolower(*s1) < tolower(*s2)) ? -1 : 1); - else if (*s1 == '\0') - return (0); - return (0); -} - -long -dsscasecmp(char *s1, char *s2) -{ - for (; tolower(*s1) == tolower(*s2); ++s1, ++s2) - if (*s1 == '\0') - return (0); - return ((tolower(*s1) < tolower(*s2)) ? 
-1 : 1); -} - -#ifndef STDLIB_HAS_GETOPT -int optind = 0; -int opterr = 0; -char *optarg = NULL; - -int -getopt(int ac, char **av, char *opt) -{ - static char *nextchar = NULL; - char *cp; - char hold; - - if (optarg == NULL) - { - optarg = (char *)malloc(BUFSIZ); - MALLOC_CHECK(optarg); - } - - if (!nextchar || *nextchar == '\0') - { - optind++; - if (optind == ac) - return(-1); - nextchar = av[optind]; - if (*nextchar != '-') - return(-1); - nextchar +=1; - } - - if (nextchar && *nextchar == '-') /* -- termination */ - { - optind++; - return(-1); - } - else /* found an option */ - { - cp = strchr(opt, *nextchar); - nextchar += 1; - if (cp == NULL) /* not defined for this run */ - return('?'); - if (*(cp + 1) == ':') /* option takes an argument */ - { - if (*nextchar) - { - hold = *cp; - cp = optarg; - while (*nextchar) - *cp++ = *nextchar++; - *cp = '\0'; - *cp = hold; - } - else /* white space separated, use next arg */ - { - if (++optind == ac) - return('?'); - strcpy(optarg, av[optind]); - } - nextchar = NULL; - } - return(*cp); - } -} -#endif /* STDLIB_HAS_GETOPT */ - -char ** -mk_ascdate(void) -{ - char **m; - dss_time_t t; - int i; - - m = (char**) malloc((size_t)(TOTDATE * sizeof (char *))); - MALLOC_CHECK(m); - for (i = 0; i < TOTDATE; i++) - { - m[i] = (char *)malloc(DATE_LEN * sizeof(char)); - MALLOC_CHECK(m[i]); - mk_time((long)(i + 1), &t); - strcpy(m[i], t.alpha); - } - - return(m); -} - -/* - * set_state() -- initialize the RNG so that - * appropriate data sets can be generated. - * For each table that is to be generated, calculate the number of rows/child, and send that to the - * seed generation routine in speed_seed.c. Note: assumes that tables are completely independent. - * Returns the number of rows to be generated by the named step. - */ -long -set_state(int table, long sf, long procs, long step, long *extra_rows) -{ - int i; - long rowcount, remainder, result; - - if (sf == 0 || step == 0) - return(0); - - rowcount = tdefs[table].base / procs; - if ((sf / procs) > (int)MAX_32B_SCALE) - INTERNAL_ERROR("SCALE OVERFLOW. RE-RUN WITH MORE CHILDREN."); - rowcount *= sf; - remainder = (tdefs[table].base % procs) * sf; - rowcount += remainder / procs; - result = rowcount; - for (i=0; i < step - 1; i++) - { - if (table == LINE) /* special case for shared seeds */ - tdefs[table].gen_seed(1, rowcount); - else - tdefs[table].gen_seed(0, rowcount); - /* need to set seeds of child in case there's a dependency */ - /* NOTE: this assumes that the parent and child have the same base row count */ - if (tdefs[table].child != NONE) - tdefs[tdefs[table].child].gen_seed(0,rowcount); - } - *extra_rows = remainder % procs; - if (step > procs) /* moving to the end to generate updates */ - tdefs[table].gen_seed(*extra_rows); - - return(result); -} - - - - - - - - diff --git a/utils/ssbm/build.c b/utils/ssbm/build.c deleted file mode 100644 index a80df2413..000000000 --- a/utils/ssbm/build.c +++ /dev/null @@ -1,802 +0,0 @@ -/* @(#)build.c 2.1.8.1 */ -/* Sccsid: @(#)build.c 9.1.1.17 11/15/95 12:52:28 */ -/* stuff related to the customer table */ -#include -#include -#ifdef SSBM -#include -#endif -#ifndef VMS -#include -#endif -#if defined(SUN) -#include -#endif - -#if defined(LINUX) -#include -#endif - -#include - -#include "dss.h" -#include "dsstypes.h" -#include "bcd2.h" -#ifdef ADHOC -#include "adhoc.h" -extern adhoc_t adhocs[]; -#endif /* ADHOC */ - -#define LEAP_ADJ(yr, mnth) \ -((LEAP(yr) && (mnth) >= 2) ? 
1 : 0) -#define JDAY_BASE 8035 /* start from 1/1/70 a la unix */ -#define JMNTH_BASE (-70 * 12) /* start from 1/1/70 a la unix */ -#define JDAY(date) ((date) - STARTDATE + JDAY_BASE + 1) -#define PART_SUPP_BRIDGE(tgt, p, s) \ - { \ - long tot_scnt = tdefs[SUPP].base * scale; \ - tgt = (p + s * (tot_scnt / SUPP_PER_PART + \ - (long) ((p - 1) / tot_scnt))) % tot_scnt + 1; \ - } -#define RPRICE_BRIDGE(tgt, p) tgt = rpb_routine(p) -#define V_STR(avg, sd, tgt) a_rnd((int)(avg * V_STR_LOW), \ -(int)(avg * V_STR_HGH), sd, tgt) -#define TEXT(avg, sd, tgt) \ -dbg_text(tgt, (int)(avg * V_STR_LOW),(int)(avg * V_STR_HGH), sd) -static void gen_phone PROTO((long ind, char *target, long seed)); - -#ifdef SSBM -static void gen_category PROTO((char *target, long seed)); -int gen_city PROTO((char *cityName, char *nationName)); -int gen_season PROTO((char * dest,int month,int day)); -int is_last_day_in_month PROTO((int year,int month,int day)); -int gen_holiday_fl PROTO((char * dest, int month, int day)); -int gen_city PROTO((char *cityName, char *nationName)); -int gen_color PROTO((char * source, char * dest)); -#endif - - -long -rpb_routine(long p) - { - long price; - price = 90000; - price += (p/10) % 20001; /* limit contribution to $200 */ - price += (p % 1000) * 100; - - return(price); - } - -static void -gen_phone(long ind, char *target, long seed) - { - long acode, - exchg, - number; - - RANDOM(acode, 100, 999, seed); - RANDOM(exchg, 100, 999, seed); - RANDOM(number, 1000, 9999, seed); - sprintf(target, "%02ld", 10 + (ind % NATIONS_MAX)); - sprintf(target + 3, "%03ld", acode); - sprintf(target + 7, "%03ld", exchg); - sprintf(target + 11, "%04ld", number); - target[2] = target[6] = target[10] = '-'; - return; -} - -static void -gen_category(char *target, long seed){ - long num1,num2; - RANDOM(num1,1,5,seed); - RANDOM(num2,1,5,seed); - strcpy(target,"MFGR"); - sprintf(target + 4, "%01ld", num1); - sprintf(target + 5, "%01ld", num2); - return; -} - -#ifdef SSBM -long mk_cust(long n_cust, customer_t *c) -{ - long i; - c->custkey = n_cust; - sprintf(c->name, C_NAME_FMT, C_NAME_TAG, n_cust); - c->alen = V_STR(C_ADDR_LEN, C_ADDR_SD, c->address); - RANDOM(i, 0, nations.count-1, C_NTRG_SD); - strcpy(c->nation_name,nations.list[i].text); - strcpy(c->region_name,regions.list[nations.list[i].weight].text); - gen_city(c->city,c->nation_name); - gen_phone(i, c->phone, (long)C_PHNE_SD); - pick_str(&c_mseg_set, C_MSEG_SD, c->mktsegment); - return (0); - } - -#else -long -mk_cust(long n_cust, customer_t *c) - { - long i; - - c->custkey = n_cust; - sprintf(c->name, C_NAME_FMT, C_NAME_TAG, n_cust); - c->alen = V_STR(C_ADDR_LEN, C_ADDR_SD, c->address); - RANDOM(i, 0, (nations.count - 1), C_NTRG_SD); - c->nation_code = i; - gen_phone(i, c->phone, (long)C_PHNE_SD); - RANDOM(c->acctbal, C_ABAL_MIN, C_ABAL_MAX, C_ABAL_SD); - pick_str(&c_mseg_set, C_MSEG_SD, c->mktsegment); - c->clen = TEXT(C_CMNT_LEN, C_CMNT_SD, c->comment); - - return (0); - } -#endif - - /* - * generate the numbered order and its associated lineitems -*/ -void -mk_sparse (long i, DSS_HUGE *ok, long seq) - { -#ifndef SUPPORT_64BITS - if (scale < MAX_32B_SCALE) -#endif - ez_sparse(i, ok, seq); -#ifndef SUPPORT_64BITS - else - hd_sparse(i, ok, seq); -#endif - return; - } - - /* - * the "simple" version of mk_sparse, used on systems with 64b support - * and on all systems at SF <= 300G where 32b support is sufficient -*/ -void -ez_sparse(long i, DSS_HUGE *ok, long seq) - { - long low_bits; - - LONG2HUGE(i, ok); - low_bits = (long)(i & ((1 << SPARSE_KEEP) - 
1)); - *ok = *ok >> SPARSE_KEEP; - *ok = *ok << SPARSE_BITS; - *ok += seq; - *ok = *ok << SPARSE_KEEP; - *ok += low_bits; - - - return; - } - -#ifndef SUPPORT_64BITS -void -hd_sparse(long i, DSS_HUGE *ok, long seq) - { - long low_mask, seq_mask; - static int init = 0; - static DSS_HUGE *base, *res; - - if (init == 0) - { - INIT_HUGE(base); - INIT_HUGE(res); - init = 1; - } - - low_mask = (1 << SPARSE_KEEP) - 1; - seq_mask = (1 << SPARSE_BITS) - 1; - bin_bcd2(i, base, base + 1); - HUGE_SET (base, res); - HUGE_DIV (res, 1 << SPARSE_KEEP); - HUGE_MUL (res, 1 << SPARSE_BITS); - HUGE_ADD (res, seq, res); - HUGE_MUL (res, 1 << SPARSE_KEEP); - HUGE_ADD (res, *base & low_mask, res); - bcd2_bin (&low_mask, *res); - bcd2_bin (&seq_mask, *(res + 1)); - *ok = low_mask; - *(ok + 1) = seq_mask; - return; - } -#endif - -#ifdef SSBM -long -mk_order(long index, order_t *o, long upd_num) - { - long lcnt; - long rprice; - long ocnt; - long tmp_date; - long c_date; - long clk_num; - long supp_num; - static char **asc_date = NULL; - char tmp_str[2]; - char **mk_ascdate PROTO((void)); - int delta = 1; - - if (asc_date == NULL) - asc_date = mk_ascdate(); - - RANDOM(tmp_date, O_ODATE_MIN, O_ODATE_MAX, O_ODATE_SD); - strcpy(o->odate, asc_date[tmp_date - STARTDATE]); - - mk_sparse (index, o->okey, - (upd_num == 0) ? 0 : 1 + upd_num / (10000 / refresh)); - RANDOM(o->custkey, O_CKEY_MIN, O_CKEY_MAX, O_CKEY_SD); - while (o->custkey % CUST_MORTALITY == 0) - { - o->custkey += delta; - o->custkey = MIN(o->custkey, O_CKEY_MAX); - delta *= -1; - } - pick_str(&o_priority_set, O_PRIO_SD, o->opriority); - RANDOM(clk_num, 1, MAX((scale * O_CLRK_SCL), O_CLRK_SCL), O_CLRK_SD); - o->spriority = 0; - - o->totalprice = 0; - ocnt = 0; - - RANDOM(o->lines, O_LCNT_MIN, O_LCNT_MAX, O_LCNT_SD); - for (lcnt = 0; lcnt < o->lines; lcnt++) - { - - HUGE_SET(o->okey, o->lineorders[lcnt].okey); - o->lineorders[lcnt].linenumber = lcnt + 1; - o->lineorders[lcnt].custkey = o->custkey; - RANDOM(o->lineorders[lcnt].partkey, L_PKEY_MIN, L_PKEY_MAX, L_PKEY_SD); - RANDOM(o->lineorders[lcnt].suppkey, L_SKEY_MIN, L_SKEY_MAX, L_SKEY_SD); - - RANDOM(o->lineorders[lcnt].quantity, L_QTY_MIN, L_QTY_MAX, L_QTY_SD); - RANDOM(o->lineorders[lcnt].discount, L_DCNT_MIN, L_DCNT_MAX, L_DCNT_SD); - RANDOM(o->lineorders[lcnt].tax, L_TAX_MIN, L_TAX_MAX, L_TAX_SD); - - strncpy(o->lineorders[lcnt].orderdate,o->odate,DATE_LEN); - - strncpy(o->lineorders[lcnt].opriority,o->opriority,MAXAGG_LEN+1); - o->lineorders[lcnt].ship_priority = o->spriority; - - RANDOM(c_date, L_CDTE_MIN, L_CDTE_MAX, L_CDTE_SD); - c_date += tmp_date; - strncpy(o->lineorders[lcnt].commit_date, - asc_date[c_date - STARTDATE], DATE_LEN); - - pick_str(&l_smode_set, L_SMODE_SD, o->lineorders[lcnt].shipmode); - - RPRICE_BRIDGE( rprice, o->lineorders[lcnt].partkey); - o->lineorders[lcnt].extended_price = rprice * o->lineorders[lcnt].quantity; - o->lineorders[lcnt].revenue = o->lineorders[lcnt].extended_price * ((long)100-o->lineorders[lcnt].discount)/(long)PENNIES; - - //round off problem with linux if use 0.6 - o->lineorders[lcnt].supp_cost = 6 * rprice /10; - - o->totalprice += - ((o->lineorders[lcnt].extended_price * - ((long)100 - o->lineorders[lcnt].discount)) / (long)PENNIES ) * - ((long)100 + o->lineorders[lcnt].tax) - / (long)PENNIES; - } - - for (lcnt = 0; lcnt < o->lines; lcnt++) - { - o->lineorders[lcnt].order_totalprice = o->totalprice; - } - return (0); - } -#else -long -mk_order(long index, order_t *o, long upd_num) - { - long lcnt; - long rprice; - long ocnt; - long tmp_date; - long s_date; 
- long r_date; - long c_date; - long clk_num; - long supp_num; - static char **asc_date = NULL; - char tmp_str[2]; - char **mk_ascdate PROTO((void)); - int delta = 1; - - if (asc_date == NULL) - asc_date = mk_ascdate(); - mk_sparse (index, o->okey, - (upd_num == 0) ? 0 : 1 + upd_num / (10000 / refresh)); - RANDOM(o->custkey, O_CKEY_MIN, O_CKEY_MAX, O_CKEY_SD); - while (o->custkey % CUST_MORTALITY == 0) - { - o->custkey += delta; - o->custkey = MIN(o->custkey, O_CKEY_MAX); - delta *= -1; - } - - - RANDOM(tmp_date, O_ODATE_MIN, O_ODATE_MAX, O_ODATE_SD); - strcpy(o->odate, asc_date[tmp_date - STARTDATE]); - - pick_str(&o_priority_set, O_PRIO_SD, o->opriority); - RANDOM(clk_num, 1, MAX((scale * O_CLRK_SCL), O_CLRK_SCL), O_CLRK_SD); - sprintf(o->clerk, O_CLRK_FMT, - O_CLRK_TAG, - clk_num); - o->clen = TEXT(O_CMNT_LEN, O_CMNT_SD, o->comment); -#ifdef DEBUG - if (o->clen > O_CMNT_MAX) fprintf(stderr, "comment error: O%d\n", index); -#endif /* DEBUG */ - o->spriority = 0; - - o->totalprice = 0; - o->orderstatus = 'O'; - ocnt = 0; - - RANDOM(o->lines, O_LCNT_MIN, O_LCNT_MAX, O_LCNT_SD); - for (lcnt = 0; lcnt < o->lines; lcnt++) - { - HUGE_SET(o->okey, o->l[lcnt].okey); - o->l[lcnt].lcnt = lcnt + 1; - RANDOM(o->l[lcnt].quantity, L_QTY_MIN, L_QTY_MAX, L_QTY_SD); - RANDOM(o->l[lcnt].discount, L_DCNT_MIN, L_DCNT_MAX, L_DCNT_SD); - RANDOM(o->l[lcnt].tax, L_TAX_MIN, L_TAX_MAX, L_TAX_SD); - pick_str(&l_instruct_set, L_SHIP_SD, o->l[lcnt].shipinstruct); - pick_str(&l_smode_set, L_SMODE_SD, o->l[lcnt].shipmode); - o->l[lcnt].clen = TEXT(L_CMNT_LEN, L_CMNT_SD, o->l[lcnt].comment); - RANDOM(o->l[lcnt].partkey, L_PKEY_MIN, L_PKEY_MAX, L_PKEY_SD); - RPRICE_BRIDGE( rprice, o->l[lcnt].partkey); - RANDOM(supp_num, 0, 3, L_SKEY_SD); - PART_SUPP_BRIDGE( o->l[lcnt].suppkey, o->l[lcnt].partkey, supp_num); - o->l[lcnt].eprice = rprice * o->l[lcnt].quantity; - - o->totalprice += - ((o->l[lcnt].eprice * - ((long)100 - o->l[lcnt].discount)) / (long)PENNIES ) * - ((long)100 + o->l[lcnt].tax) - / (long)PENNIES; - - RANDOM(s_date, L_SDTE_MIN, L_SDTE_MAX, L_SDTE_SD); - s_date += tmp_date; - RANDOM(c_date, L_CDTE_MIN, L_CDTE_MAX, L_CDTE_SD); - c_date += tmp_date; - RANDOM(r_date, L_RDTE_MIN, L_RDTE_MAX, L_RDTE_SD); - r_date += s_date; - - - strcpy(o->l[lcnt].sdate, asc_date[s_date - STARTDATE]); - strcpy(o->l[lcnt].cdate, asc_date[c_date - STARTDATE]); - strcpy(o->l[lcnt].rdate, asc_date[r_date - STARTDATE]); - - - if (julian(r_date) <= CURRENTDATE) - { - pick_str(&l_rflag_set, L_RFLG_SD, tmp_str); - o->l[lcnt].rflag[0] = *tmp_str; - } - else - o->l[lcnt].rflag[0] = 'N'; - - if (julian(s_date) <= CURRENTDATE) - { - ocnt++; - o->l[lcnt].lstatus[0] = 'F'; - } - else - o->l[lcnt].lstatus[0] = 'O'; - } - - if (ocnt > 0) - o->orderstatus = 'P'; - if (ocnt == o->lines) - o->orderstatus = 'F'; - - return (0); -} -#endif - -#ifdef SSBM -long mk_part(long index, part_t *p) -{ - long mfgr,cat,brnd; - - p->partkey = index; - - agg_str(&colors, (long)P_NAME_SCL, (long)P_NAME_SD, p->name); - - /*extract color from substring of p->name*/ - p->clen =gen_color(p->name,p->color); - - - RANDOM(mfgr, P_MFG_MIN, P_MFG_MAX, P_MFG_SD); - sprintf(p->mfgr, "%s%ld", "MFGR#", mfgr); - - RANDOM(cat, P_CAT_MIN, P_CAT_MAX, P_CAT_SD); - sprintf(p->category, "%s%ld", p->mfgr,cat); - - - RANDOM(brnd, P_BRND_MIN, P_BRND_MAX, P_BRND_SD); - sprintf(p->brand,"%s%ld",p->category,brnd); - - p->tlen = pick_str(&p_types_set, P_TYPE_SD, p->type); - p->tlen = strlen(p_types_set.list[p->tlen].text); - RANDOM(p->size, P_SIZE_MIN, P_SIZE_MAX, P_SIZE_SD); - - 
pick_str(&p_cntr_set, P_CNTR_SD, p->container); - - - return (0); -} -#else -long -mk_part(long index, part_t *p) - { - long temp; - long snum; - long brnd; - - p->partkey = index; - agg_str(&colors, (long)P_NAME_SCL, (long)P_NAME_SD, p->name); - RANDOM(temp, P_MFG_MIN, P_MFG_MAX, P_MFG_SD); - sprintf(p->mfgr, P_MFG_FMT, P_MFG_TAG, temp); - RANDOM(brnd, P_BRND_MIN, P_BRND_MAX, P_BRND_SD); - sprintf(p->brand, P_BRND_FMT, - P_BRND_TAG, - (temp * 10 + brnd)); - p->tlen = pick_str(&p_types_set, P_TYPE_SD, p->type); - p->tlen = strlen(p_types_set.list[p->tlen].text); - RANDOM(p->size, P_SIZE_MIN, P_SIZE_MAX, P_SIZE_SD); - pick_str(&p_cntr_set, P_CNTR_SD, p->container); - RPRICE_BRIDGE( p->retailprice, index); - p->clen = TEXT(P_CMNT_LEN, P_CMNT_SD, p->comment); - - for (snum = 0; snum < SUPP_PER_PART; snum++) - { - p->s[snum].partkey = p->partkey; - PART_SUPP_BRIDGE( p->s[snum].suppkey, index, snum); - RANDOM(p->s[snum].qty, PS_QTY_MIN, PS_QTY_MAX, PS_QTY_SD); - RANDOM(p->s[snum].scost, PS_SCST_MIN, PS_SCST_MAX, PS_SCST_SD); - p->s[snum].clen = TEXT(PS_CMNT_LEN, PS_CMNT_SD, p->s[snum].comment); - } - return (0); - } -#endif - - -#ifdef SSBM -long -mk_supp(long index, supplier_t *s) -{ - long i, - bad_press, - noise, - offset, - type; - s->suppkey = index; - sprintf(s->name, S_NAME_FMT, S_NAME_TAG, index); - s->alen = V_STR(S_ADDR_LEN, S_ADDR_SD, s->address); - RANDOM(i, 0, nations.count-1, S_NTRG_SD); - strcpy(s->nation_name,nations.list[i].text); - strcpy(s->region_name,regions.list[nations.list[i].weight].text); - gen_city(s->city,s->nation_name); - gen_phone(i, s->phone, (long)C_PHNE_SD); - return (0); -} -#else -long -mk_supp(long index, supplier_t *s) - { - long i, - bad_press, - noise, - offset, - type; - - s->suppkey = index; - sprintf(s->name, S_NAME_FMT, S_NAME_TAG, index); - s->alen = V_STR(S_ADDR_LEN, S_ADDR_SD, s->address); - RANDOM(i, 0, nations.count - 1, S_NTRG_SD); - s->nation_code= i; - gen_phone(i, s->phone, S_PHNE_SD); - RANDOM(s->acctbal, S_ABAL_MIN, S_ABAL_MAX, S_ABAL_SD); - - s->clen = TEXT(S_CMNT_LEN, S_CMNT_SD, s->comment); - /* these calls should really move inside the if stmt below, - * but this will simplify seedless parallel load - */ - RANDOM(bad_press, 1, 10000, BBB_CMNT_SD); - RANDOM(type, 0, 100, BBB_TYPE_SD); - RANDOM(noise, 0, (s->clen - BBB_CMNT_LEN), BBB_JNK_SD); - RANDOM(offset, 0, (s->clen - (BBB_CMNT_LEN + noise)), - BBB_OFFSET_SD); - if (bad_press <= S_CMNT_BBB) - { - type = (type < BBB_DEADBEATS) ?0:1; - memcpy(s->comment + offset, BBB_BASE, BBB_BASE_LEN); - if (type == 0) - memcpy(s->comment + BBB_BASE_LEN + offset + noise, - BBB_COMPLAIN, BBB_TYPE_LEN); - else - memcpy(s->comment + BBB_BASE_LEN + offset + noise, - BBB_COMMEND, BBB_TYPE_LEN); - } - - return (0); - } -#endif - -struct - { - char *mdes; - long days; - long dcnt; - } -months[] = - - { - {NULL, 0, 0}, - {"JAN", 31, 31}, - {"FEB", 28, 59}, - {"MAR", 31, 90}, - {"APR", 30, 120}, - {"MAY", 31, 151}, - {"JUN", 30, 181}, - {"JUL", 31, 212}, - {"AUG", 31, 243}, - {"SEP", 30, 273}, - {"OCT", 31, 304}, - {"NOV", 30, 334}, - {"DEC", 31, 365} - }; - -long -mk_time(long index, dss_time_t *t) - { - long m = 0; - long y; - long d; - - t->timekey = index + JDAY_BASE; - y = julian(index + STARTDATE - 1) / 1000; - d = julian(index + STARTDATE - 1) % 1000; - while (d > months[m].dcnt + LEAP_ADJ(y, m)) - m++; - PR_DATE(t->alpha, y, m, - d - months[m - 1].dcnt - ((LEAP(y) && m > 2) ? 
-long
-mk_time(long index, dss_time_t *t)
-    {
-    long m = 0;
-    long y;
-    long d;
-
-    t->timekey = index + JDAY_BASE;
-    y = julian(index + STARTDATE - 1) / 1000;
-    d = julian(index + STARTDATE - 1) % 1000;
-    while (d > months[m].dcnt + LEAP_ADJ(y, m))
-        m++;
-    PR_DATE(t->alpha, y, m,
-        d - months[m - 1].dcnt - ((LEAP(y) && m > 2) ?
-        1 : 0));
-    t->year = 1900 + y;
-    t->month = m + 12 * y + JMNTH_BASE;
-    t->week = (d + T_START_DAY - 1) / 7 + 1;
-    t->day = d - months[m - 1].dcnt - LEAP_ADJ(y, m-1);
-
-    return (0);
-    }
-
-int
-mk_nation(long index, code_t *c)
-    {
-    c->code = index - 1;
-    c->text = nations.list[index - 1].text;
-    c->join = nations.list[index - 1].weight;
-    c->clen = TEXT(N_CMNT_LEN, N_CMNT_SD, c->comment);
-    return(0);
-    }
-
-int
-mk_region(long index, code_t *c)
-    {
-
-    c->code = index - 1;
-    c->text = regions.list[index - 1].text;
-    c->join = 0; /* for completeness */
-    c->clen = TEXT(R_CMNT_LEN, R_CMNT_SD, c->comment);
-    return(0);
-    }
-
-
-#ifdef SSBM
-/*bug!*/
-int gen_city(char *cityName, char *nationName)
-{
-    int i=0;
-    long randomPick;
-    int clen = strlen(cityName);
-    int nlen = strlen(nationName);
-
-    strncpy(cityName,nationName,CITY_FIX-1);
-
-    if(nlen < CITY_FIX-1){
-        for(i = nlen ; i< CITY_FIX-1;i++)
-            cityName[i] = ' ';
-    }
-    RANDOM(randomPick, 0, 9, 98);
-
-    sprintf(cityName+CITY_FIX-1,"%ld",randomPick);
-    cityName[CITY_FIX] = '\0';
-    return 0;
-}
-
-
-/*
-P_NAME is as long as 55 bytes in TPC-H, which is unreasonably large.
-We reduce it to 22 by limiting it to a concatenation of two colors (see [TPC-H], pg 94).
-We also add a new column named P_COLOR that could be used in queries where currently a
-color must be chosen by substring from P_NAME.
-*/
-int gen_color(char * source, char * dest){
-    int i = 0,j=0;
-    int clen=0;
-
-    while(source[i]!= ' ' ){
-        dest[i]=source[i];
-        i++;
-    }
-    dest[i]='\0';
-
-    i++;
-    while(source[i] != '\0'){
-        source[j] = source[i];
-        j++;
-        i++;
-    }
-
-    source[j] = '\0';
-
-    clen = strlen(dest);
-    return clen;
-}
-
-
-
-/*Following functions are related to date table generation*/
-int days_in_a_month[12]={31,28,31,30,31,30,31,31,30,31,30,31};
-int days_in_a_month_l[12]={31,29,31,30,31,30,31,31,30,31,30,31};
-season seasons[]={
-    {"Christmas",1,11,31,12},
-    {"Summer",1,5,31,8},
-    {"Winter",1,1,31,3},
-    {"Spring",1,4,30,4},
-    {"Fall",1,9,31,10}
-};
-holiday holidays[]={
-    {"Christmas",12,24},
-    {"New Years Day", 1,1},
-    {"holiday1", 2,20},
-    {"Easter Day",4,20},
-    {"holiday2", 5,20},
-    {"holiday3",7,20},
-    {"holiday4",8,20},
-    {"holiday5",9,20},
-    {"holiday6",10,20},
-    {"holiday7",11,20}
-};
-
-char * month_names[]={"January","February","March","April",
-    "May","June","July","Augest",
-    "September","Octorber","November","December"};
-
-char * weekday_names[]={"Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"};
-
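For reference, gen_color() above splits the two-color P_NAME in place: the first color is copied out as P_COLOR and then removed from the source string, which becomes the shortened P_NAME. A short usage sketch; the helper below repeats the same splitting logic as the deleted gen_color() in compact form:

    #include <stdio.h>
    #include <string.h>

    /* same splitting logic as gen_color() in the deleted build.c:
     * copy the first word into dest, shift the rest of source down */
    static int gen_color(char *source, char *dest)
    {
        int i = 0, j = 0;

        while (source[i] != ' ') {     /* copy the first color */
            dest[i] = source[i];
            i++;
        }
        dest[i] = '\0';

        i++;                           /* skip the separating blank */
        while (source[i] != '\0')      /* shift the second color down */
            source[j++] = source[i++];
        source[j] = '\0';

        return (int)strlen(dest);
    }

    int main(void)
    {
        char name[23] = "almond antique";   /* a two-color P_NAME */
        char color[12];
        int clen = gen_color(name, color);

        printf("P_COLOR=%s (len=%d) P_NAME=%s\n", color, clen, name);
        /* prints: P_COLOR=almond (len=6) P_NAME=antique */
        return 0;
    }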
-/*
- * make the date table: take the continuous index, add index*60*60*24 to the
- * numeric representation of 1/1/1992 01:01:01, convert the resulting numeric
- * date/time to a tm structure, and extract the remaining fields of the
- * date_t structure from it
- */
-long
-mk_date(long index,date_t *d)
-{
-    long espan = (index-1)*60*60*24;
-
-    long numDateTime = D_STARTDATE + espan;
-
-    struct tm *localTime = localtime(&numDateTime);
-
-    /*make Sunday the first day of a week */
-    d->daynuminweek=((long)localTime->tm_wday+1)%7+1;
-    d->monthnuminyear=(long)localTime->tm_mon+1;
-    strncpy(d->dayofweek, weekday_names[d->daynuminweek-1],D_DAYWEEK_LEN+1);
-    strncpy(d->month,month_names[d->monthnuminyear-1],D_MONTH_LEN+1);
-    d->year=(long)localTime->tm_year + 1900;
-    d->daynuminmonth=(long)localTime->tm_mday;
-    d->yearmonthnum=d->year * 100 + d->monthnuminyear;
-
-    sprintf(d->yearmonth,"%.3s%d",d->month,d->year);
-    sprintf(d->date,"%s %d, %d",d->month,d->daynuminmonth,d->year);
-
-    d->datekey = d->year*10000+d->monthnuminyear*100+ d->daynuminmonth;
-
-    d->daynuminyear=(int)localTime->tm_yday+1;
-    d->weeknuminyear = d->daynuminyear/7 + 1;
-
-    if(d->daynuminweek ==7){
-        d->lastdayinweekfl[0]='1';
-    }
-    else{
-        d->lastdayinweekfl[0]='0';
-    }
-    d->lastdayinweekfl[1]='\0';
-
-    if(is_last_day_in_month(d->year,d->monthnuminyear,d->daynuminmonth)==1){
-        d->lastdayinmonthfl[0]= '0';
-    }else{
-        d->lastdayinmonthfl[0]= '1';
-    }
-    d->lastdayinmonthfl[1]='\0';
-
-    if(d->daynuminweek!=1 && d->daynuminweek!=7){
-        d->weekdayfl[0]='1';
-    }
-    else{
-        d->weekdayfl[0]='0';
-    }
-
-    d->weekdayfl[1]='\0';
-
-    gen_season(d->sellingseason,d->monthnuminyear,d->daynuminmonth);
-    d->slen = strlen(d->sellingseason);
-    gen_holiday_fl(d->holidayfl,d->monthnuminyear,d->daynuminmonth);
-    return (0);
-}
-
-int gen_holiday_fl(char * dest, int month, int day){
-    int i;
-    for(i = 0; i< NUM_HOLIDAYS; i++){
-        if(holidays[i].month == month && holidays[i].day == day){
-            strcpy(dest,"1");
-            return 0;
-        }
-    }
-    strcpy(dest,"0");
-    return 0;
-}
-
-
-int
-is_last_day_in_month(int year,int month,int day){
-    int * days;
-    if(LEAP(year))
-        days = days_in_a_month_l;
-    else
-        days = days_in_a_month;
-    if(day == days[month-1]) return 1;
-    return 0;
-}
-
-int gen_season(char * dest,int month,int day)
-{
-    int i;
-    for(i = 0; i < NUM_SEASONS; i++){
-        season *seas = &seasons[i];
-        if(month >= seas->start_month && month <= seas->end_month &&
-           day >= seas->start_day && day <= seas->end_day){
-            strcpy(dest, seas->name);
-            return 0;
-        }
-    }
-    strcpy(dest,"");
-
-    return 0;
-}
-
-#endif
-
-
-
-
-
-
-
-
-
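The index-to-date step in mk_date() above is plain epoch arithmetic: D_STARTDATE anchors the numeric clock at the first date of the table, each index adds one day (60*60*24 seconds), and localtime() breaks the result into calendar fields. A minimal sketch of just that step; the anchor value below is a hypothetical stand-in (the real D_STARTDATE constant lives in the deleted headers):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        /* hypothetical anchor: noon UTC on 1/1/1992, so that most
         * timezone offsets cannot shift the calendar day */
        time_t d_startdate = 694267200;
        long index = 60;               /* 60th row of the date table */

        time_t numDateTime = d_startdate + (time_t)(index - 1) * 60 * 60 * 24;
        struct tm *lt = localtime(&numDateTime);

        printf("datekey=%ld\n",
               (lt->tm_year + 1900L) * 10000 +
               (lt->tm_mon + 1L) * 100 + lt->tm_mday);
        /* with this anchor: datekey=19920229 (1992 is a leap year) */
        return 0;
    }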
diff --git a/utils/ssbm/config.h b/utils/ssbm/config.h
deleted file mode 100644
index fa505ec7d..000000000
--- a/utils/ssbm/config.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Sccsid: @(#)config.h 2.1.8.2
- *
- * this file allows the compilation of DBGEN to be tailored to specific
- * architectures and operating systems. Some options are grouped
- * together to allow easier compilation on a given vendor's hardware.
- *
- * The following #defines will affect the code:
- *   TPCH              -- make will create TPCH (set in makefile)
- *   TPCR              -- make will create TPCR (set in makefile)
- *   KILL(pid)         -- how to terminate a process in a parallel load
- *   SPAWN             -- name of system call to clone an existing process
- *   SET_HANDLER(proc) -- name of routine to handle signals in parallel load
- *   WAIT(res, pid)    -- how to await the termination of a child
- *   SEPARATOR         -- character used to separate fields in flat files
- *   DBNAME            -- default name of database to be loaded
- *   STDLIB_HAS_GETOPT -- to prevent conflicts with global getopt()
- *   MDY_DATE          -- generate dates as MM-DD-YY
- *   WIN32             -- support for WindowsNT
- *   SUPPORT_64BITS    -- compiler defines a 64 bit datatype
- *   DSS_HUGE          -- 64 bit data type
- *   HUGE_FORMAT       -- printf string for 64 bit data type
- *   HUGE_COUNT        -- number of objects in DSS_HUGE
- *   EOL_HANDLING      -- flat files don't need final column separator
- *
- * OS defines
- * ==========
- *   ATT     -- getopt() handling
- *   DIGITAL -- changes for DigUnix 64-bit support
- *   DOS     -- disable all multi-user functionality/dependency
- *   HP      -- posix source inclusion differences
- *   IBM     -- posix source inclusion differences
- *   ICL     -- getopt() handling
- *   MVS     -- special handling of varchar format
- *   SGI     -- getopt() handling
- *   SUN     -- getopt() handling
- *   LINUX   -- getopt() handling
- *   TANDEM  -- EOL handling
- *   U2200   -- death of parent kills children automatically
- *   VMS     -- signal/fork handling differences
- *
- * Database defines
- * ================
- *   DB2       -- use DB2 dialect in QGEN
- *   INFORMIX  -- use Informix dialect in QGEN
- *   SQLSERVER -- use SQLSERVER dialect in QGEN
- *   SYBASE    -- use Sybase dialect in QGEN
- *   TDAT      -- use Teradata dialect in QGEN
- */
-
-#ifdef DOS
-#define DSS_PROC 1
-#define PATH_SEP '\\'
-#else
-
-
-#ifdef ATT
-#define STDLIB_HAS_GETOPT
-#ifdef SQLSERVER
-#define WIN32
-#else
-/* the 64 bit defines are for the Metaware compiler */
-#define SUPPORT_64BITS
-#define DSS_HUGE long long
-#define HUGE_COUNT 1
-#define HUGE_FORMAT "%LLd"
-#endif /* SQLSERVER or MP/RAS */
-#endif /* ATT */
-
-#ifdef DIGITAL
-#define DOUBLE_CAST (double)(int)
-#endif
-
-#ifdef HP
-#define _INCLUDE_POSIX_SOURCE
-#define STDLIB_HAS_GETOPT
-#endif /* HP */
-
-#ifdef IBM
-#define _POSIX_SOURCE
-/*
- * if the C compiler is 3.1 or later, then uncomment the
- * lines for 64 bit seed generation
- */
-/* #define SUPPORT_64BITS*/
-/* #define DSS_HUGE long long*/
-/* #define HUGE_COUNT 1 */
-#define STDLIB_HAS_GETOPT
-#endif /* IBM */
-
-#ifdef ICL
-#define STDLIB_HAS_GETOPT
-#endif /* ICL */
-
-#ifdef SUN
-#define STDLIB_HAS_GETOPT
-#endif /* SUN */
-
-#ifdef LINUX
-#define STDLIB_HAS_GETOPT
-#endif /* LINUX */
-
-#ifdef SGI
-#define STDLIB_HAS_GETOPT
-#define SUPPORT_64BITS
-#define DSS_HUGE __uint64_t
-#define HUGE_COUNT 1
-#endif /* SGI */
-
-#ifdef TANDEM
-#define EOL_HANDLING
-#endif /* TANDEM */
-
-#ifdef VMS
-#define SPAWN vfork
-#define KILL(pid) kill(SIGQUIT, pid)
-#define SET_HANDLER(proc) signal(SIGQUIT, proc)
-#define WAIT(res, pid) wait(res)
-#define SIGS_DEFINED
-#endif /* VMS */
-
-#if (defined(WIN32)&&!defined(_POSIX_))
-#define pid_t int
-#define SET_HANDLER(proc) signal(SIGINT, proc)
-#define KILL(pid) \
-    TerminateProcess(OpenProcess(PROCESS_TERMINATE,FALSE,pid),3)
-#if (defined (__WATCOMC__))
-#define SPAWN() spawnv(P_NOWAIT, spawn_args[0], spawn_args)
-#define WAIT(res, pid) cwait(res, pid, WAIT_CHILD)
-#else
-#define SPAWN() _spawnv(_P_NOWAIT, spawn_args[0], spawn_args)
-#define WAIT(res, pid) _cwait(res, pid, _WAIT_CHILD)
-#define getpid _getpid
-#endif /* WATCOMC */
-#define SIGS_DEFINED
-#define PATH_SEP '\\'
-#ifndef TEST_32B
-#define SUPPORT_64BITS
-#define DSS_HUGE __int64
-#define HUGE_COUNT 1
-#define HUGE_FORMAT "%I64d"
-#endif /* TEST_32B */
-/* need to define process termination codes to match UNIX */
-/* these are copied from Linux/GNU and need to be verified as part of a rework of */
-/* process handling under NT (29 Apr 98) */
-#define WIFEXITED(s)   ((s & 0xFF) == 0)
-#define WIFSIGNALED(s) (((unsigned int)((status)-1) & 0xFFFF) < 0xFF)
-#define WIFSTOPPED(s)  (((s) & 0xff) == 0x7f)
-#define WTERMSIG(s)    ((s) & 0x7f)
-#define WSTOPSIG(s)    (((s) & 0xff00) >> 8)
-#endif /* WIN32 */
-
-#ifndef SIGS_DEFINED
-#define KILL(pid) kill(SIGUSR1, pid)
-#define SET_HANDLER(proc) signal(SIGUSR1, proc)
-#define SPAWN fork
-#define WAIT(res, pid) wait(res)
-#endif /* DEFAULT */
-
-#define DSS_PROC getpid()
-#endif /* DOS */
-
-#ifndef DBNAME
-#define DBNAME "dss"
-#endif /* DBNAME */
-
-#ifndef PATH_SEP
-#define PATH_SEP '/'
-#endif /* PATH_SEP */
-
-#ifndef DSS_HUGE
-#define DSS_HUGE long
-#define HUGE_COUNT 2
-#endif
-
-#ifndef DOUBLE_CAST
-#define DOUBLE_CAST (double)
-#endif /* DOUBLE_CAST */
-
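The tail of config.h is the portability contract the rest of dbgen builds on: platforms with SUPPORT_64BITS get a native 64-bit DSS_HUGE plus a HUGE_FORMAT printf string, while everything else falls back to DSS_HUGE long with HUGE_COUNT 2, i.e. wide values travel as a two-limb pair. A sketch of the two shapes calling code has to handle; the compiler test and the limb values here are illustrative, not dbgen's actual logic:

    #include <stdio.h>

    /* the same fallback ladder as the deleted config.h */
    #if defined(__GNUC__) || defined(__clang__)
    #define SUPPORT_64BITS
    #define DSS_HUGE    long long
    #define HUGE_COUNT  1
    #define HUGE_FORMAT "%lld"
    #endif

    #ifndef DSS_HUGE
    #define DSS_HUGE    long           /* no 64-bit type available */
    #define HUGE_COUNT  2              /* carry wide values as two longs */
    #endif

    int main(void)
    {
    #ifdef SUPPORT_64BITS
        DSS_HUGE okey = 600000000000LL;            /* one native value */
        printf("okey = " HUGE_FORMAT "\n", okey);
    #else
        DSS_HUGE okey[HUGE_COUNT] = {139L, 1705032704L};  /* hi/lo limbs */
        printf("okey = %ld:%ld\n", okey[0], okey[1]);
    #endif
        return 0;
    }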

diff --git a/utils/ssbm/dbgen b/utils/ssbm/dbgen
deleted file mode 100755
index cdd5824b65e64c4691fad36a406eedf2a0aa7b7c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[literal 188424: base85-encoded payload of the deleted dbgen binary omitted]
z(ic1WnLEaL)=x2THZsg^1MP#3K6xYvmO7%yXZi##jX0Gw#id&qn9US-mn>u>$lRkq zV?JS%{$|kYK1TUhuE;)$j{`EB$fHqyhw~#z@oWe6P+SgZmTKGZN>=Gt)W66qN~_h5 zus;d0-$6-=w>zl+QHcNUpd>{WD{GN)BlNZ(k@|y+SeoKd*CdRhSW!G0kc~ETLp7%u zVm4xwK1KQ?9ep1~653A3TtFWQ)*JBpZKL!cd4JW4KSnCMV&AEsSFFk}k+#dO>@(A5 zb}yXSefpAxRW;*OUU>oI6j&+9`(iH#7L46d@1|)Olt1n~?J)v`O$+in~pA0xCe!5AK zxzMRIcpW4#IMH#XD6$yaTFk9?iaivs0VF?mAEC_CFVbQZ>5b)Mj?dZG#+3ghD1+id z4oXpEtt?+qBP%6gka5m+O;FA|vR;ZKfaD-W^eLC*#uuO}uB|(ubRJR}XcReh3-n7| zeJs$T_!~gBjhTBcP^9=LK-1N-$UN&8sXb-!fA6UJDSi=<`QXXSKLvW4739gbrdrx_ZBkQC1E(fJ3z8jFaTg%L9r^isZ5|g;J)2XB> z^60R2mATzcG3iJklyQkC(xmHCBH5skZDz|s`& zaZr-td)<+xmPAyQ8-f&hUN7c64vYUMS5^27KDkiE^h@nsCjFYi24nC4VRH$<-&mbo82&1yd7LdXEc2k|Mo0 zsJ$QhCsCV29A7U*9wMGC_ifuYJd-u1rxImpF*_dVHRd(VarRIo_X1nTMeVVjTmIUd z?%Y&%0cvu@JdQrzKJ2e~tb8QqzKgTPpPWH^{Q~tIf?3YP0wU$naK`rMeV@r z$c+4Xxb{8gVlPGh3PvwB!Tds$CrD#Hm+l4Ey^gboBDvqv{6gj5gJ)HfBL|sZ!v4n3 zFTRV-^Na88e{O!E+d+i*ImdOG4)m-6*TZrYe+@|Qp`AVin!VeZr780Hin4lBn& z!w);o9*X3COZ}3{tn2^7e(Af|+%J7^|8xD4ZU+&H^h>tFxe%l`QQX0eh@N>O>opAz zbA~*rrox8NCFf)quo$_aLXn1XBVkFzY>;N}hofS4Ib~8!Q-k!B=|vFuablXvQEUsC zvXo~7o2K#FC98OaOimXX%K5DRn^zc}(TwZ6TTQgFF?a_*n*ZE0a+@{EgwlZaSnQn&g zkPAwVUMzHNnCpf}c4>VQF!hPVfy@2&S}pD6v$xV?k*sSgJzl44$Euf(Y3?PuRC?*B z9EkeEK|>ubf9Lz8#OV`6IabUAi>mH2VHSC$jP;cKqq$!Q^O#TN+^Q{Vf&d9+lR%N& zGc~s+j#S|h&_0heW`}VfF1_Sbde~_hUirk1i!LD{Rbz<9(I0McT_jEMzX6KhhSQKY z$Yh^PF>Dx}0BQIMS7HyvuwitAJqd$-&M-}JgWGuZQ``vXcs+gVedAx`gX>`;be$Fd zsrNC^{%@p*sKkW}CzzY=O6Z#fab5tO29du!moEVMag?8PV!jh(i6qp2xsLL&jy!oB z#DxLG^7u(x8kXg)3A9{4fbbpE>=#Jo_f*$JI-{>7f5rCUeLj=qeZ50uP_;$alJMph z=T(~GOt&fNr#K6c?QzV7HuDr$2e!UzEODzVF-7q(XVpiMW#|&+;0P1~dag;Xlovt! z7e^o2b={6jQ%EJuUy20AS85ezUoSq)KS5E?1x5a2q)1<>Rgu14e341SYRCmOP4Q|6 zr6_&_Q2bm%F7AP4E}=*QncXOcxuVkxxh``~xCg~fI;f9gm@7Ktd>#he_r}~QoPx@a z9e0}I3HTLKe4V3)DH_gqKVRyn80L!B+2@MA#ShXIjyz5AlMcF);!S|8@66p9C{nzu zNzn}x2pzb}lB8iN-srgcDY6(l7@6A}C{hd;8jS`^SO>%D>fFd!@Ey|;qg@tuhpLqy zkOwL<%g4kXs=77I?8lq6>Ge07VLvcaxco3u=~)!d#uqg$x(y5-bcQL4XTeZb8j4}A zXrQ`aaIP~)7;cLz?Me%wV zO1o1Gb48sv791xd*Bt|AZ8^+?H2z5^%+mtFE!`68PIFkFaw@=26wd>r{1$NHK!e<5%_V=#^U))x9-0nSGU-i`bdE%bj2 zoX-(Ffc)I!;{CZAIG-3ehWsrp^#25$5B_@t`P1jd^REZarvRpq-`YZdF8cx2^h1#z zj?^#ZNX*6j_4|p)FGcF>KZpVQFw&nO{RPrTkn$&(zWtlo`TG1dPLyGqtBh1A#Q4+D4} zjpw;|Cdt?584Z5H%C96(YBAsND~?a$I=^E0RSW%%=tq4>c_-Z6MerT!tZyG}DK}*@a;;V<<+K5$YN??*i{+xcHwxS2eMmBLt&-0*a;md1k{PPyMhlsd zyx1~lEi$=UEw|lTP`-Ll)Py}ZS{#(3D?%u5ou(AaI8d*v&cVv)XrU|@^M!%2VKL5? 
zhYNz+Y_U94p`05Ss1>#{S1OhZlqFsiOBE^1+JecCcR1gsrm61fi7;g3HZ@(PV(~WB zh{X?4yJPV~)xKDKh8jOCq5M~y+;qBlRvbQ4?Ty7}srJL;^bc2)vH086wj<;8XRFCr z{0PMX7524F9jO-XjN>^CNDE-VGU4LJ4J2p;FpB}PBm?(Y*^`?>%4gh;_lKfYDt?eh`=DAI?3 zCp~<(;JRMB(Y~mz`m({J{qV=&sd_w*8vSU0eogSWdP$+N_)NjI|5K+V#8RoJ&AU=3 z3$FdV8-6lwawF8>dx3L()@RhT{vHp1H}FXh-za#TpK0hn=ariU{)~rj0sfSSZw0>J z!~YuiV;-*WCu{#7@$k<9|G9_X27I4~>qA$X=V1@O8~6hren0RZc=(TjPkFdL1giPJ z>)}rc9v9E&pr1hhXM4Q}yxYV70=&x`FGtA6M3;Mpr++-~4i7&C_&kqi5%6|Te+BTF zp1wX?qsu+S!v}#Y57#$twf?J!59>QFcwBt+MojnrT^A)x&;KU+|4oMHD7F8?7Ow*T z9PsQ%EWQW$WDEYU8vXey>DBix@JxQw@~j5WebC?F=|9wh|L4$ee=%zqSl`Ei&-Czr z1Fk&$1>ii1O`bms9v8PZ*@(vB`hby+XV*4cuN~5#^y?(Sr>S`=;q_b2=PU0|7(LE2 zYa|!PztHf^QR9zTePbkVOAH>JxApZno^wTCsiPz7{-uT|y1v`zT}S8JEk-}OzW=r0 zM@t+!b|!Q|5)=E6%;Wf~lrP;v(J`6Hu`;zm?A@amF;(OwRl1n~B=EdD9r8(Z*iZh>Fd0>4>s z9iQE(7styjf_I>-aH4PSYW+zs4*I^diL*Cf={vL?GG6@s;m-v({pzZOp??A$>q+1T zK49@JDEt|NBNIyWGxju3`(mpv&17D)B(B<@DbG)R15oSlMLZve{zOp9Y@& z9*@2ctohYTS)<8*zSQu*IF#r|*qQo%Fk#S}aP1QC%ma^9$GonDe)jr=$@3(M{zGh< zXKJm*ub5^os|t8Vy!d~}=v$re`z_!(aB9N9@5RMQ;MJPNUj)8~`e};~1HV)7Inuu4 zzsVY;8~FF3KXqclAdAty`j4vVxOI4Od(`N|IFvjO{ch;9f4&Sn;l=IH8A`Q_KWh;O zQq2}zmz(s;oiDhKCx7{W84MN}oYq0Q5m&(A67SK;S* z@ZfFd-dvreiq}1^T7LoSL>GN zSKt{T-mv&jfL{ZAe~-oY0Kd-Q$b^!QLqGWrtG^rb#%Ia1(c*Ui|03}0ITq)-qi=BP zxV68_;=>r%dj;3^ebsB1Z$n=p&OcN3HDz#QLdnmdKmMMCL5JYauUqK9-U2^DI)I*+ zrZ6vMQO111wf~c7uMNOY1ztTfVUUY~pJRAro_{%O5XqX?66lY2CQN>#h1CVb*I0ZN z;&Z9sdS3F^!Oeo}xE=83hmzp|X($;7|E~90{u<)*x8RxS`MC@D4d9ood3^~yd`igz z;9ocTGN1o0YmhGB_d_&F~K`Rd5X^({`smMesaD0cY{lN{Vr=% zyTSij3;rW!N!+F%r8ewL7=6B-Ah>Sdu4NYIJiJirOPsUb=aF-aKCOfFJm`18Pp3z`8Vjh-}U|tUa-;4cB0r;aWc%Bzr$NzHp&-lCmJPTZ! z+q_-}UiIp$U8kj;Q6`2 z@EoC%*q4452Im6zzff5Rd}?dLXx;(+P2k_<)$0o2d%SY50nh#wmgf&>-yP833w_4z zBL<(RUVSQSN+f;@c*b{Fo&j@JsawHww^#00px^OMtN$#@{RZ^E>*;?7`g^fo9R~mR zi6<>j1Ni?0&lGq#zV!0~_4rL*Z1rcLeSZi2N4)y}o6!epDEYJCx_$k9;HpjFV zzDB@vK5@)r{K9_|@FeyFZ-@Tnz$bBk^@G53g6r|p@l4hr?ZEZl=rr>rK5uefDg&>2 z^VL?vGhaR9#s7MP%l(kwS$?h;Hya)l7D{df&-g1=pZPpE@&PkPX1s0^haO;%5TCRPYYS;iL%t zE_`0ti;ETDRm?M(mejSt6MYGzc>??#2R`{%7XKmI`}pWdu6?T z``Z>g(~h+IyPnS)2KK|_1lRT2dqTqCvrsRUuNCE_LFb^V7aEzNVyRH?tmbNs&Mu`7 z6?86NxnhMXl`F-1K~;3FOUZZt)l#8R$m{HKHCSuZ8)HL5LXpW_cz$o@;*EW&Oh)Ak zwZd?*E@vS!jnT|tsZy2$0x?h~ldoikOO=6KDU)whYV}NRY+I<16-$>d6XqBtakpHw z2NyDO7@@XZ*`p5mvC+}(ViL+_bY+@U2L|&?#AS0DL(<_wIa3`GALVF6x!$PODpePP zC99UIfn2@NxqQj0Wm1K@7M81AeJ~?OA_QLA)vZdETs|X#7!b0-%2>J4xoois7DNdY z(N#`dxhxO^w0o&TyStsZYDpljq`0KZp-Z|}ICQaYB~$Q{#miK+CPz}lt)Z$ozO-A? 
zOi83=OI0aXZ)8R*d0n+-qL?u%%a>bKao;GE$42G)ij^vFF0bfTwaT^iOk<>2&(sUU z`XGp0m%13ZDh(b&m+F8O^JCJw`m$DBuJX2^?qy1!SrOM7+Zvg2Rh{2=S?c@^ohvCU zR|BJ&u|}~}7oEb^TuoxVe2L1Hsw26KxTQ+PQQb+GEncOh+|e;<4bzi5m#$jXrRtTj z+F(KC?p0FIXjP+&RbDz}=dvz|T&eWI)noiy%)4OSH=8{VOk})|}Qz~ne z_pZKVU1ptJ%b=4}Zt7Qgm05emrq!2hJYTdzSLzKJQ7Uu6#h0#GeR1Z}_3QiAr8242 zYc5`==}N_c!OUQN%yeg!E*FP3%E=nBC=SYKR5BWhrBbd|6nR*NU3qIFalyqK*PNeO z(z&E_xwLMkK31)&e7!y>opf-tsx~!7)|Cq8fKD~j*j_E@c4{cyEW1`G9irc!Olp`^MRk%;M1^|%=Q=6+dOlOBX+|k6Qkcw1^+G0JtQ7_uYA`b@z1BoxbW3J%td4l;c;@m~ zi;B#XbtRp&I@YK+do87yU=+*sLaiYjW=xpUtLDPi+K{Cb&x_GGG%ERKLh+z3v#`v$ z=+(M)%WU3j3xzHD-1dwd(h`qMtB#J9q%kGTh2ctVyQzirm1@!B%?}tOnVaEJ^p4kd6)N+DgRp8NFyI-8;%&nYxG6uPNys zV{~L57%Js#q-COww2B=DLS8C(k&>RCH^;-wTBQeUU6*XyO1JMwt=UO4?+rC$WtkRb zOl4%~D+!OZkm7H^w@h6kMSrv2-ES6>U z5F72Daah*rU>utGXBri$xE`;nUK}pxO3lvdU59umbseh6T&;UhuI2^C?M#%ES=WW; zP^=D=8DK+kZJo&si+7q{24zOCLt~=_WgM1vmvfsH&l&N)UdfC|t)-sQ9H{qLRYX~A zMhl}&y;YW0+1-p5MNPIXvf`G;M$0)pdtIulEPjsFrE^IawR4a#n!c)sR&x}@R5R%6 zn5NSskrR_xnU>FFw&_4M8_H^skqygGaX7q?=%q)E=1Ma0Hml2SOcn||;n~jG$QOoW zTUNPtP}Us1f|R`hBRi|SH(!{E&H7|wYu2pZLarnm2%*u-e}h9xs}*JKb#qOAU^rtM zOAVEz>1DAgNLxq;lg*T#vZV>?YACm*AdRdC>DFASXu6j4LpLUB?+0i^!&kYyUc3D@;6}!EXvL&>3l@geM;6;wQgHg*6Xsd8!A=& zqD5Uc&$cz?aPp{(nf00VGI5mFX+L$m44wY8pf?+OQ)cFA34t9-dZd_04vxra>RO>J zyEwhe^*XNUBm&|0)r&p+J zbAAQP22;EfbdKg`HCu}M81KcJ4~+lC zH+^45f7Hasba!r>Kg&MJQJQoyp62dB!a$IL;HQx6OvPMzT6~= zzn|2%6Lj5aKhIt^4;j|?K4{PL=i|Ck@_N_{n3q*oiH`ir-ucgS>KD&4<|=O!=ReC~ zx<#(b&-9%=&$I7^eHQlGe{Dnm0jujo&@m++&%L+HeQ^ESWfNCE?OFd1%XMwP26{Xn zKMDIsaAQcvUv81P@@apQU^*taUE|KtZ zFa3;X&-?g^IacwvqcHI7i2j2J;qrNYf8kE6fo7n(iIvNUFe(4FJ>BFv{{-yMSSuGp zua1zc#Xa&rY|q~VjNES(^N7FA*|wsr({IWDuswe_urTYM=7RQPeo6!#YFEq? zS;@)k37fFL*j&{9VY#4x6G?mi4q@h7w0}fwwg0r|?-knLqW$AytNR}9`MZXBu-~uA z<;DKV`mi58E!V>3^Y;(0K55Kcf48}y{^WVqvvht%M!>ZSy zeE$cgzV7iY>~jY!+etqOvqsBW(8B)u@7a=OpL}rp#Vzb_#4q3u?KrsodtvX#@1{3w z`BxqjZ*Q+EwV_4%&mL`Ol(jd<+Y{|;VgJtCto>dtoG@UywAa%E`wlUkKKW&9pYsg7 stZ94BePR2=9Tpuq#i9!(Dan718?GO^d&ZT&=K*VfGS&gh5^ewg3kj?12mk;8 diff --git a/utils/ssbm/dists.dss b/utils/ssbm/dists.dss deleted file mode 100644 index 72157efee..000000000 --- a/utils/ssbm/dists.dss +++ /dev/null @@ -1,817 +0,0 @@ -# Sccsid: @(#)dists.dss 2.1.8.1 -# -# distributions have the following format: -# -# | # comment -# -# Distributions are used to bias the selection of a token -# based on its associated weight. The list of tokens and values -# between the keywords BEGIN and END define the distribution named after -# the BEGIN. A uniformly random value from [0, sum(weights)] -# will be chosen and the first token whose cumulative weight is greater than -# or equal to the result will be returned. In essence, the weights for each -# token represent its relative weight within a distribution. -# -# one special token is defined: count (number of data points in the -# distribution). It MUST be defined for each named distribution. 
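The selection rule quoted in this header is easy to state in C. The standalone sketch below illustrates that rule only; it is not dbgen's actual read_dist()/pick_str() implementation, and the member_t type, the pick_weighted() helper, and the use of rand() are illustrative assumptions. dbgen itself draws from seeded per-column RNG streams (dss_random, see rnd.c) so that generated data is reproducible across runs and processes.

/*
 * Minimal sketch of the weighted-selection rule described above.
 * NOT dbgen's read_dist()/pick_str(); names and the use of rand()
 * are illustrative assumptions only.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	long		weight;		/* relative weight within the distribution */
	const char *text;		/* token returned to the caller */
} member_t;

/*
 * Draw a uniformly random value from [0, sum(weights)] and return the
 * first token whose cumulative weight is greater than or equal to it.
 */
static const char *
pick_weighted(const member_t *list, int count)
{
	long		total = 0;
	long		cum = 0;
	long		draw;
	int			i;

	for (i = 0; i < count; i++)
		total += list[i].weight;
	draw = rand() % (total + 1);	/* uniform on [0, total] */
	for (i = 0; i < count; i++)
	{
		cum += list[i].weight;
		if (cum >= draw)
			return list[i].text;
	}
	return list[count - 1].text;	/* unreachable for non-negative weights */
}

int
main(void)
{
	/* same shape as the "rflag" distribution below: R|1, A|1 */
	member_t	rflag[] = {{1, "R"}, {1, "A"}};
	int			i;

	srand(42);
	for (i = 0; i < 8; i++)
		printf("%s ", pick_weighted(rflag, 2));
	putchar('\n');
	return 0;
}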
-#----------------------------------------------------------------------- -# currently defined distributions and their use: -# NAME FIELD/NOTES -# ======== ============== -# category parts.category -# container parts.container -# instruct shipping instructions -# msegmnt market segment -# names parts.name -# nations must be ordered along with regions -# nations2 stand alone nations set for use with qgen -# o_prio order priority -# regions must be ordered along with nations -# rflag lineitems.returnflag -# types parts.type -# colors embedded string creation; CANNOT BE USED FOR pick_str(), agg_str() perturbs order -# articles comment generation -# nouns -# verbs -# adverbs -# auxillaries -# prepositions -# terminators -# grammar sentence formation -# np -# vp -### -# category -### -BEGIN category -COUNT|5 -FURNITURE|1 -STORAGE EQUIP|1 -TOOLS|1 -MACHINE TOOLS|1 -OTHER|1 -END category -### -# container -### -begin p_cntr -count|40 -SM CASE|1 -SM BOX|1 -SM BAG|1 -SM JAR|1 -SM PACK|1 -SM PKG|1 -SM CAN|1 -SM DRUM|1 -LG CASE|1 -LG BOX|1 -LG BAG|1 -LG JAR|1 -LG PACK|1 -LG PKG|1 -LG CAN|1 -LG DRUM|1 -MED CASE|1 -MED BOX|1 -MED BAG|1 -MED JAR|1 -MED PACK|1 -MED PKG|1 -MED CAN|1 -MED DRUM|1 -JUMBO CASE|1 -JUMBO BOX|1 -JUMBO BAG|1 -JUMBO JAR|1 -JUMBO PACK|1 -JUMBO PKG|1 -JUMBO CAN|1 -JUMBO DRUM|1 -WRAP CASE|1 -WRAP BOX|1 -WRAP BAG|1 -WRAP JAR|1 -WRAP PACK|1 -WRAP PKG|1 -WRAP CAN|1 -WRAP DRUM|1 -end p_cntr -### -# instruct -### -begin instruct -count|4 -DELIVER IN PERSON|1 -COLLECT COD|1 -TAKE BACK RETURN|1 -NONE|1 -end instruct -### -# msegmnt -### -begin msegmnt -count|5 -AUTOMOBILE|1 -BUILDING|1 -FURNITURE|1 -HOUSEHOLD|1 -MACHINERY|1 -end msegmnt -### -# names -### -begin p_names -COUNT|4 -CLEANER|1 -SOAP|1 -DETERGENT|1 -EXTRA|1 -end p_names -### -# nations -# NOTE: this is a special case; the weights here are adjustments to -# map correctly into the regions table, and are *NOT* cummulative -# values to mimic a distribution -### -begin nations -count|25 -ALGERIA|0 -ARGENTINA|1 -BRAZIL|0 -CANADA|0 -EGYPT|3 -ETHIOPIA|-4 -FRANCE|3 -GERMANY|0 -INDIA|-1 -INDONESIA|0 -IRAN|2 -IRAQ|0 -JAPAN|-2 -JORDAN|2 -KENYA|-4 -MOROCCO|0 -MOZAMBIQUE|0 -PERU|1 -CHINA|1 -ROMANIA|1 -SAUDI ARABIA|1 -VIETNAM|-2 -RUSSIA|1 -UNITED KINGDOM|0 -UNITED STATES|-2 -end nations -### -# nations2 -### -begin nations2 -count|25 -ALGERIA|1 -ARGENTINA|1 -BRAZIL|1 -CANADA|1 -EGYPT|1 -ETHIOPIA|1 -FRANCE|1 -GERMANY|1 -INDIA|1 -INDONESIA|1 -IRAN|1 -IRAQ|1 -JAPAN|1 -JORDAN|1 -KENYA|1 -MOROCCO|1 -MOZAMBIQUE|1 -PERU|1 -CHINA|1 -ROMANIA|1 -SAUDI ARABIA|1 -VIETNAM|1 -RUSSIA|1 -UNITED KINGDOM|1 -UNITED STATES|1 -end nations2 -### -# regions -### -begin regions -count|5 -AFRICA|1 -AMERICA|1 -ASIA|1 -EUROPE|1 -MIDDLE EAST|1 -end regions -### -# o_prio -### -begin o_oprio -count|5 -1-URGENT|1 -2-HIGH|1 -3-MEDIUM|1 -4-NOT SPECIFIED|1 -5-LOW|1 -end o_oprio -### -# rflag -### -begin rflag -count|2 -R|1 -A|1 -end rflag -### -# smode -### -begin smode -count|7 -REG AIR|1 -AIR|1 -RAIL|1 -TRUCK|1 -MAIL|1 -FOB|1 -SHIP|1 -end smode -### -# types -### -begin p_types -COUNT|150 -STANDARD ANODIZED TIN|1 -STANDARD ANODIZED NICKEL|1 -STANDARD ANODIZED BRASS|1 -STANDARD ANODIZED STEEL|1 -STANDARD ANODIZED COPPER|1 -STANDARD BURNISHED TIN|1 -STANDARD BURNISHED NICKEL|1 -STANDARD BURNISHED BRASS|1 -STANDARD BURNISHED STEEL|1 -STANDARD BURNISHED COPPER|1 -STANDARD PLATED TIN|1 -STANDARD PLATED NICKEL|1 -STANDARD PLATED BRASS|1 -STANDARD PLATED STEEL|1 -STANDARD PLATED COPPER|1 -STANDARD POLISHED TIN|1 -STANDARD POLISHED NICKEL|1 -STANDARD POLISHED BRASS|1 -STANDARD 
POLISHED STEEL|1 -STANDARD POLISHED COPPER|1 -STANDARD BRUSHED TIN|1 -STANDARD BRUSHED NICKEL|1 -STANDARD BRUSHED BRASS|1 -STANDARD BRUSHED STEEL|1 -STANDARD BRUSHED COPPER|1 -SMALL ANODIZED TIN|1 -SMALL ANODIZED NICKEL|1 -SMALL ANODIZED BRASS|1 -SMALL ANODIZED STEEL|1 -SMALL ANODIZED COPPER|1 -SMALL BURNISHED TIN|1 -SMALL BURNISHED NICKEL|1 -SMALL BURNISHED BRASS|1 -SMALL BURNISHED STEEL|1 -SMALL BURNISHED COPPER|1 -SMALL PLATED TIN|1 -SMALL PLATED NICKEL|1 -SMALL PLATED BRASS|1 -SMALL PLATED STEEL|1 -SMALL PLATED COPPER|1 -SMALL POLISHED TIN|1 -SMALL POLISHED NICKEL|1 -SMALL POLISHED BRASS|1 -SMALL POLISHED STEEL|1 -SMALL POLISHED COPPER|1 -SMALL BRUSHED TIN|1 -SMALL BRUSHED NICKEL|1 -SMALL BRUSHED BRASS|1 -SMALL BRUSHED STEEL|1 -SMALL BRUSHED COPPER|1 -MEDIUM ANODIZED TIN|1 -MEDIUM ANODIZED NICKEL|1 -MEDIUM ANODIZED BRASS|1 -MEDIUM ANODIZED STEEL|1 -MEDIUM ANODIZED COPPER|1 -MEDIUM BURNISHED TIN|1 -MEDIUM BURNISHED NICKEL|1 -MEDIUM BURNISHED BRASS|1 -MEDIUM BURNISHED STEEL|1 -MEDIUM BURNISHED COPPER|1 -MEDIUM PLATED TIN|1 -MEDIUM PLATED NICKEL|1 -MEDIUM PLATED BRASS|1 -MEDIUM PLATED STEEL|1 -MEDIUM PLATED COPPER|1 -MEDIUM POLISHED TIN|1 -MEDIUM POLISHED NICKEL|1 -MEDIUM POLISHED BRASS|1 -MEDIUM POLISHED STEEL|1 -MEDIUM POLISHED COPPER|1 -MEDIUM BRUSHED TIN|1 -MEDIUM BRUSHED NICKEL|1 -MEDIUM BRUSHED BRASS|1 -MEDIUM BRUSHED STEEL|1 -MEDIUM BRUSHED COPPER|1 -LARGE ANODIZED TIN|1 -LARGE ANODIZED NICKEL|1 -LARGE ANODIZED BRASS|1 -LARGE ANODIZED STEEL|1 -LARGE ANODIZED COPPER|1 -LARGE BURNISHED TIN|1 -LARGE BURNISHED NICKEL|1 -LARGE BURNISHED BRASS|1 -LARGE BURNISHED STEEL|1 -LARGE BURNISHED COPPER|1 -LARGE PLATED TIN|1 -LARGE PLATED NICKEL|1 -LARGE PLATED BRASS|1 -LARGE PLATED STEEL|1 -LARGE PLATED COPPER|1 -LARGE POLISHED TIN|1 -LARGE POLISHED NICKEL|1 -LARGE POLISHED BRASS|1 -LARGE POLISHED STEEL|1 -LARGE POLISHED COPPER|1 -LARGE BRUSHED TIN|1 -LARGE BRUSHED NICKEL|1 -LARGE BRUSHED BRASS|1 -LARGE BRUSHED STEEL|1 -LARGE BRUSHED COPPER|1 -ECONOMY ANODIZED TIN|1 -ECONOMY ANODIZED NICKEL|1 -ECONOMY ANODIZED BRASS|1 -ECONOMY ANODIZED STEEL|1 -ECONOMY ANODIZED COPPER|1 -ECONOMY BURNISHED TIN|1 -ECONOMY BURNISHED NICKEL|1 -ECONOMY BURNISHED BRASS|1 -ECONOMY BURNISHED STEEL|1 -ECONOMY BURNISHED COPPER|1 -ECONOMY PLATED TIN|1 -ECONOMY PLATED NICKEL|1 -ECONOMY PLATED BRASS|1 -ECONOMY PLATED STEEL|1 -ECONOMY PLATED COPPER|1 -ECONOMY POLISHED TIN|1 -ECONOMY POLISHED NICKEL|1 -ECONOMY POLISHED BRASS|1 -ECONOMY POLISHED STEEL|1 -ECONOMY POLISHED COPPER|1 -ECONOMY BRUSHED TIN|1 -ECONOMY BRUSHED NICKEL|1 -ECONOMY BRUSHED BRASS|1 -ECONOMY BRUSHED STEEL|1 -ECONOMY BRUSHED COPPER|1 -PROMO ANODIZED TIN|1 -PROMO ANODIZED NICKEL|1 -PROMO ANODIZED BRASS|1 -PROMO ANODIZED STEEL|1 -PROMO ANODIZED COPPER|1 -PROMO BURNISHED TIN|1 -PROMO BURNISHED NICKEL|1 -PROMO BURNISHED BRASS|1 -PROMO BURNISHED STEEL|1 -PROMO BURNISHED COPPER|1 -PROMO PLATED TIN|1 -PROMO PLATED NICKEL|1 -PROMO PLATED BRASS|1 -PROMO PLATED STEEL|1 -PROMO PLATED COPPER|1 -PROMO POLISHED TIN|1 -PROMO POLISHED NICKEL|1 -PROMO POLISHED BRASS|1 -PROMO POLISHED STEEL|1 -PROMO POLISHED COPPER|1 -PROMO BRUSHED TIN|1 -PROMO BRUSHED NICKEL|1 -PROMO BRUSHED BRASS|1 -PROMO BRUSHED STEEL|1 -PROMO BRUSHED COPPER|1 -end p_types -### -# colors -# NOTE: This distribution CANNOT be used by pick_str(), since agg_str() perturbs its order -### -begin colors -COUNT|92 -almond|1 -antique|1 -aquamarine|1 -azure|1 -beige|1 -bisque|1 -black|1 -blanched|1 -blue|1 -blush|1 -brown|1 -burlywood|1 -burnished|1 -chartreuse|1 -chiffon|1 -chocolate|1 -coral|1 -cornflower|1 
-cornsilk|1 -cream|1 -cyan|1 -dark|1 -deep|1 -dim|1 -dodger|1 -drab|1 -firebrick|1 -floral|1 -forest|1 -frosted|1 -gainsboro|1 -ghost|1 -goldenrod|1 -green|1 -grey|1 -honeydew|1 -hot|1 -indian|1 -ivory|1 -khaki|1 -lace|1 -lavender|1 -lawn|1 -lemon|1 -light|1 -lime|1 -linen|1 -magenta|1 -maroon|1 -medium|1 -metallic|1 -midnight|1 -mint|1 -misty|1 -moccasin|1 -navajo|1 -navy|1 -olive|1 -orange|1 -orchid|1 -pale|1 -papaya|1 -peach|1 -peru|1 -pink|1 -plum|1 -powder|1 -puff|1 -purple|1 -red|1 -rose|1 -rosy|1 -royal|1 -saddle|1 -salmon|1 -sandy|1 -seashell|1 -sienna|1 -sky|1 -slate|1 -smoke|1 -snow|1 -spring|1 -steel|1 -tan|1 -thistle|1 -tomato|1 -turquoise|1 -violet|1 -wheat|1 -white|1 -yellow|1 -end colors -################ -################ -## psuedo text distributions -################ -################ -### -# nouns -### -BEGIN nouns -COUNT|45 -packages|40 -requests|40 -accounts|40 -deposits|40 -foxes|20 -ideas|20 -theodolites|20 -pinto beans|20 -instructions|20 -dependencies|10 -excuses|10 -platelets|10 -asymptotes|10 -courts|5 -dolphins|5 -multipliers|1 -sauternes|1 -warthogs|1 -frets|1 -dinos|1 -attainments|1 -somas|1 -Tiresias|1 -patterns|1 -forges|1 -braids|1 -frays|1 -warhorses|1 -dugouts|1 -notornis|1 -epitaphs|1 -pearls|1 -tithes|1 -waters|1 -orbits|1 -gifts|1 -sheaves|1 -depths|1 -sentiments|1 -decoys|1 -realms|1 -pains|1 -grouches|1 -escapades|1 -hockey players|1 -END nouns -### -# verbs -### -BEGIN verbs -COUNT|40 -sleep|20 -wake|20 -are|20 -cajole|20 -haggle|20 -nag|10 -use|10 -boost|10 -affix|5 -detect|5 -integrate|5 -maintain|1 -nod|1 -was|1 -lose|1 -sublate|1 -solve|1 -thrash|1 -promise|1 -engage|1 -hinder|1 -print|1 -x-ray|1 -breach|1 -eat|1 -grow|1 -impress|1 -mold|1 -poach|1 -serve|1 -run|1 -dazzle|1 -snooze|1 -doze|1 -unwind|1 -kindle|1 -play|1 -hang|1 -believe|1 -doubt|1 -END verbs -### -# adverbs -## -BEGIN adverbs -COUNT|28 -sometimes|1 -always|1 -never|1 -furiously|50 -slyly|50 -carefully|50 -blithely|40 -quickly|30 -fluffily|20 -slowly|1 -quietly|1 -ruthlessly|1 -thinly|1 -closely|1 -doggedly|1 -daringly|1 -bravely|1 -stealthily|1 -permanently|1 -enticingly|1 -idly|1 -busily|1 -regularly|1 -finally|1 -ironically|1 -evenly|1 -boldly|1 -silently|1 -END adverbs -### -# articles -## -BEGIN articles -COUNT|3 -the|50 -a|20 -an|5 -END articles -### -# prepositions -## -BEGIN prepositions -COUNT|47 -about|50 -above|50 -according to|50 -across|50 -after|50 -against|40 -along|40 -alongside of|30 -among|30 -around|20 -at|10 -atop|1 -before|1 -behind|1 -beneath|1 -beside|1 -besides|1 -between|1 -beyond|1 -by|1 -despite|1 -during|1 -except|1 -for|1 -from|1 -in place of|1 -inside|1 -instead of|1 -into|1 -near|1 -of|1 -on|1 -outside|1 -over|1 -past|1 -since|1 -through|1 -throughout|1 -to|1 -toward|1 -under|1 -until|1 -up|1 -upon|1 -whithout|1 -with|1 -within|1 -END prepositions -### -# auxillaries -## -BEGIN auxillaries -COUNT|18 -do|1 -may|1 -might|1 -shall|1 -will|1 -would|1 -can|1 -could|1 -should|1 -ought to|1 -must|1 -will have to|1 -shall have to|1 -could have to|1 -should have to|1 -must have to|1 -need to|1 -try to|1 -END auxiallaries -### -# terminators -## -BEGIN terminators -COUNT|6 -.|50 -;|1 -:|1 -?|1 -!|1 ---|1 -END terminators -### -# adjectives -## -BEGIN adjectives -COUNT|29 -special|20 -pending|20 -unusual|20 -express|20 -furious|1 -sly|1 -careful|1 -blithe|1 -quick|1 -fluffy|1 -slow|1 -quiet|1 -ruthless|1 -thin|1 -close|1 -dogged|1 -daring|1 -brave|1 -stealthy|1 -permanent|1 -enticing|1 -idle|1 -busy|1 -regular|50 -final|40 -ironic|40 -even|30 -bold|20 
-silent|10
-END adjectives
-###
-# grammar
-# first level grammar. N=noun phrase, V=verb phrase,
-# P=prepositional phrase, T=sentence termination
-##
-BEGIN grammar
-COUNT|5
-N V T|3
-N V P T|3
-N V N T|3
-N P V N T|1
-N P V P T|1
-END grammar
-###
-# NP
-# second level grammar. Noun phrases. N=noun, A=article,
-# J=adjective, D=adverb
-##
-BEGIN np
-COUNT|4
-N|10
-J N|20
-J, J N|10
-D J N|50
-END np
-###
-# VP
-# second level grammar. Verb phrases. V=verb, X=auxiliary,
-# D=adverb
-##
-BEGIN vp
-COUNT|4
-V|30
-X V|1
-V D|40
-X V D|1
-END vp
-###
-# Q13
-# Substitution parameters for Q13
-##
-BEGIN Q13a
-COUNT|4
-special|20
-pending|20
-unusual|20
-express|20
-END Q13a
-BEGIN Q13b
-COUNT|4
-packages|40
-requests|40
-accounts|40
-deposits|40
-END Q13b
diff --git a/utils/ssbm/driver.c b/utils/ssbm/driver.c
deleted file mode 100644
index e537cd862..000000000
--- a/utils/ssbm/driver.c
+++ /dev/null
@@ -1,1156 +0,0 @@
-/* @(#)driver.c 2.1.8.4 */
-/* main driver for dss benchmark */
-
-#define DECLARER /* EXTERN references get defined here */
-#define NO_FUNC (int (*) ()) NULL /* to clean up tdefs */
-#define NO_LFUNC (long (*) ()) NULL /* to clean up tdefs */
-
-#include "config.h"
-#include
-#if (defined(_POSIX_)||!defined(WIN32)) /* Change for Windows NT */
-#ifndef DOS
-#include
-#include
-#endif
-
-#endif /* WIN32 */
-#include /* */
-#include
-#include
-#include
-#include
-#include
-#include
-#ifdef HP
-#include
-#endif
-#if (defined(WIN32)&&!defined(_POSIX_))
-#include
-#pragma warning(disable:4201)
-#pragma warning(disable:4214)
-#pragma warning(disable:4514)
-#define WIN32_LEAN_AND_MEAN
-#define NOATOM
-#define NOGDICAPMASKS
-#define NOMETAFILE
-#define NOMINMAX
-#define NOMSG
-#define NOOPENFILE
-#define NORASTEROPS
-#define NOSCROLL
-#define NOSOUND
-#define NOSYSMETRICS
-#define NOTEXTMETRIC
-#define NOWH
-#define NOCOMM
-#define NOKANJI
-#define NOMCX
-
-#include "windows.h"
-
-#pragma warning(default:4201)
-#pragma warning(default:4214)
-#endif
-
-#include "dss.h"
-#include "dsstypes.h"
-#include "bcd2.h"
-
-/*
-* Function prototypes
-*/
-void usage (void);
-int prep_direct (char *);
-int close_direct (void);
-void kill_load (void);
-int pload (int tbl);
-void gen_tbl (int tnum, long start, long count, long upd_num);
-int pr_drange (int tbl, long min, long cnt, long num);
-int set_files (int t, int pload);
-int partial (int, int);
-
-
-extern int optind, opterr;
-extern char *optarg;
-long rowcnt = 0, minrow = 0, upd_num = 0;
-double flt_scale;
-#if (defined(WIN32)&&!defined(_POSIX_))
-char *spawn_args[25];
-#endif
-
-
-/*
-* general table descriptions. See dss.h for details on structure
-* NOTE: tables with no scaling info are scaled according to
-* another table
-*
-*
-* the following is based on the tdef structure defined in dss.h as:
-* typedef struct
-* {
-* char *name; -- name of the table;
-* flat file output in <name>.tbl
-* long base; -- base scale rowcount of table;
-* 0 if derived
-* int (*header) (); -- function to prep output
-* int (*loader[2]) (); -- functions to present output
-* long (*gen_seed) (); -- functions to seed the RNG
-* int (*verify) (); -- function to verify the data set without building it
-* int child; -- non-zero if there is an associated detail table
-* unsigned long vtotal; -- "checksum" total
-* } tdef;
-*
-*/
-
-/*
-* flat file print functions; used with -F(lat) option
-*/
-#ifdef SSBM
-int pr_cust (customer_t * c, int mode);
-int pr_part (part_t * p, int mode);
-int pr_supp (supplier_t * s, int mode);
-int pr_line (order_t * o, int mode);
-#else
-int pr_cust (customer_t * c, int mode);
-int pr_line (order_t * o, int mode);
-int pr_order (order_t * o, int mode);
-int pr_part (part_t * p, int mode);
-int pr_psupp (part_t * p, int mode);
-int pr_supp (supplier_t * s, int mode);
-int pr_order_line (order_t * o, int mode);
-int pr_part_psupp (part_t * p, int mode);
-int pr_nation (code_t * c, int mode);
-int pr_region (code_t * c, int mode);
-#endif
-
-/*
-* inline load functions; used with -D(irect) option
-*/
-#ifdef SSBM
-int ld_cust (customer_t * c, int mode);
-int ld_part (part_t * p, int mode);
-int ld_supp (supplier_t * s, int mode);
-
-/*todo: get rid of ld_order*/
-int ld_line (order_t * o, int mode);
-int ld_order (order_t * o, int mode);
-
-#else
-int ld_cust (customer_t * c, int mode);
-int ld_line (order_t * o, int mode);
-int ld_order (order_t * o, int mode);
-int ld_part (part_t * p, int mode);
-int ld_psupp (part_t * p, int mode);
-int ld_supp (supplier_t * s, int mode);
-int ld_order_line (order_t * o, int mode);
-int ld_part_psupp (part_t * p, int mode);
-int ld_nation (code_t * c, int mode);
-int ld_region (code_t * c, int mode);
-#endif
-
-/*
-* seed generation functions; used with '-O s' option
-*/
-#ifdef SSBM
-long sd_cust (int child, long skip_count);
-long sd_part (int child, long skip_count);
-long sd_supp (int child, long skip_count);
-
-long sd_line (int child, long skip_count);
-long sd_order (int child, long skip_count);
-
-#else
-long sd_cust (int child, long skip_count);
-long sd_line (int child, long skip_count);
-long sd_order (int child, long skip_count);
-long sd_part (int child, long skip_count);
-long sd_psupp (int child, long skip_count);
-long sd_supp (int child, long skip_count);
-long sd_order_line (int child, long skip_count);
-long sd_part_psupp (int child, long skip_count);
-#endif
-
-/*
-* header output functions; used with -h(eader) option
-*/
-#ifdef SSBM
-int hd_cust (FILE * f);
-int hd_part (FILE * f);
-int hd_supp (FILE * f);
-int hd_line (FILE * f);
-
-#else
-int hd_cust (FILE * f);
-int hd_line (FILE * f);
-int hd_order (FILE * f);
-int hd_part (FILE * f);
-int hd_psupp (FILE * f);
-int hd_supp (FILE * f);
-int hd_order_line (FILE * f);
-int hd_part_psupp (FILE * f);
-int hd_nation (FILE * f);
-int hd_region (FILE * f);
-#endif
-
-/*
-* data verification functions; used with -O v option
-*/
-#ifdef SSBM
-int vrf_cust (customer_t * c, int mode);
-int vrf_part (part_t * p, int mode);
-int vrf_supp (supplier_t * s, int mode);
-int vrf_line (order_t * o, int mode);
-int vrf_order (order_t * o, int mode);
-int vrf_date (date_t, int mode);
-#else
-int vrf_cust
(customer_t * c, int mode); -int vrf_line (order_t * o, int mode); -int vrf_order (order_t * o, int mode); -int vrf_part (part_t * p, int mode); -int vrf_psupp (part_t * p, int mode); -int vrf_supp (supplier_t * s, int mode); -int vrf_order_line (order_t * o, int mode); -int vrf_part_psupp (part_t * p, int mode); -int vrf_nation (code_t * c, int mode); -int vrf_region (code_t * c, int mode); -#endif - - -#ifdef SSBM -tdef tdefs[] = -{ - - {"part.tbl", "part table", 200000, hd_part, - {pr_part, ld_part}, sd_part, vrf_part, PSUPP, 0}, - {0,0,0,0,{0,0}, 0,0,0,0}, - {"supplier.tbl", "suppliers table", 10000, hd_supp, - {pr_supp, ld_supp}, sd_supp, vrf_supp, NONE, 0}, - - {"customer.tbl", "customers table", 30000, hd_cust, - {pr_cust, ld_cust}, sd_cust, vrf_cust, NONE, 0}, - {"date.tbl","date table",2556,0,{pr_date,ld_date}, 0,vrf_date, NONE,0}, - /*line order is SF*1,500,000, however due to the implementation - the base here is 150,000 instead if 1500,000*/ - {"lineorder.tbl", "lineorder table", 150000, hd_line, - {pr_line, ld_line}, sd_line, vrf_line, NONE, 0}, - {0,0,0,0,{0,0}, 0,0,0,0}, - {0,0,0,0,{0,0}, 0,0,0,0}, - {0,0,0,0,{0,0}, 0,0,0,0}, - {0,0,0,0,{0,0}, 0,0,0,0}, -}; - -#else - -tdef tdefs[] = -{ - {"part.tbl", "part table", 200000, hd_part, - {pr_part, ld_part}, sd_part, vrf_part, PSUPP, 0}, - {"partsupp.tbl", "partsupplier table", 200000, hd_psupp, - {pr_psupp, ld_psupp}, sd_psupp, vrf_psupp, NONE, 0}, - {"supplier.tbl", "suppliers table", 10000, hd_supp, - {pr_supp, ld_supp}, sd_supp, vrf_supp, NONE, 0}, - {"customer.tbl", "customers table", 150000, hd_cust, - {pr_cust, ld_cust}, sd_cust, vrf_cust, NONE, 0}, - {"orders.tbl", "order table", 150000, hd_order, - {pr_order, ld_order}, sd_order, vrf_order, LINE, 0}, - {"lineitem.tbl", "lineitem table", 150000, hd_line, - {pr_line, ld_line}, sd_line, vrf_line, NONE, 0}, - {"orders.tbl", "orders/lineitem tables", 150000, hd_order_line, - {pr_order_line, ld_order_line}, sd_order, vrf_order_line, LINE, 0}, - {"part.tbl", "part/partsupplier tables", 200000, hd_part_psupp, - {pr_part_psupp, ld_part_psupp}, sd_part, vrf_part_psupp, PSUPP, 0}, - {"nation.tbl", "nation table", NATIONS_MAX, hd_nation, - {pr_nation, ld_nation}, NO_LFUNC, vrf_nation, NONE, 0}, - {"region.tbl", "region table", NATIONS_MAX, hd_region, - {pr_region, ld_region}, NO_LFUNC, vrf_region, NONE, 0}, -}; -#endif -int *pids; - - -/* -* routines to handle the graceful cleanup of multi-process loads -*/ - -void -stop_proc (int signum) -{ - exit (0); -} - -void -kill_load (void) -{ - int i; - -#if !defined(U2200) && !defined(DOS) - for (i = 0; i < children; i++) - if (pids[i]) - KILL (pids[i]); -#endif /* !U2200 && !DOS */ - return; -} - -/* -* re-set default output file names -*/ -int -set_files (int i, int pload) -{ - char line[80], *new_name; - - if (table & (1 << i)) -child_table: - { - if (pload != -1) - sprintf (line, "%s.%d", tdefs[i].name, pload); - else - { - printf ("Enter new destination for %s data: ", - tdefs[i].name); - if (fgets (line, sizeof (line), stdin) == NULL) - return (-1);; - if ((new_name = strchr (line, '\n')) != NULL) - *new_name = '\0'; - if (strlen (line) == 0) - return (0); - } - new_name = (char *) malloc (strlen (line) + 1); - MALLOC_CHECK (new_name); - strcpy (new_name, line); - tdefs[i].name = new_name; - if (tdefs[i].child != NONE) - { - i = tdefs[i].child; - tdefs[i].child = NONE; - goto child_table; - } - } - - return (0); -} - - - -/* -* read the distributions needed in the benchamrk -*/ -void -load_dists (void) -{ - read_dist (env_config 
(DIST_TAG, DIST_DFLT), "p_cntr", &p_cntr_set); - read_dist (env_config (DIST_TAG, DIST_DFLT), "colors", &colors); - read_dist (env_config (DIST_TAG, DIST_DFLT), "p_types", &p_types_set); - read_dist (env_config (DIST_TAG, DIST_DFLT), "nations", &nations); - read_dist (env_config (DIST_TAG, DIST_DFLT), "regions", ®ions); - read_dist (env_config (DIST_TAG, DIST_DFLT), "o_oprio", - &o_priority_set); - read_dist (env_config (DIST_TAG, DIST_DFLT), "instruct", - &l_instruct_set); - read_dist (env_config (DIST_TAG, DIST_DFLT), "smode", &l_smode_set); - read_dist (env_config (DIST_TAG, DIST_DFLT), "category", - &l_category_set); - read_dist (env_config (DIST_TAG, DIST_DFLT), "rflag", &l_rflag_set); - read_dist (env_config (DIST_TAG, DIST_DFLT), "msegmnt", &c_mseg_set); - - /* load the distributions that contain text generation */ - read_dist (env_config (DIST_TAG, DIST_DFLT), "nouns", &nouns); - read_dist (env_config (DIST_TAG, DIST_DFLT), "verbs", &verbs); - read_dist (env_config (DIST_TAG, DIST_DFLT), "adjectives", &adjectives); - read_dist (env_config (DIST_TAG, DIST_DFLT), "adverbs", &adverbs); - read_dist (env_config (DIST_TAG, DIST_DFLT), "auxillaries", &auxillaries); - read_dist (env_config (DIST_TAG, DIST_DFLT), "terminators", &terminators); - read_dist (env_config (DIST_TAG, DIST_DFLT), "articles", &articles); - read_dist (env_config (DIST_TAG, DIST_DFLT), "prepositions", &prepositions); - read_dist (env_config (DIST_TAG, DIST_DFLT), "grammar", &grammar); - read_dist (env_config (DIST_TAG, DIST_DFLT), "np", &np); - read_dist (env_config (DIST_TAG, DIST_DFLT), "vp", &vp); - -} - -/* -* generate a particular table -*/ -void -gen_tbl (int tnum, long start, long count, long upd_num) -{ - static order_t o; - supplier_t supp; - customer_t cust; - part_t part; -#ifdef SSBM - date_t dt; -#else - code_t code; -#endif - static int completed = 0; - static int init = 0; - long i; - - int rows_per_segment=0; - int rows_this_segment=-1; - int residual_rows=0; - - if (insert_segments) - { - rows_per_segment = count / insert_segments; - residual_rows = count - (rows_per_segment * insert_segments); - } - - if (init == 0) - { - INIT_HUGE(o.okey); - for (i=0; i < O_LCNT_MAX; i++) -#ifdef SSBM - INIT_HUGE(o.lineorders[i].okey); -#else - INIT_HUGE(o.l[i].okey); -#endif - init = 1; - } - - for (i = start; count; count--, i++) - { - LIFENOISE (1000, i); - row_start(tnum); - - switch (tnum) - { - case LINE: -#ifdef SSBM -#else - case ORDER: - case ORDER_LINE: -#endif - mk_order (i, &o, upd_num % 10000); - - if (insert_segments && (upd_num > 0)) - if((upd_num / 10000) < residual_rows) - { - if((++rows_this_segment) > rows_per_segment) - { - rows_this_segment=0; - upd_num += 10000; - } - } - else - { - if((++rows_this_segment) >= rows_per_segment) - { - rows_this_segment=0; - upd_num += 10000; - } - } - - if (set_seeds == 0) - if (validate) - tdefs[tnum].verify(&o, 0); - else - tdefs[tnum].loader[direct] (&o, upd_num); - break; - case SUPP: - mk_supp (i, &supp); - if (set_seeds == 0) - if (validate) - tdefs[tnum].verify(&supp, 0); - else - tdefs[tnum].loader[direct] (&supp, upd_num); - break; - case CUST: - mk_cust (i, &cust); - if (set_seeds == 0) - if (validate) - tdefs[tnum].verify(&cust, 0); - else - tdefs[tnum].loader[direct] (&cust, upd_num); - break; -#ifdef SSBM - case PART: -#else - case PSUPP: - case PART: - case PART_PSUPP: -#endif - mk_part (i, &part); - if (set_seeds == 0) - if (validate) - tdefs[tnum].verify(&part, 0); - else - tdefs[tnum].loader[direct] (&part, upd_num); - break; -#ifdef SSBM - case 
DATE: - mk_date (i, &dt); - if (set_seeds == 0) - if (validate) - tdefs[tnum].verify(&dt, 0); - else - tdefs[tnum].loader[direct] (&dt, 0); - break; -#else - case NATION: - mk_nation (i, &code); - if (set_seeds == 0) - if (validate) - tdefs[tnum].verify(&code, 0); - else - tdefs[tnum].loader[direct] (&code, 0); - break; - case REGION: - mk_region (i, &code); - if (set_seeds == 0) - if (validate) - tdefs[tnum].verify(&code, 0); - else - tdefs[tnum].loader[direct] (&code, 0); - break; -#endif - } - row_stop(tnum); - if (set_seeds && (i % tdefs[tnum].base) < 2) - { - printf("\nSeeds for %s at rowcount %ld\n", tdefs[tnum].comment, i); - dump_seeds(tnum); - } - } - completed |= 1 << tnum; -} - - - -void -usage (void) -{ -#ifdef SSBM - fprintf (stderr, "%s\n%s\n\t%s\n%s %s\n\n", - "USAGE:", - "dbgen [-{vfFD}] [-O {fhmsv}][-T {pcsdla}]", - "[-s ][-C ][-S ]", - "dbgen [-v] [-O {dfhmr}] [-s ]", - "[-U ] [-r ]"); - -#else - fprintf (stderr, "%s\n%s\n\t%s\n%s %s\n\n", - "USAGE:", - "dbgen [-{vfFD}] [-O {fhmsv}][-T {pcsoPSOL}]", - "[-s ][-C ][-S ]", - "dbgen [-v] [-O {dfhmr}] [-s ]", - "[-U ] [-r ]"); -#endif - fprintf (stderr, "-b -- load distributions for \n"); - fprintf (stderr, "-C -- use processes to generate data\n"); - fprintf (stderr, " [Under DOS, must be used with -S]\n"); - fprintf (stderr, "-D -- do database load in line\n"); - fprintf (stderr, "-d -- split deletes between files\n"); - fprintf (stderr, "-f -- force. Overwrite existing files\n"); - fprintf (stderr, "-F -- generate flat files output\n"); - fprintf (stderr, "-h -- display this message\n"); - fprintf (stderr, "-i -- split inserts between files\n"); - fprintf (stderr, "-n -- inline load into database \n"); - fprintf (stderr, "-O d -- generate SQL syntax for deletes\n"); - fprintf (stderr, "-O f -- over-ride default output file names\n"); - fprintf (stderr, "-O h -- output files with headers\n"); - fprintf (stderr, "-O m -- produce columnar output\n"); - fprintf (stderr, "-O r -- generate key ranges for deletes.\n"); - fprintf (stderr, "-O v -- Verify data set without generating it.\n"); - fprintf (stderr, "-q -- enable QUIET mode\n"); - fprintf (stderr, "-r -- updates refresh (n/100)%% of the\n"); - fprintf (stderr, " data set\n"); - fprintf (stderr, "-s -- set Scale Factor (SF) to \n"); - fprintf (stderr, "-S -- build the th step of the data/update set\n"); - -#ifdef SSBM - fprintf (stderr, "-T c -- generate cutomers dimension table ONLY\n"); - fprintf (stderr, "-T p -- generate parts dimension table ONLY\n"); - fprintf (stderr, "-T s -- generate suppliers dimension table ONLY\n"); - fprintf (stderr, "-T d -- generate date dimension table ONLY\n"); - fprintf (stderr, "-T l -- generate lineorder fact table ONLY\n"); -#else - fprintf (stderr, "-T c -- generate cutomers ONLY\n"); - fprintf (stderr, "-T l -- generate nation/region ONLY\n"); - fprintf (stderr, "-T L -- generate lineitem ONLY\n"); - fprintf (stderr, "-T n -- generate nation ONLY\n"); - fprintf (stderr, "-T o -- generate orders/lineitem ONLY\n"); - fprintf (stderr, "-T O -- generate orders ONLY\n"); - fprintf (stderr, "-T p -- generate parts/partsupp ONLY\n"); - fprintf (stderr, "-T P -- generate parts ONLY\n"); - fprintf (stderr, "-T r -- generate region ONLY\n"); - fprintf (stderr, "-T s -- generate suppliers ONLY\n"); - fprintf (stderr, "-T S -- generate partsupp ONLY\n"); -#endif - fprintf (stderr, "-X -- output to stdout\n"); - - fprintf (stderr, "-U -- generate update sets\n"); - fprintf (stderr, "-v -- enable VERBOSE mode\n"); - fprintf (stderr, - "\nTo 
generate the SF=1 (1GB), validation database population, use:\n"); - fprintf (stderr, "\tdbgen -vfF -s 1\n"); - fprintf (stderr, "\nTo generate updates for a SF=1 (1GB), use:\n"); - fprintf (stderr, "\tdbgen -v -U 1 -s 1\n"); -} - -/* -* pload() -- handle the parallel loading of tables -*/ -/* -* int partial(int tbl, int s) -- generate the s-th part of the named tables data -*/ -int -partial (int tbl, int s) -{ - long rowcnt; - long extra; - - if (verbose > 0) - { - fprintf (stderr, "\tStarting to load stage %d of %ld for %s...", - s, children, tdefs[tbl].comment); - } - - if (direct == 0) - set_files (tbl, s); - - rowcnt = set_state(tbl, scale, children, s, &extra); - - if (s == children) - gen_tbl (tbl, rowcnt * (s - 1) + 1, rowcnt + extra, upd_num); - else - gen_tbl (tbl, rowcnt * (s - 1) + 1, rowcnt, upd_num); - - if (verbose > 0) - fprintf (stderr, "done.\n"); - - return (0); -} - -#ifndef DOS - -int -pload (int tbl) -{ - int c = 0, i, status; - - if (verbose > 0) - { - fprintf (stderr, "Starting %ld children to load %s", - children, tdefs[tbl].comment); - } - for (c = 0; c < children; c++) - { - pids[c] = SPAWN (); - if (pids[c] == -1) - { - perror ("Child loader not created"); - kill_load (); - exit (-1); - } - else if (pids[c] == 0) /* CHILD */ - { - SET_HANDLER (stop_proc); - verbose = 0; - partial (tbl, c+1); - exit (0); - } - else if (verbose > 0) /* PARENT */ - fprintf (stderr, "."); - } - - if (verbose > 0) - fprintf (stderr, "waiting..."); - - c = children; - while (c) - { - i = WAIT (&status, pids[c - 1]); - if (i == -1 && children) - { - if (errno == ECHILD) - fprintf (stderr, "\nCould not wait on pid %d\n", pids[c - 1]); - else if (errno == EINTR) - fprintf (stderr, "\nProcess %d stopped abnormally\n", pids[c - 1]); - else if (errno == EINVAL) - fprintf (stderr, "\nProgram bug\n"); - } - if (! 
WIFEXITED(status)) { - (void) fprintf(stderr, "\nProcess %d: ", i); - if (WIFSIGNALED(status)) { - (void) fprintf(stderr, "rcvd signal %d\n", - WTERMSIG(status)); - } else if (WIFSTOPPED(status)) { - (void) fprintf(stderr, "stopped, signal %d\n", - WSTOPSIG(status)); - } - - } - c--; - } - - if (verbose > 0) - fprintf (stderr, "done\n"); - return (0); -} -#endif - - -void -process_options (int count, char **vector) -{ - int option; - - while ((option = getopt (count, vector, - "b:C:Dd:Ffi:hn:O:P:qr:s:S:T:XU:v")) != -1) - switch (option) - { - case 'b': /* load distributions from named file */ - d_path = (char *)malloc(strlen(optarg) + 1); - MALLOC_CHECK(d_path); - strcpy(d_path, optarg); - break; - case 'q': /* all prompts disabled */ - verbose = -1; - break; - case 'i': - insert_segments = atoi (optarg); - break; - case 'd': - delete_segments = atoi (optarg); - break; - case 'S': /* generate a particular STEP */ - step = atoi (optarg); - break; - case 'v': /* life noises enabled */ - verbose = 1; - break; - case 'f': /* blind overwrites; Force */ - force = 1; - break; - case 'T': /* generate a specifc table */ - switch (*optarg) - { -#ifdef SSBM - case 'c': /* generate customer ONLY */ - table = 1 << CUST; - break; - case 'p': /* generate part ONLY */ - table = 1 << PART; - break; - case 's': /* generate partsupp ONLY */ - table = 1 << SUPP; - break; - case 'd': /* generate date ONLY */ - table = 1 << DATE; - break; - case 'l': /* generate lineorder table ONLY */ - table = 1 << LINE; - break; - case 'a': - table = 1 << CUST; - table |= 1 << PART; - table |= 1 << SUPP; - table |= 1 << DATE; - table |= 1 << LINE; - break; -#else - case 'c': /* generate customer ONLY */ - table = 1 << CUST; - break; - case 'L': /* generate lineitems ONLY */ - table = 1 << LINE; - break; - case 'l': /* generate code table ONLY */ - table = 1 << NATION; - table |= 1 << REGION; - break; - case 'n': /* generate nation table ONLY */ - table = 1 << NATION; - break; - case 'O': /* generate orders ONLY */ - table = 1 << ORDER; - break; - case 'o': /* generate orders/lineitems ONLY */ - table = 1 << ORDER_LINE; - break; - case 'P': /* generate part ONLY */ - table = 1 << PART; - break; - case 'p': /* generate part/partsupp ONLY */ - table = 1 << PART_PSUPP; - break; - case 'r': /* generate region table ONLY */ - table = 1 << REGION; - break; - case 'S': /* generate partsupp ONLY */ - table = 1 << PSUPP; - break; - case 's': /* generate suppliers ONLY */ - table = 1 << SUPP; - break; -#endif - default: - fprintf (stderr, "Unknown table name %s\n", - optarg); - usage (); - exit (1); - } - break; - case 'X': - print_to_stdout = 1; - break; - case 's': /* scale by Percentage of base rowcount */ - case 'P': /* for backward compatibility */ - flt_scale = atof (optarg); - if (flt_scale < MIN_SCALE) - { - int i; - - scale = 1; - for (i = PART; i < REGION; i++) - { - tdefs[i].base *= flt_scale; - if (tdefs[i].base < 1) - tdefs[i].base = 1; - } - } - else - scale = (long) flt_scale; - if (scale > MAX_SCALE) - { - fprintf (stderr, "%s %5.0f %s\n\t%s\n\n", - "NOTE: Data generation for scale factors >", - MAX_SCALE, - "GB is still in development,", - "and is not yet supported.\n"); - fprintf (stderr, - "Your resulting data set MAY NOT BE COMPLIANT!\n"); - } - break; - case 'O': /* optional actions */ - switch (tolower (*optarg)) - { - case 'd': /* generate SQL for deletes */ - gen_sql = 1; - break; - case 'f': /* over-ride default file names */ - fnames = 1; - break; - case 'h': /* generate headers */ - header = 1; - break; - 
case 'm': /* generate columnar output */ - columnar = 1; - break; - case 'r': /* generate key ranges for delete */ - gen_rng = 1; - break; - case 's': /* calibrate the RNG usage */ - set_seeds = 1; - break; - case 'v': /* validate the data set */ - validate = 1; - break; - default: - fprintf (stderr, "Unknown option name %s\n", - optarg); - usage (); - exit (1); - } - break; - case 'D': /* direct load of generated data */ - direct = 1; - break; - case 'F': /* generate flat files for later loading */ - direct = 0; - break; - case 'U': /* generate flat files for update stream */ - updates = atoi (optarg); - break; - case 'r': /* set the refresh (update) percentage */ - refresh = atoi (optarg); - break; -#ifndef DOS - case 'C': - children = atoi (optarg); - break; -#endif /* !DOS */ - case 'n': /* set name of database for direct load */ - db_name = (char *) malloc (strlen (optarg) + 1); - MALLOC_CHECK (db_name); - strcpy (db_name, optarg); - break; - default: - printf ("ERROR: option '%c' unknown.\n", - *(vector[optind] + 1)); - case 'h': /* something unexpected */ - fprintf (stderr, - "%s Population Generator (Version %d.%d.%d%s)\n", - NAME, VERSION, RELEASE, - MODIFICATION, PATCH); - fprintf (stderr, "Copyright %s %s\n", TPC, C_DATES); - usage (); - exit (1); - } - - if (print_to_stdout) - { - if (table == 0) - fprintf(stderr, "-X must be used with -T together\n"); - if ((table & (table - 1)) != 0) - fprintf(stderr, "-X must be used towards single -T option\n"); - } - -#ifndef DOS - if (children != 1 && step == -1) - { - pids = malloc(children * sizeof(pid_t)); - MALLOC_CHECK(pids) - } -#else - if (children != 1 && step < 0) - { - fprintf(stderr, "ERROR: -C must be accompanied by -S on this platform\n"); - exit(1); - } -#endif /* DOS */ - - return; -} - -/* -* MAIN -* -* assumes the existance of getopt() to clean up the command -* line handling -*/ -int -main (int ac, char **av) -{ - int i; - - table = (1 << CUST) | - (1 << SUPP) | - (1 << NATION) | - (1 << REGION) | - (1 << PART_PSUPP) | - (1 << ORDER_LINE); - force = 0; - insert_segments=0; - delete_segments=0; - insert_orders_segment=0; - insert_lineitem_segment=0; - delete_segment=0; - verbose = 0; - columnar = 0; - set_seeds = 0; - header = 0; - direct = 0; - scale = 1; - flt_scale = 1.0; - updates = 0; - refresh = UPD_PCT; - step = -1; -#ifdef SSBM - tdefs[LINE].base *= - ORDERS_PER_CUST; /* have to do this after init */ -#else - tdefs[ORDER].base *= - ORDERS_PER_CUST; /* have to do this after init */ - tdefs[LINE].base *= - ORDERS_PER_CUST; /* have to do this after init */ - tdefs[ORDER_LINE].base *= - ORDERS_PER_CUST; /* have to do this after init */ -#endif - fnames = 0; - db_name = NULL; - gen_sql = 0; - gen_rng = 0; - children = 1; - d_path = NULL; - -#ifdef NO_SUPPORT - signal (SIGINT, exit); -#endif /* NO_SUPPORT */ - process_options (ac, av); -#if (defined(WIN32)&&!defined(_POSIX_)) - for (i = 0; i < ac; i++) - { - spawn_args[i] = malloc ((strlen (av[i]) + 1) * sizeof (char)); - MALLOC_CHECK (spawn_args[i]); - strcpy (spawn_args[i], av[i]); - } - spawn_args[ac] = NULL; -#endif - - if (verbose >= 0) - { - fprintf (stderr, - "%s Population Generator (Version %d.%d.%d%s)\n", - NAME, VERSION, RELEASE, MODIFICATION, PATCH); - fprintf (stderr, "Copyright %s %s\n", TPC, C_DATES); - } - - load_dists (); - /* have to do this after init */ - tdefs[NATION].base = nations.count; - tdefs[REGION].base = regions.count; - - /* - * updates are never parallelized - */ - if (updates) - { - /* - * set RNG to start generating rows beyond SF=scale 
- */ - double fix1; - -#ifdef SSBM - set_state (LINE, scale, 1, 2, (long *)&i); - fix1 = (double)tdefs[LINE].base / (double)10000; /*represent the %% percentage (n/100)%*/ -#else - set_state (ORDER, scale, 1, 2, (long *)&i); - fix1 = (double)tdefs[ORDER_LINE].base / (double)10000; -#endif - rowcnt = (int)(fix1 * scale * refresh); - if (step > 0) - { - /* - * adjust RNG for any prior update generation - */ - sd_order(0, rowcnt * (step - 1)); - sd_line(0, rowcnt * (step - 1)); - upd_num = step - 1; - } - else - upd_num = 0; - - while (upd_num < updates) - { - if (verbose > 0) -#ifdef SSBM - fprintf (stderr, - "Generating update pair #%ld for %s [pid: %d]", - upd_num + 1, tdefs[LINE].comment, DSS_PROC); -#else - fprintf (stderr, - "Generating update pair #%ld for %s [pid: %d]", - upd_num + 1, tdefs[ORDER_LINE].comment, DSS_PROC); - -#endif - insert_orders_segment=0; - insert_lineitem_segment=0; - delete_segment=0; - minrow = upd_num * rowcnt + 1; -#ifdef SSBM - gen_tbl (LINE, minrow, rowcnt, upd_num + 1); -#else - gen_tbl (ORDER_LINE, minrow, rowcnt, upd_num + 1); -#endif - if (verbose > 0) - fprintf (stderr, "done.\n"); -#ifdef SSBM - pr_drange (LINE, minrow, rowcnt, upd_num + 1); -#else - pr_drange (ORDER_LINE, minrow, rowcnt, upd_num + 1); -#endif - upd_num++; - } - - exit (0); - } - - /** - ** actual data generation section starts here - **/ -/* - * open database connection or set all the file names, as appropriate - */ - if (direct) - prep_direct ((db_name) ? db_name : DBNAME); - else if (fnames) - for (i = PART; i <= REGION; i++) - { - if (table & (1 << i)) - if (set_files (i, -1)) - { - fprintf (stderr, "Load aborted!\n"); - exit (1); - } - } - -/* - * traverse the tables, invoking the appropriate data generation routine for any to be built - */ - for (i = PART; i <= REGION; i++) - if (table & (1 << i)) - { - if (children > 1 && i < NATION) - if (step >= 0) - { - if (validate) - { - INTERNAL_ERROR("Cannot validate parallel data generation"); - } - else - partial (i, step); - } -#ifdef DOS - else - { - fprintf (stderr, - "Parallel load is not supported on your platform.\n"); - exit (1); - } -#else - else - { - if (validate) - { - INTERNAL_ERROR("Cannot validate parallel data generation"); - } - else - pload (i); - } -#endif /* DOS */ - else - { - minrow = 1; - if (i < NATION) - rowcnt = tdefs[i].base * scale; - else - rowcnt = tdefs[i].base; -#ifdef SSBM - if(i==PART){ - rowcnt = tdefs[i].base * (floor(1+log((double)(scale))/(log(2)))); - } - if(i==DATE){ - rowcnt = tdefs[i].base; - } -#endif - if (verbose > 0) - fprintf (stderr, "%s data for %s [pid: %u]", - (validate)?"Validating":"Generating", tdefs[i].comment, DSS_PROC); - gen_tbl (i, minrow, rowcnt, upd_num); - if (verbose > 0) - fprintf (stderr, "done.\n"); - } - if (validate) - printf("Validation checksum for %s at %ld GB: %0lx\n", - tdefs[i].name, scale, tdefs[i].vtotal); - } - - if (direct) - close_direct (); - - return (0); -} - - - - - - - - - - - diff --git a/utils/ssbm/dss.h b/utils/ssbm/dss.h deleted file mode 100644 index 2fd40a6b3..000000000 --- a/utils/ssbm/dss.h +++ /dev/null @@ -1,596 +0,0 @@ -/* - * Sccsid: @(#)dss.h 2.1.8.5 - * - * general definitions and control information for the DSS code - * generator; if it controls the data set, it's here - */ -#ifndef DSS_H -#define DSS_H - -#ifdef SSBM -#define NAME "SSBM (Star Schema Benchmark)" -#define VERSION 1 -#define RELEASE 0 -#define MODIFICATION 0 -#define PATCH "" - - -/*global variables*/ -/*SSBM added DATE table*/ -#define DATE 4 - -/*SSBM use the lineorder 
without partsupp and order table*/ -#define L_SKEY_MIN 1 -#define L_SKEY_MAX (tdefs[SUPP].base * scale) - -#endif - -#ifdef TPCH -#define NAME "TPC-H" -#define VERSION 1 -#define RELEASE 3 -#define MODIFICATION 0 -#define PATCH "" -#endif -#ifdef TPCR -#define NAME "TPC-R" -#define VERSION 1 -#define RELEASE 3 -#define MODIFICATION 0 -#define PATCH "" -#endif -#ifndef NAME -#error Benchmark version must be defined in config.h -#endif -#define TPC "Transaction Processing Performance Council" -#define C_DATES "1994 - 2000" - -#include "config.h" -#include "shared.h" - -#include -#include -#ifdef SSBM -#include -#endif - -#define NONE -1 -#define PART 0 -#define PSUPP 1 -#define SUPP 2 -#define CUST 3 -#define ORDER 4 -#define LINE 5 -#define ORDER_LINE 6 -#define PART_PSUPP 7 -#define NATION 8 -#define REGION 9 -#define UPDATE 10 -#define MAX_TABLE 11 -#define ONE_STREAM 1 -#define ADD_AT_END 2 - -#ifdef MAX -#undef MAX -#endif -#ifdef MIN -#undef MIN -#endif -#define MAX(a,b) ((a > b )?a:b) -#define MIN(A,B) ( (A) < (B) ? (A) : (B)) - -#define INTERNAL_ERROR(p) {fprintf(stderr,"%s", p);abort();} -#define LN_CNT 4 -static char lnoise[4] = {'|', '/', '-', '\\' }; -#define LIFENOISE(n, var) \ - if (verbose > 0) fprintf(stderr, "%c\b", lnoise[(var%LN_CNT)]) - -#define MALLOC_CHECK(var) \ - if ((var) == NULL) \ - { \ - fprintf(stderr, "Malloc failed at %s:%d\n", \ - __FILE__, __LINE__); \ - exit(1);\ - } -#define OPEN_CHECK(var, path) \ - if ((var) == NULL) \ - { \ - fprintf(stderr, "Open failed for %s at %s:%d\n", \ - path, __FILE__, __LINE__); \ - exit(1);\ - } -#ifndef MAX_CHILDREN -#define MAX_CHILDREN 1000 -#endif - -/* - * macros that control sparse keys - * - * refer to Porting.Notes for a complete explanation - */ -#ifndef BITS_PER_LONG -#define BITS_PER_LONG 32 -#define MAX_LONG 0x7FFFFFFF -#endif /* BITS_PER_LONG */ -#define SPARSE_BITS 2 -#define SPARSE_KEEP 3 -#define MK_SPARSE(key, seq) \ - (((((key>>3)<<2)|(seq & 0x0003))<<3)|(key & 0x0007)) - -#define RANDOM(tgt, lower, upper, stream) dss_random(&tgt, lower, upper, stream) -#ifdef SSBM -typedef struct{ - char * name; - int start_day; - int start_month; - int end_day; - int end_month; -} season; -typedef struct { - char * name; - int month; - int day; -} holiday; - - -#endif - - -typedef struct -{ - long weight; - char *text; -} set_member; - -typedef struct -{ - int count; - int max; - set_member *list; - long *permute; -} distribution; - -/* - * some handy access functions - */ -#define DIST_SIZE(d) d->count -#define DIST_MEMBER(d, i) ((set_member *)((d)->list + i))->text - -typedef struct -{ - char *name; - char *comment; - long base; - int (*header) (); - int (*loader[2]) (); - long (*gen_seed)(); - int (*verify) (); - int child; - unsigned long vtotal; -} tdef; - -typedef struct SEED_T { - long table; - long value; - long usage; - long boundary; - } seed_t; - - -#if defined(__STDC__) -#define PROTO(s) s -#else -#define PROTO(s) () -#endif - -/* bm_utils.c */ -char *env_config PROTO((char *var, char *dflt)); -long yes_no PROTO((char *prompt)); -int a_rnd PROTO((int min, int max, int column, char *dest)); -int tx_rnd PROTO((long min, long max, long column, char *tgt)); -long julian PROTO((long date)); -long unjulian PROTO((long date)); -FILE *tbl_open PROTO((int tbl, char *mode)); -long dssncasecmp PROTO((char *s1, char *s2, int n)); -long dsscasecmp PROTO((char *s1, char *s2)); -int pick_str PROTO((distribution * s, int c, char *target)); -void agg_str PROTO((distribution *set, long count, long col, char *dest)); -void 
read_dist PROTO((char *path, char *name, distribution * target)); -void embed_str PROTO((distribution *d, int min, int max, int stream, char *dest)); -#ifndef STDLIB_HAS_GETOPT -int getopt PROTO((int arg_cnt, char **arg_vect, char *oprions)); -#endif /* STDLIB_HAS_GETOPT */ -long set_state PROTO((int t, long scale, long procs, long step, long *e)); - -/* rnd.c */ -long NextRand PROTO((long nSeed)); -long UnifInt PROTO((long nLow, long nHigh, long nStream)); -double UnifReal PROTO((double dLow, double dHigh, long nStream)); -double Exponential PROTO((double dMean, long nStream)); -void dss_random(long *tgt, long min, long max, long seed); -void row_start(int t); -void row_stop(int t); -void dump_seeds(int t); - -/* text.c */ -#define MAX_GRAMMAR_LEN 12 /* max length of grammar component */ -#define MAX_SENT_LEN 256 /* max length of populated sentence */ -#define RNG_PER_SENT 27 /* max number of RNG calls per sentence */ - -int dbg_text PROTO((char * t, int min, int max, int s)); - -#ifdef DECLARER -#define EXTERN -#else -#define EXTERN extern -#endif /* DECLARER */ - -/* print.c */ -extern int print_to_stdout; - -EXTERN distribution nations; -EXTERN distribution nations2; -EXTERN distribution regions; -EXTERN distribution o_priority_set; -EXTERN distribution l_instruct_set; -EXTERN distribution l_smode_set; -EXTERN distribution l_category_set; -EXTERN distribution l_rflag_set; -EXTERN distribution c_mseg_set; -EXTERN distribution colors; -EXTERN distribution p_types_set; -EXTERN distribution p_cntr_set; - -/* distributions that control text generation */ -EXTERN distribution articles; -EXTERN distribution nouns; -EXTERN distribution adjectives; -EXTERN distribution adverbs; -EXTERN distribution prepositions; -EXTERN distribution verbs; -EXTERN distribution terminators; -EXTERN distribution auxillaries; -EXTERN distribution np; -EXTERN distribution vp; -EXTERN distribution grammar; - - -EXTERN long scale; -EXTERN int refresh; -EXTERN int resume; -EXTERN long verbose; -EXTERN long force; -EXTERN long header; -EXTERN long columnar; -EXTERN long direct; -EXTERN long updates; -EXTERN long table; -EXTERN long children; -EXTERN long fnames; -EXTERN int gen_sql; -EXTERN int gen_rng; -EXTERN char *db_name; -EXTERN int step; -EXTERN int set_seeds; -EXTERN int validate; -EXTERN char *d_path; - -/* added for segmented updates */ -EXTERN int insert_segments; -EXTERN int delete_segments; -EXTERN int insert_orders_segment; -EXTERN int insert_lineitem_segment; -EXTERN int delete_segment; - - -#ifndef DECLARER -extern tdef tdefs[]; - -#endif /* DECLARER */ - - -/***************************************************************** - ** table level defines use the following naming convention: t_ccc_xxx - ** with: t, a table identifier - ** ccc, a column identifier - ** xxx, a limit type - **************************************************************** - */ - -/* - * defines which control the parts table - */ -#define P_SIZE 126 -#ifdef SSBM -#define P_NAME_SCL 3 /*5 change to 3 according to the new schema*/ -#else -#define P_NAME_SCL 5 -#endif -#define P_MFG_TAG "Manufacturer#" -#define P_MFG_FMT "%s%01d" -#define P_MFG_MIN 1 -#define P_MFG_MAX 5 -#define P_BRND_TAG "Brand#" -#define P_BRND_FMT "%s%02d" -#define P_BRND_MIN 1 - -/*#ifdef SSBM -#define P_BRND_MAX 5 -#else*/ -#define P_BRND_MAX 40 -/*#endif*/ - -#define P_SIZE_MIN 1 -#define P_SIZE_MAX 50 -#define P_MCST_MIN 100 -#define P_MCST_MAX 99900 -#define P_MCST_SCL 100.0 -#define P_RCST_MIN 90000 -#define P_RCST_MAX 200000 -#define P_RCST_SCL 100.0 -/* - 
* defines which control the suppliers table - */ -#define S_SIZE 145 -#define S_NAME_TAG "Supplier#" -#define S_NAME_FMT "%s%09ld" -#define S_ABAL_MIN -99999 -#define S_ABAL_MAX 999999 -#define S_CMNT_MAX 101 -#define S_CMNT_BBB 10 /* number of BBB comments/SF */ -#define BBB_DEADBEATS 50 /* % that are complaints */ -#define BBB_BASE "Customer " -#define BBB_COMPLAIN "Complaints" -#define BBB_COMMEND "Recommends" -#define BBB_CMNT_LEN 19 -#define BBB_BASE_LEN 9 -#define BBB_TYPE_LEN 10 - -/* - * defines which control the partsupp table - */ -#define PS_SIZE 145 -#define PS_SKEY_MIN 0 -#define PS_SKEY_MAX ((tdefs[SUPP].base - 1) * scale) -#define PS_SCST_MIN 100 -#define PS_SCST_MAX 100000 -#define PS_QTY_MIN 1 -#define PS_QTY_MAX 9999 -/* - * defines which control the customers table - */ -#define C_SIZE 165 -#define C_NAME_TAG "Customer#" -#define C_NAME_FMT "%s%09ld" -#define C_MSEG_MAX 5 -#define C_ABAL_MIN -99999 -#define C_ABAL_MAX 999999 -/* - * defines which control the order table - */ -#define O_SIZE 109 -#define O_CKEY_MIN 1 -#define O_CKEY_MAX (long)(tdefs[CUST].base * scale) -#define O_ODATE_MIN STARTDATE -#define O_ODATE_MAX (STARTDATE + TOTDATE - \ - (L_SDTE_MAX + L_RDTE_MAX) - 1) -#define O_CLRK_TAG "Clerk#" -#define O_CLRK_FMT "%s%09d" -#define O_CLRK_SCL 1000 -#define O_LCNT_MIN 1 -#define O_LCNT_MAX 7 - -/* - * defines which control the lineitem table - */ -#define L_SIZE 144L -#define L_QTY_MIN 1 -#define L_QTY_MAX 50 -#define L_TAX_MIN 0 -#define L_TAX_MAX 8 -#define L_DCNT_MIN 0 -#define L_DCNT_MAX 10 -#define L_PKEY_MIN 1 - -#ifdef SSBM -/*part table log based*/ -#define L_PKEY_MAX (tdefs[PART].base * (floor(log((double)scale))+1)) -#else -#define L_PKEY_MAX (tdefs[PART].base * scale) -#endif - -#define L_SDTE_MIN 1 -#define L_SDTE_MAX 121 -#define L_CDTE_MIN 30 -#define L_CDTE_MAX 90 -#define L_RDTE_MIN 1 -#define L_RDTE_MAX 30 -/* - * defines which control the time table - */ -#define T_SIZE 30 -#define T_START_DAY 3 /* wednesday ? 
*/ -#define LEAP(y) ((!(y % 4) && (y % 100))?1:0) - -/******************************************************************* - ******************************************************************* - *** - *** general or inter table defines - *** - ******************************************************************* - *******************************************************************/ -#define SUPP_PER_PART 4 -#define ORDERS_PER_CUST 10 /* sync this with CUST_MORTALITY */ -#define CUST_MORTALITY 3 /* portion with have no orders */ -#define NATIONS_MAX 90 /* limited by country codes in phone numbers */ -#define PHONE_FMT "%02d-%03d-%03d-%04d" -#define STARTDATE 92001 -#define CURRENTDATE 95168 -#define ENDDATE 98365 -#define TOTDATE 2557 -#define UPD_PCT 10 -#define MAX_STREAM 47 -#define V_STR_LOW 0.4 -#define PENNIES 100 /* for scaled int money arithmetic */ -#define Q11_FRACTION (double)0.0001 -/* - * max and min SF in GB; Larger SF will require changes to the build routines - */ -#define MIN_SCALE 1.0 -#define MAX_SCALE 1000.0 -/* - * beyond this point we need to allow for BCD calculations - */ -#define MAX_32B_SCALE 1000.0 -#define INIT_HUGE(v) { \ - v = (DSS_HUGE *)malloc(sizeof(DSS_HUGE) * HUGE_COUNT); \ - MALLOC_CHECK(v); \ - } -#define FREE_HUGE(v) free(v) -#ifdef SUPPORT_64BITS -#define LONG2HUGE(src, dst) *dst = (DSS_HUGE)src -#define HUGE2LONG(src, dst) *dst = (long)src -#define HUGE_SET(src, dst) *dst = *src -#define HUGE_MUL(op1, op2) *op1 *= op2 -#define HUGE_DIV(op1, op2) *op1 /= op2 -#define HUGE_ADD(op1, op2, dst) *dst = *op1 + op2 -#define HUGE_SUB(op1, op2, dst) *dst = *op1 - op2 -#define HUGE_MOD(op1, op2) *op1 % op2 -#define HUGE_CMP(op1, op2) (*op1 == *op2)?0:(*op1 < *op2)-1:1 -#else -#define LONG2HUGE(src, dst) {*dst = src; *(dst + 1) = 0;} -#define HUGE2LONG(src, dst) { dst=0 ; \ - bcd2_bin(dst, (src + 1)); \ - bcd2_bin(dst, src); } -#define HUGE_SET(src, dst) { *dst = *src ; *(dst + 1) = *(src + 1); } -#define HUGE_MUL(op1,op2) bcd2_mul(op1, (op1 + 1), op2) -#define HUGE_DIV(op1,op2) bcd2_div(op1, (op1 + 1), op2) -#define HUGE_ADD(op1,op2,d) { \ - HUGE_SET(op1, d); \ - bcd2_add(d, (d + 1), op2); \ - } -#define HUGE_SUB(op1,op2,d) { \ - HUGE_SET(op1, d); \ - bcd2_sub(d, (d + 1), op2); \ - } -#define HUGE_MOD(op1, op2) bcd2_mod(op1, (op1 + 1), op2) -#define HUGE_CMP(op1, op2) (bcd2_cmp(op1, (op1 + 1), op2) == 0)?0:\ - ((bcd2_cmp(op1, (op1 + 1), op2) < 0)?-1:1) -#endif /* SUPPORT_64BITS */ - -/******** environmental variables and defaults ***************/ -#define DIST_TAG "DSS_DIST" /* environment var to override ... */ -#define DIST_DFLT "dists.dss" /* default file to hold distributions */ -#define PATH_TAG "DSS_PATH" /* environment var to override ... */ -#define PATH_DFLT "." /* default directory to hold tables */ -#define CONFIG_TAG "DSS_CONFIG" /* environment var to override ... */ -#define CONFIG_DFLT "." /* default directory to config files */ -#define ADHOC_TAG "DSS_ADHOC" /* environment var to override ... 
*/ -#define ADHOC_DFLT "adhoc.dss" /* default file name for adhoc vars */ - -/******* output macros ********/ -#ifndef SEPARATOR -#define SEPARATOR '|' /* field spearator for generated flat files */ -#endif -/* Data type flags for a single print routine */ -#define DT_STR 0 -#ifndef MVS -#define DT_VSTR DT_STR -#else -#define DT_VSTR 1 -#endif /* MVS */ -#define DT_INT 2 -#define DT_HUGE 3 -#define DT_KEY 4 -#define DT_MONEY 5 -#define DT_CHR 6 - -int dbg_print(int dt, FILE *tgt, long data, int len, int eol); -#define PR_STR(f, str, len) dbg_print(DT_STR, f, (long)str, len, 1) -#define PR_STR_LAST(f, str, len) dbg_print(DT_STR, f, (long)str, len, 0) -#define PR_VSTR(f, str, len) dbg_print(DT_VSTR, f, (long)str, len, 1) -#define PR_VSTR_LAST(f, str, len) dbg_print(DT_VSTR, f, (long)str, len, 0) -#define PR_INT(f, str) dbg_print(DT_INT, f, str, 0, 1) -#define PR_HUGE(f, str) dbg_print(DT_HUGE, f, (long)str, 0, 1) -#define PR_KEY(f, str) dbg_print(DT_KEY, f, str, 0, -1) -#define PR_MONEY(f, str) dbg_print(DT_MONEY, f, str, 0, 1) -#define PR_CHR(f, str) dbg_print(DT_CHR, f, str, 0, 1) -#define PR_STRT(fp) /* any line prep for a record goes here */ -#define PR_END(fp) {fputc('\n', fp);} /* finish the record here */ - -#ifdef SSBM -#define PR_DATE(tgt, yr, mn, dy) \ - sprintf(tgt, "19%02d%02d%02d", (int)(yr), (int)(mn), (int)(dy)) -#else -#ifdef MDY_DATE -#define PR_DATE(tgt, yr, mn, dy) \ - sprintf(tgt, "%02d-%02d-19%02d", mn, dy, yr) -#else -#define PR_DATE(tgt, yr, mn, dy) \ -sprintf(tgt, "19%02d-%02d-%02d", yr, mn, dy) -#endif /* DATE_FORMAT */ -#endif -/* - * verification macros - */ -#define VRF_STR(t, d) {char *xx = d; while (*xx) tdefs[t].vtotal += *xx++;} -#define VRF_INT(t,d) tdefs[t].vtotal += d -#ifdef SUPPORT_64BITS -#define VRF_HUGE(t,d) tdefs[t].vtotal = *((long *)&d) + *((long *)(&d + 1)) -#else -#define VRF_HUGE(t,d) tdefs[t].vtotal += d[0] + d[1] -#endif /* SUPPORT_64BITS */ -/* assume float is a 64 bit quantity */ -#define VRF_MONEY(t,d) tdefs[t].vtotal = *((long *)&d) + *((long *)(&d + 1)) -#define VRF_CHR(t,d) tdefs[t].vtotal += d -#define VRF_STRT(t) -#define VRF_END(t) - -/*********** distribuitons currently defined *************/ -#define UNIFORM 0 - -/* - * seed indexes; used to separate the generation of individual columns - */ -#define P_MFG_SD 0 -#define P_BRND_SD 1 -#define P_TYPE_SD 2 -#define P_SIZE_SD 3 -#define P_CNTR_SD 4 -#define P_RCST_SD 5 -#define PS_QTY_SD 7 -#define PS_SCST_SD 8 -#define O_SUPP_SD 10 -#define O_CLRK_SD 11 -#define O_ODATE_SD 13 -#define L_QTY_SD 14 -#define L_DCNT_SD 15 -#define L_TAX_SD 16 -#define L_SHIP_SD 17 -#define L_SMODE_SD 18 -#define L_PKEY_SD 19 -#define L_SKEY_SD 20 -#define L_SDTE_SD 21 -#define L_CDTE_SD 22 -#define L_RDTE_SD 23 -#define L_RFLG_SD 24 -#define C_NTRG_SD 27 -#define C_PHNE_SD 28 -#define C_ABAL_SD 29 -#define C_MSEG_SD 30 -#define S_NTRG_SD 33 -#define S_PHNE_SD 34 -#define S_ABAL_SD 35 -#define P_NAME_SD 37 -#define O_PRIO_SD 38 -#define HVAR_SD 39 -#define O_CKEY_SD 40 -#define N_CMNT_SD 41 -#define R_CMNT_SD 42 -#define O_LCNT_SD 43 -#define BBB_JNK_SD 44 -#define BBB_TYPE_SD 45 -#define BBB_CMNT_SD 46 -#define BBB_OFFSET_SD 47 - -#endif /* DSS_H */ diff --git a/utils/ssbm/dsstypes.h b/utils/ssbm/dsstypes.h deleted file mode 100644 index ce2b7d8bb..000000000 --- a/utils/ssbm/dsstypes.h +++ /dev/null @@ -1,312 +0,0 @@ - /* - * Sccsid: @(#)dsstypes.h 2.1.8.1 - * - * general definitions and control information for the DSS data types - * and function prototypes - * Modified for SSBM prototype - */ - -/* - * 
typedefs - */ -#ifdef SSBM -typedef struct -{ - long custkey; - char name[C_NAME_LEN + 1]; - int nlen; - char address[C_ADDR_MAX + 1]; - int alen; - char city[CITY_FIX+1]; - int nation_key; - char nation_name[C_NATION_NAME_LEN+1]; - int region_key; - char region_name[C_REGION_NAME_LEN+1]; - char phone[PHONE_LEN + 1]; - char mktsegment[MAXAGG_LEN + 1]; -} customer_t; -#else -typedef struct -{ - long custkey; - char name[C_NAME_LEN + 1]; - char address[C_ADDR_MAX + 1]; - int alen; - long nation_code; - char phone[PHONE_LEN + 1]; - long acctbal; - char mktsegment[MAXAGG_LEN + 1]; - char comment[C_CMNT_MAX + 1]; - int clen; -} customer_t; -#endif - -/* customers.c */ -long mk_cust PROTO((long n_cust, customer_t * c)); -int pr_cust PROTO((customer_t * c, int mode)); -int ld_cust PROTO((customer_t * c, int mode)); - -#ifdef SSBM - -typedef struct -{ - DSS_HUGE *okey; /*for clustering line items*/ - int linenumber; /*integer, constrain to max of 7*/ - long custkey; - long partkey; - long suppkey; - char orderdate[DATE_LEN]; - char opriority[MAXAGG_LEN + 1]; - long ship_priority; - long quantity; - long extended_price; - long order_totalprice; - long discount; - long revenue; - long supp_cost; - long tax; - char commit_date[DATE_LEN] ; - char shipmode[O_SHIP_MODE_LEN + 1]; -} lineorder_t; -#else -typedef struct -{ - DSS_HUGE *okey; - long partkey; - long suppkey; - long lcnt; - long quantity; - long eprice; - long discount; - long tax; - char rflag[1]; - char lstatus[1]; - char cdate[DATE_LEN]; - char sdate[DATE_LEN]; - char rdate[DATE_LEN]; - char shipinstruct[MAXAGG_LEN + 1]; - char shipmode[MAXAGG_LEN + 1]; - char comment[L_CMNT_MAX + 1]; - int clen; -} line_t; -#endif - -#ifdef SSBM -typedef struct -{ - DSS_HUGE *okey; - long custkey; - int totalprice; - char odate[DATE_LEN]; - char opriority[MAXAGG_LEN + 1]; - char clerk[O_CLRK_LEN + 1]; - int spriority; - long lines; - lineorder_t lineorders[O_LCNT_MAX]; -} order_t; -#else -typedef struct -{ - DSS_HUGE *okey; - long custkey; - char orderstatus; - long totalprice; - char odate[DATE_LEN]; - char opriority[MAXAGG_LEN + 1]; - char clerk[O_CLRK_LEN + 1]; - long spriority; - long lines; - char comment[O_CMNT_MAX + 1]; - int clen; - line_t l[O_LCNT_MAX]; -} order_t; -#endif - -/* order.c */ -long mk_order PROTO((long index, order_t * o, long upd_num)); -int pr_order PROTO((order_t * o, int mode)); -int ld_order PROTO((order_t * o, int mode)); -void ez_sparse PROTO((long index, DSS_HUGE *ok, long seq)); -#ifndef SUPPORT_64BITS -void hd_sparse PROTO((long index, DSS_HUGE *ok, long seq)); -#endif - -#ifdef SSBM -/*SSBM removes the part supplier table*/ -#else -typedef struct -{ - long partkey; - long suppkey; - long qty; - long scost; - char comment[PS_CMNT_MAX + 1]; - int clen; -} partsupp_t; -#endif - -#ifdef SSBM -typedef struct -{ - long partkey; - char name[P_NAME_LEN + 1]; - int nlen; - char mfgr[P_MFG_LEN + 1]; - char category[P_CAT_LEN + 1]; - char brand[P_BRND_LEN + 1]; - char color[P_COLOR_MAX + 1]; - int clen; - char type[P_TYPE_MAX + 1]; - int tlen; - long size; - char container[P_CNTR_LEN + 1]; -} part_t; -#else -typedef struct -{ - long partkey; - char name[P_NAME_LEN + 1]; - int nlen; - char mfgr[P_MFG_LEN + 1]; - char brand[P_BRND_LEN + 1]; - char type[P_TYPE_LEN + 1]; - int tlen; - long size; - char container[P_CNTR_LEN + 1]; - long retailprice; - char comment[P_CMNT_MAX + 1]; - int clen; - partsupp_t s[SUPP_PER_PART]; -} part_t; -#endif - -/* parts.c */ -long mk_part PROTO((long index, part_t * p)); -int pr_part PROTO((part_t * part, 
int mode)); -int ld_part PROTO((part_t * part, int mode)); - -#ifdef SSBM -typedef struct -{ - long suppkey; - char name[S_NAME_LEN + 1]; - char address[S_ADDR_MAX + 1]; - int alen; - char city[CITY_FIX +1]; - int nation_key; - char nation_name[S_NATION_NAME_LEN+1]; - int region_key; - char region_name[S_REGION_NAME_LEN+1]; - char phone[PHONE_LEN + 1]; -} supplier_t; -#else -typedef struct -{ - long suppkey; - char name[S_NAME_LEN + 1]; - char address[S_ADDR_MAX + 1]; - int alen; - long nation_code; - char phone[PHONE_LEN + 1]; - long acctbal; - char comment[S_CMNT_MAX + 1]; - int clen; -} supplier_t; -#endif - -/* supplier.c */ -long mk_supp PROTO((long index, supplier_t * s)); -int pr_supp PROTO((supplier_t * supp, int mode)); -int ld_supp PROTO((supplier_t * supp, int mode)); - -#ifdef SSBM -/*todo: add new date table*/ - -typedef struct -{ - long datekey; - char date[D_DATE_LEN+1]; - char dayofweek[D_DAYWEEK_LEN+1] ; - char month[D_MONTH_LEN+1]; - int year; - int yearmonthnum; - char yearmonth[D_YEARMONTH_LEN+1]; - int daynuminweek; - int daynuminmonth; - int daynuminyear; - int monthnuminyear; - int weeknuminyear; - char sellingseason[D_SEASON_LEN + 1]; - int slen; - char lastdayinweekfl[2]; - char lastdayinmonthfl[2]; - char holidayfl[2]; - char weekdayfl[2]; -} date_t; - -/* date.c */ - -long mk_date PROTO((long index, date_t * d)); -int pr_date PROTO((date_t * date, int mode)); -int ld_date PROTO((date_t * date, int mode)); - -#endif - -typedef struct -{ - long timekey; - char alpha[DATE_LEN]; - long year; - long month; - long week; - long day; -} dss_time_t; - -/* time.c */ -long mk_time PROTO((long index, dss_time_t * t)); - - - -/* - * this assumes that N_CMNT_LEN >= R_CMNT_LEN - */ -typedef struct -{ - long code; - char *text; - long join; - char comment[N_CMNT_MAX + 1]; - int clen; -} code_t; - -/* code table */ -int mk_nation PROTO((long i, code_t * c)); -int pr_nation PROTO((code_t * c, int mode)); -int ld_nation PROTO((code_t * c, int mode)); -int mk_region PROTO((long i, code_t * c)); -int pr_region PROTO((code_t * c, int mode)); -int ld_region PROTO((code_t * c, int mode)); - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/utils/ssbm/load_stub.c b/utils/ssbm/load_stub.c deleted file mode 100644 index c2c552aa1..000000000 --- a/utils/ssbm/load_stub.c +++ /dev/null @@ -1,282 +0,0 @@ -/***************************************************************** - * Title: load_stub.c - * Sccsid: @(#)load_stub.c 2.1.8.1 - * Description: - * stub routines for: - * inline load of dss benchmark - * header creation for dss benchmark - * - ***************************************************************** - */ - -#include -#include "config.h" -#include "dss.h" -#include "dsstypes.h" - -int -close_direct(void) -{ - /* any post load cleanup goes here */ - return(0); -} - -int -prep_direct(void) -{ - /* any preload prep goes here */ - return(0); -} - -int -hd_cust (FILE *f) -{ - static int count = 0; - - if (! count++) - printf("No header has been defined for the customer table\n"); - - return(0); -} - -int -ld_cust (customer_t *cp, int mode) -{ - static int count = 0; - - if (! count++) - printf("%s %s\n", - "No load routine has been defined", - "for the customer table"); - - return(0); -} - -int -hd_part (FILE *f) -{ - static int count = 0; - - if (! count++) - printf("No header has been defined for the part table\n"); - - return(0); -} - -int -ld_part (part_t *pp, int mode) -{ - static int count = 0; - - if (! 
count++) - printf("No load routine has been defined for the part table\n"); - - return(0); -} - -int -ld_psupp (part_t *pp, int mode) -{ - static int count = 0; - - if (! count++) - printf("%s %s\n", - "No load routine has been defined for the", - "psupp table\n"); - - return(0); - -} - - -int -hd_supp (FILE *f) -{ - static int count = 0; - - if (! count++) - printf("No header has been defined for the supplier table\n"); - - return(0); -} - -int -ld_supp (supplier_t *sp, int mode) -{ - static int count = 0; - - if (! count++) - printf("%s %s\n", - "No load routine has been defined", - "for the supplier table\n"); - - return(0); -} - - -int -hd_order (FILE *f) -{ - static int count = 0; - - if (! count++) - printf("No header has been defined for the order table\n"); - - return(0); -} - -int -ld_order (order_t *p, int mode) -{ - static int count = 0; - - if (! count++) - printf("%s %s\n", - "No load routine has been defined", - "for the order table"); - - return(0); -} - -int -ld_line (order_t *p, int mode) -{ - static int count = 0; - - if (! count++) - printf("%s %s\n", - "No load routine has been defined", - "for the line table"); - - return(0); -} - - - -int -hd_psupp (FILE *f) -{ - static int count = 0; - - if (! count++) - printf("%s %s\n", - "No header has been defined for the", - "part supplier table"); - - return(0); -} - - -int -hd_line (FILE *f) -{ - static int count = 0; - - if (! count++) - printf("No header has been defined for the lineitem table\n"); - - return(0); -} - -int -hd_nation (FILE *f) -{ - static int count = 0; - - if (! count++) - printf("No header has been defined for the nation table\n"); - - return(0); -} - -#ifdef SSBM -#else -int -ld_nation (code_t *cp, int mode) -{ - static int count = 0; - - if (! count++) - printf("%s %s\n", - "No load routine has been defined", - "for the nation table"); - - return(0); -} - -int -hd_region (FILE *f) -{ - static int count = 0; - - if (! count++) - printf("No header has been defined for the region table\n"); - - return(0); -} - -int -ld_region (code_t *cp, int mode) -{ - static int count = 0; - - if (! 
count++) - printf("%s %s\n", - "No load routine has been defined", - "for the region table"); - - return(0); -} - -int -ld_order_line (order_t *p, int mode) -{ - ld_order(p, mode); - ld_line (p, mode); - - return(0); -} - -int -hd_order_line (FILE *f) -{ - hd_order(f); - hd_line (f); - - return(0); -} - -int -ld_part_psupp (part_t *p, int mode) -{ - ld_part(p, mode); - ld_psupp (p, mode); - - return(0); -} - -int -hd_part_psupp (FILE *f) -{ - hd_part(f); - hd_psupp(f); - - return(0); -} -#endif - -#ifdef SSBM -int -ld_date (date_t *d, int mode) -{ - /*do nothing for now*/ - return(0); -} - -#endif - - - - - - diff --git a/utils/ssbm/permute.c b/utils/ssbm/permute.c deleted file mode 100644 index b34f04cbd..000000000 --- a/utils/ssbm/permute.c +++ /dev/null @@ -1,175 +0,0 @@ -/* @(#)permute.c 2.1.8.3 */ -/* -* permute.c -- a permutation generator for the query -* sequences in TPC-H and TPC-R -*/ - -#ifdef TEST -#define DECLARER -#endif -#include "config.h" -#include "dss.h" -#ifdef TEST -#include -#if (defined(_POSIX_)||!defined(WIN32)) /* Change for Windows NT */ -#include -#include -#endif /* WIN32 */ -#include /* */ -#include -#include -#include -#include -#include -#include -#ifdef HP -#include -#endif -#if (defined(WIN32)&&!defined(_POSIX_)) -#include -#pragma warning(disable:4201) -#pragma warning(disable:4214) -#pragma warning(disable:4514) -#define WIN32_LEAN_AND_MEAN -#define NOATOM -#define NOGDICAPMASKS -#define NOMETAFILE -#define NOMINMAX -#define NOMSG -#define NOOPENFILE -#define NORASTEROPS -#define NOSCROLL -#define NOSOUND -#define NOSYSMETRICS -#define NOTEXTMETRIC -#define NOWH -#define NOCOMM -#define NOKANJI -#define NOMCX -#include -#pragma warning(default:4201) -#pragma warning(default:4214) -#endif -#endif - -long NextRand(long seed); -long *permute(long *set, int cnt, long stream); -long *permute_dist(distribution *d, long stream); -long seed; -char *eol[2] = {" ", "},"}; -extern seed_t Seed[]; -#ifdef TEST -tdef tdefs = { NULL }; -#endif - - -#define MAX_QUERY 22 -#define ITERATIONS 1000 -#define UNSET 0 - -long * -permute(long *a, int c, long s) - { - int i; - static long source; - static long *set, temp; - - if (a != (long *)NULL) - { - set = a; - for (i=0; i < c; i++) - *(a + i) = i; - for (i=0; i < c; i++) - { - RANDOM(source, 0L, (long)(c - 1), s); - temp = *(a + source); - *(a + source) = *(a + i) ; - *(a + i) = temp; - source = 0; - } - } - else - source += 1; - - if (source >= c) - source -= c; - - return(set + source); - } - -long * -permute_dist(distribution *d, long stream) - { - static distribution *dist = NULL; - int i; - - if (d != NULL) - { - if (d->permute == (long *)NULL) - { - d->permute = (long *)malloc(sizeof(long) * DIST_SIZE(d)); - MALLOC_CHECK(d->permute); - for (i=0; i < DIST_SIZE(d); i++) - *(d->permute + i) = i; - } - dist = d; - return(permute(dist->permute, DIST_SIZE(dist), stream)); - } - - - if (dist != NULL) - return(permute(NULL, DIST_SIZE(dist), stream)); - else - INTERNAL_ERROR("Bad call to permute_dist"); - } - - -#ifdef TEST - -main(int ac, char *av[]) - { - long *sequence, - i, - j, - streams = UNSET, - *a; - char sep; - int index = 0; - - set_seeds = 0; - sequence = (long *)malloc(MAX_QUERY * sizeof(long)); - a = sequence; - for (i=0; i < MAX_QUERY; i++) - *(sequence + i) = i; - if (ac < 3) - goto usage; - Seed[0].value = (long)atoi(av[1]); - streams = atoi(av[2]); - if (Seed[0].value == UNSET || streams == UNSET) - goto usage; - - index = 0; - printf("long permutation[%d][%d] = {\n", streams, MAX_QUERY); - for (j=0; j < 
streams; j++) - { - sep = '{'; - printf("%s\n", eol[index]); - for (i=0; i < MAX_QUERY; i++) - { - printf("%c%2d", sep, *permute(a, MAX_QUERY, 0) + 1); - a = (long *)NULL; - sep = ','; - } - a = sequence; - index=1; - } - printf("}\n};\n"); - return(0); - -usage: - printf("Usage: %s \n",av[0]); - printf(" uses to start the generation of permutations of [1..%d]\n", MAX_QUERY); - return(-1); - - } -#endif /* TEST */ diff --git a/utils/ssbm/print.c b/utils/ssbm/print.c deleted file mode 100644 index 0c5fadef6..000000000 --- a/utils/ssbm/print.c +++ /dev/null @@ -1,1013 +0,0 @@ -/* @(#)print.c 2.1.8.2 */ -/* generate flat files for data load */ -#include -#ifndef VMS -#include -#endif - -#if defined(SUN) -#include -#endif - -#if defined(LINUX) -#include -#endif /*LINUX*/ - -#include - -#include "dss.h" -#include "dsstypes.h" -#include - -#include -#include - -/* option */ -int print_to_stdout = 0; - -/* - * Function Prototypes - */ -FILE *print_prep PROTO((int table, int update)); -int pr_drange PROTO((int tbl, long min, long cnt, long num)); - -FILE * -print_prep(int table, int update) -{ - char upath[128]; - FILE *res; - - if (updates) - { - if (update > 0) /* updates */ - if ( insert_segments ) - { - int this_segment; - if(strcmp(tdefs[table].name,"orders.tbl")) - this_segment=++insert_orders_segment; - else - this_segment=++insert_lineitem_segment; - sprintf(upath, "%s%c%s.u%d.%d", - env_config(PATH_TAG, PATH_DFLT), - PATH_SEP, tdefs[table].name, update%10000,this_segment); - } - else - { - sprintf(upath, "%s%c%s.u%d", - env_config(PATH_TAG, PATH_DFLT), - PATH_SEP, tdefs[table].name, update); - } - else /* deletes */ - if ( delete_segments ) - { - ++delete_segment; - sprintf(upath, "%s%cdelete.u%d.%d", - env_config(PATH_TAG, PATH_DFLT), PATH_SEP, -update%10000, - delete_segment); - } - else - { - sprintf(upath, "%s%cdelete.%d", - env_config(PATH_TAG, PATH_DFLT), PATH_SEP, -update); - } - return(fopen(upath, "w")); - } - if (print_to_stdout) - res = stdout; - else - { - res = tbl_open(table, "w"); - OPEN_CHECK(res, tdefs[table].name); - } - return(res); -} - -int -dbg_print(int format, FILE *target, long data, int len, int sep) -{ - int dollars, - cents; - - switch(format) - { - case DT_STR: - if (columnar) - fprintf(target, "%-*s", len, (char *)data); - else - fprintf(target, "%s", (char *)data); - break; -#ifdef MVS - case DT_VSTR: - /* note: only used in MVS, assumes columnar output */ - fprintf(target, "%c%c%-*s", - (len >> 8) & 0xFF, len & 0xFF, len, (char *)data); - break; -#endif /* MVS */ - case DT_INT: - if (columnar) - fprintf(target, "%12ld", (long)data); - else - fprintf(target, "%ld", (long)data); - break; - case DT_HUGE: -#ifndef SUPPORT_64BITS - if (*(long *)((long *)data + 1) == 0) \ - if (columnar) fprintf(target, "%12ld", *(long *)data); - else fprintf(target, "%ld", *(long *)data); - else - if (columnar) fprintf(target, "%5ld%07ld", - *(long *)((long *)data + 1), *(long *)data); - else fprintf(target,"%ld%07ld", - *(long *)((long *)data + 1), *(long *)data); -#else - fprintf(target, HUGE_FORMAT, *(DSS_HUGE *)data); -#endif /* SUPPORT_64BITS */ - break; - case DT_KEY: - fprintf(target, "%ld", (long)data); - break; - case DT_MONEY: - cents = (long)data; - if (cents < 0) - { - fprintf(target, "-"); - cents = -cents; - } - dollars = cents / 100; - cents %= 100; - if (columnar) - fprintf(target, "%12d.%02d", dollars, cents); - else - fprintf(target, "%d.%02d", dollars, cents); - break; - case DT_CHR: - if (columnar) - fprintf(target, "%c ", (int)data); - else - fprintf(target, 
"%c", (int)data); - break; - } - -#ifdef EOL_HANDLING - if (sep) -#endif /* EOL_HANDLING */ - if (!columnar && (sep != -1)) - fprintf(target, "%c", SEPARATOR); - - return(0); -} - -#ifdef SSBM -int -pr_cust(customer_t *c, int mode) -{ -static FILE *fp = NULL; - - if (fp == NULL) - fp = print_prep(CUST, 0); - - PR_STRT(fp); - PR_INT(fp, c->custkey); - PR_VSTR(fp, c->name, C_NAME_LEN); - PR_VSTR(fp, c->address, - (columnar)?(long)(ceil(C_ADDR_LEN * V_STR_HGH)):c->alen); - PR_STR(fp, c->city,CITY_FIX); - PR_STR(fp, c->nation_name, C_NATION_NAME_LEN); - PR_STR(fp, c->region_name, C_REGION_NAME_LEN); - PR_STR(fp, c->phone, PHONE_LEN); - PR_STR_LAST(fp, c->mktsegment,MAXAGG_LEN); - PR_END(fp); - - return(0); -} - -#else -int -pr_cust(customer_t *c, int mode) -{ -static FILE *fp = NULL; - - if (fp == NULL) - fp = print_prep(CUST, 0); - - PR_STRT(fp); - PR_INT(fp, c->custkey); - PR_VSTR(fp, c->name, C_NAME_LEN); - PR_VSTR(fp, c->address, - (columnar)?(long)(ceil(C_ADDR_LEN * V_STR_HGH)):c->alen); - PR_INT(fp, c->nation_code); - PR_STR(fp, c->phone, PHONE_LEN); - PR_MONEY(fp, c->acctbal); - PR_STR(fp, c->mktsegment, C_MSEG_LEN); - PR_VSTR_LAST(fp, c->comment, - (columnar)?(long)(ceil(C_CMNT_LEN * V_STR_HGH)):c->clen); - PR_END(fp); - - return(0); -} -#endif - -/* - * print the numbered order - */ -#ifdef SSBM - -#else -int -pr_order(order_t *o, int mode) -{ - static FILE *fp_o = NULL; - static int last_mode = 0; - - if (fp_o == NULL || mode != last_mode) - { - if (fp_o) - fclose(fp_o); - fp_o = print_prep(ORDER, mode); - last_mode = mode; - } - PR_STRT(fp_o); - PR_HUGE(fp_o, o->okey); - PR_INT(fp_o, o->custkey); - PR_CHR(fp_o, o->orderstatus); - PR_MONEY(fp_o, o->totalprice); - PR_STR(fp_o, o->odate, DATE_LEN); - PR_STR(fp_o, o->opriority, O_OPRIO_LEN); - PR_STR(fp_o, o->clerk, O_CLRK_LEN); - PR_INT(fp_o, o->spriority); - PR_VSTR_LAST(fp_o, o->comment, - (columnar)?(long)(ceil(O_CMNT_LEN * V_STR_HGH)):o->clen); - PR_END(fp_o); - - return(0); -} -#endif - -/* - * print an order's lineitems - */ -#ifdef SSBM -int -pr_line(order_t *o, int mode) -{ - - static FILE *fp_l = NULL; - static int last_mode = 0; - long i; - int days; - char buf[100]; - - if (fp_l == NULL || mode != last_mode) - { - if (fp_l) - fclose(fp_l); - fp_l = print_prep(LINE, mode); - last_mode = mode; - } - - for (i = 0; i < o->lines; i++) - { - PR_STRT(fp_l); - PR_HUGE(fp_l, o->lineorders[i].okey); - PR_INT(fp_l, o->lineorders[i].linenumber); - PR_INT(fp_l, o->lineorders[i].custkey); - PR_INT(fp_l, o->lineorders[i].partkey); - PR_INT(fp_l, o->lineorders[i].suppkey); - PR_STR(fp_l, o->lineorders[i].orderdate, DATE_LEN); - PR_STR(fp_l, o->lineorders[i].opriority, O_OPRIO_LEN); - PR_INT(fp_l, o->lineorders[i].ship_priority); - PR_INT(fp_l, o->lineorders[i].quantity); - PR_INT(fp_l, o->lineorders[i].extended_price); - PR_INT(fp_l, o->lineorders[i].order_totalprice); - PR_INT(fp_l, o->lineorders[i].discount); - PR_INT(fp_l, o->lineorders[i].revenue); - PR_INT(fp_l, o->lineorders[i].supp_cost); - PR_INT(fp_l, o->lineorders[i].tax); - PR_STR(fp_l, o->lineorders[i].commit_date, DATE_LEN); - PR_STR_LAST(fp_l, o->lineorders[i].shipmode, O_SHIP_MODE_LEN); - PR_END(fp_l); - } - - return(0); -} -#else -int -pr_line(order_t *o, int mode) -{ - static FILE *fp_l = NULL; - static int last_mode = 0; - long i; - int days; - char buf[100]; - - if (fp_l == NULL || mode != last_mode) - { - if (fp_l) - fclose(fp_l); - fp_l = print_prep(LINE, mode); - last_mode = mode; - } - - for (i = 0; i < o->lines; i++) - { - PR_STRT(fp_l); - PR_HUGE(fp_l, 
o->l[i].okey); - PR_INT(fp_l, o->l[i].partkey); - PR_INT(fp_l, o->l[i].suppkey); - PR_INT(fp_l, o->l[i].lcnt); - PR_INT(fp_l, o->l[i].quantity); - PR_MONEY(fp_l, o->l[i].eprice); - PR_MONEY(fp_l, o->l[i].discount); - PR_MONEY(fp_l, o->l[i].tax); - PR_CHR(fp_l, o->l[i].rflag[0]); - PR_CHR(fp_l, o->l[i].lstatus[0]); - PR_STR(fp_l, o->l[i].sdate, DATE_LEN); - PR_STR(fp_l, o->l[i].cdate, DATE_LEN); - PR_STR(fp_l, o->l[i].rdate, DATE_LEN); - PR_STR(fp_l, o->l[i].shipinstruct, L_INST_LEN); - PR_STR(fp_l, o->l[i].shipmode, L_SMODE_LEN); - PR_VSTR_LAST(fp_l, o->l[i].comment, - (columnar)?(long)(ceil(L_CMNT_LEN * - V_STR_HGH)):o->l[i].clen); - PR_END(fp_l); - } - - return(0); -} -#endif - -/* - * print the numbered order *and* its associated lineitems - */ -#ifdef SSBM -#else -int -pr_order_line(order_t *o, int mode) -{ - tdefs[ORDER].name = tdefs[ORDER_LINE].name; - pr_order(o, mode); - pr_line(o, mode); - - return(0); -} -#endif - -/* - * print the given part - */ -#ifdef SSBM -int -pr_part(part_t *part, int mode) -{ - static FILE *p_fp = NULL; - - if (p_fp == NULL) - p_fp = print_prep(PART, 0); - - PR_STRT(p_fp); - PR_INT(p_fp, part->partkey); - PR_VSTR(p_fp, part->name, - (columnar)?(long)P_NAME_LEN:part->nlen); - PR_STR(p_fp, part->mfgr, P_MFG_LEN); - PR_STR(p_fp, part->category, P_CAT_LEN); - PR_STR(p_fp, part->brand, P_BRND_LEN); - - /*need to handle color*/ - PR_VSTR(p_fp, part->color,(columnar)?(long)P_COLOR_LEN:part->clen); - PR_VSTR(p_fp, part->type, - (columnar)?(long)P_TYPE_LEN:part->tlen); - PR_INT(p_fp, part->size); - PR_STR_LAST(p_fp, part->container, P_CNTR_LEN); - PR_END(p_fp); - return(0); -} - -#else -int -pr_part(part_t *part, int mode) -{ -static FILE *p_fp = NULL; - - if (p_fp == NULL) - p_fp = print_prep(PART, 0); - - PR_STRT(p_fp); - PR_INT(p_fp, part->partkey); - PR_VSTR(p_fp, part->name, - (columnar)?(long)P_NAME_LEN:part->nlen); - PR_STR(p_fp, part->mfgr, P_MFG_LEN); - PR_STR(p_fp, part->brand, P_BRND_LEN); - PR_VSTR(p_fp, part->type, - (columnar)?(long)P_TYPE_LEN:part->tlen); - PR_INT(p_fp, part->size); - PR_STR(p_fp, part->container, P_CNTR_LEN); - PR_MONEY(p_fp, part->retailprice); - PR_VSTR_LAST(p_fp, part->comment, - (columnar)?(long)(ceil(P_CMNT_LEN * V_STR_HGH)):part->clen); - PR_END(p_fp); - - return(0); -} -#endif - -/* - * print the given part's suppliers - */ -#ifdef SSBM -/*SSBM don't have partsupplier table*/ -#else -int -pr_psupp(part_t *part, int mode) -{ - static FILE *ps_fp = NULL; - long i; - - if (ps_fp == NULL) - ps_fp = print_prep(PSUPP, mode); - - for (i = 0; i < SUPP_PER_PART; i++) - { - PR_STRT(ps_fp); - PR_INT(ps_fp, part->s[i].partkey); - PR_INT(ps_fp, part->s[i].suppkey); - PR_INT(ps_fp, part->s[i].qty); - PR_MONEY(ps_fp, part->s[i].scost); - PR_VSTR_LAST(ps_fp, part->s[i].comment, - (columnar)?(long)(ceil(PS_CMNT_LEN * V_STR_HGH)):part->s[i].clen); - PR_END(ps_fp); - } - - return(0); -} -#endif - -/* - * print the given part *and* its suppliers - */ -#ifdef SSBM -/*SSBM don't have partsupplier table*/ -#else -int -pr_part_psupp(part_t *part, int mode) -{ - tdefs[PART].name = tdefs[PART_PSUPP].name; - pr_part(part, mode); - pr_psupp(part, mode); - - return(0); -} -#endif - - -#ifdef SSBM -int -pr_supp(supplier_t *supp, int mode) -{ - static FILE *fp = NULL; - - if (fp == NULL) - fp = print_prep(SUPP, mode); - - PR_STRT(fp); - PR_INT(fp, supp->suppkey); - PR_STR(fp, supp->name, S_NAME_LEN); - - PR_VSTR(fp, supp->address, - (columnar)?(long)(ceil(S_ADDR_LEN * V_STR_HGH)):supp->alen); - PR_STR(fp, supp->city, CITY_FIX); - PR_STR(fp, 
supp->nation_name, C_NATION_NAME_LEN); - PR_STR(fp, supp->region_name, C_REGION_NAME_LEN); - PR_STR_LAST(fp, supp->phone, PHONE_LEN); - PR_END(fp); - - return(0); -} -#else -int -pr_supp(supplier_t *supp, int mode) -{ -static FILE *fp = NULL; - - if (fp == NULL) - fp = print_prep(SUPP, mode); - - PR_STRT(fp); - PR_INT(fp, supp->suppkey); - PR_STR(fp, supp->name, S_NAME_LEN); - PR_VSTR(fp, supp->address, - (columnar)?(long)(ceil(S_ADDR_LEN * V_STR_HGH)):supp->alen); - PR_INT(fp, supp->nation_code); - PR_STR(fp, supp->phone, PHONE_LEN); - PR_MONEY(fp, supp->acctbal); - PR_VSTR_LAST(fp, supp->comment, - (columnar)?(long)(ceil(S_CMNT_LEN * V_STR_HGH)):supp->clen); - PR_END(fp); - - return(0); -} -#endif - -#ifdef SSBM -#else -int -pr_nation(code_t *c, int mode) -{ -static FILE *fp = NULL; - - if (fp == NULL) - fp = print_prep(NATION, mode); - - PR_STRT(fp); - PR_INT(fp, c->code); - PR_STR(fp, c->text, NATION_LEN); - PR_INT(fp, c->join); - PR_VSTR_LAST(fp, c->comment, - (columnar)?(long)(ceil(N_CMNT_LEN * V_STR_HGH)):c->clen); - PR_END(fp); - - return(0); -} - -int -pr_region(code_t *c, int mode) -{ -static FILE *fp = NULL; - - if (fp == NULL) - fp = print_prep(REGION, mode); - - PR_STRT(fp); - PR_INT(fp, c->code); - PR_STR(fp, c->text, REGION_LEN); - PR_VSTR_LAST(fp, c->comment, - (columnar)?(long)(ceil(R_CMNT_LEN * V_STR_HGH)):c->clen); - PR_END(fp); - - return(0); -} -#endif - -/* - * NOTE: this routine does NOT use the BCD2_* routines. As a result, - * it WILL fail if the keys being deleted exceed 32 bits. Since this - * would require ~660 update iterations, this seems an acceptable - * oversight - */ -int -pr_drange(int tbl, long min, long cnt, long num) -{ - static int last_num = 0; - static FILE *dfp = NULL; - int child = -1; - long start, last, new; - - static int rows_per_segment=0; - static int rows_this_segment=0; - static int residual_rows=0; - - if (last_num != num) - { - if (dfp) - fclose(dfp); - dfp = print_prep(tbl, -num); - if (dfp == NULL) - return(-1); - last_num = num; - rows_this_segment=0; - } - - start = MK_SPARSE(min, (num - 1)/ (10000 / refresh)); - last = start - 1; - for (child=min; cnt > 0; child++, cnt--) - { - new = MK_SPARSE(child, (num - 1) / (10000 / refresh)); - if (gen_rng == 1 && new - last == 1) - { - last = new; - continue; - } - if (gen_sql) - { - fprintf(dfp, - "delete from %s where %s between %ld and %ld;\n", - tdefs[ORDER].name, "o_orderkey", start, last); - fprintf(dfp, - "delete from %s where %s between %ld and %ld;\n", - tdefs[LINE].name, "l_orderkey", start, last); - fprintf(dfp, "commit work;\n"); - } - else - if (gen_rng) - { - PR_STRT(dfp); - PR_INT(dfp, start); - PR_INT(dfp, last); - PR_END(dfp); - } - else - { - if (delete_segments) - { - if(rows_per_segment==0) - { - rows_per_segment = (cnt / delete_segments); - residual_rows = (cnt % delete_segments); - rows_per_segment++; - } - if(delete_segment <= residual_rows) - { - if((++rows_this_segment) > rows_per_segment) - { - fclose(dfp); - dfp = print_prep(tbl, -num); - if (dfp == NULL) return(-1); - last_num = num; - rows_this_segment=1; - } - } - else - { - if((++rows_this_segment) >= rows_per_segment) - { - fclose(dfp); - dfp = print_prep(tbl, -num); - if (dfp == NULL) return(-1); - last_num = num; - rows_this_segment=1; - } - } - } - PR_STRT(dfp); - PR_KEY(dfp, new); - PR_END(dfp); - } - start = new; - last = new; - } - if (gen_rng) - { - PR_STRT(dfp); - PR_INT(dfp, start); - PR_INT(dfp, last); - PR_END(dfp); - } - - return(0); -} - -#ifdef SSBM -int pr_date(date_t *d, int mode){ - static FILE 
*d_fp = NULL; - - if (d_fp == NULL) - d_fp = print_prep(DATE, 0); - - PR_STRT(d_fp); - PR_INT(d_fp, d->datekey); - PR_STR(d_fp, d->date,D_DATE_LEN); - PR_STR(d_fp, d->dayofweek,D_DAYWEEK_LEN); - PR_STR(d_fp, d->month,D_MONTH_LEN); - PR_INT(d_fp, d->year); - PR_INT(d_fp, d->yearmonthnum); - PR_STR(d_fp, d->yearmonth,D_YEARMONTH_LEN); - PR_INT(d_fp, d->daynuminweek); - PR_INT(d_fp, d->daynuminmonth); - PR_INT(d_fp, d->daynuminyear); - PR_INT(d_fp, d->monthnuminyear); - PR_INT(d_fp, d->weeknuminyear); - PR_VSTR(d_fp, - d->sellingseason,(columnar)?(long)D_SEASON_LEN:d->slen); - PR_STR(d_fp,d->lastdayinweekfl,2); - PR_STR(d_fp,d->lastdayinmonthfl,2); - PR_STR(d_fp,d->holidayfl,2); - PR_STR_LAST(d_fp,d->weekdayfl,2); - - PR_END(d_fp); - return(0); - -} - -#endif -/* - * verify functions: routines which replace the pr_routines and generate a pseudo checksum - * instead of generating the actual contents of the tables. Meant to allow large scale data - * validation without requiring a large amount of storage - */ -#ifdef SSBM -int -vrf_cust(customer_t *c, int mode) -{ - VRF_STRT(CUST); - VRF_INT(CUST, c->custkey); - VRF_STR(CUST, c->name); - VRF_STR(CUST, c->address); - VRF_STR(CUST, c->city); - VRF_STR(CUST, c->nation_name); - VRF_STR(CUST, c->region_name); - VRF_STR(CUST, c->phone); - VRF_STR(CUST, c->mktsegment); - VRF_END(CUST); - - return(0); -} - -#else -int -vrf_cust(customer_t *c, int mode) -{ - VRF_STRT(CUST); - VRF_INT(CUST, c->custkey); - VRF_STR(CUST, c->name); - VRF_STR(CUST, c->address); - VRF_INT(CUST, c->nation_code); - VRF_STR(CUST, c->phone); - VRF_MONEY(CUST, c->acctbal); - VRF_STR(CUST, c->mktsegment); - VRF_STR(CUST, c->comment); - VRF_END(CUST); - - return(0); -} -#endif - -/* - * print the numbered order - */ -#ifdef SSBM -#else -int -vrf_order(order_t *o, int mode) -{ - VRF_STRT(ORDER); - VRF_HUGE(ORDER, o->okey); - VRF_INT(ORDER, o->custkey); - VRF_CHR(ORDER, o->orderstatus); - VRF_MONEY(ORDER, o->totalprice); - VRF_STR(ORDER, o->odate); - VRF_STR(ORDER, o->opriority); - VRF_STR(ORDER, o->clerk); - VRF_INT(ORDER, o->spriority); - VRF_STR(ORDER, o->comment); - VRF_END(ORDER); - - return(0); -} -#endif - -/* - * print an order's lineitems - */ -#ifdef SSBM -int -vrf_line(order_t *o, int mode) -{ - int i; - - for (i = 0; i < o->lines; i++) - { - VRF_STRT(LINE); - VRF_HUGE(LINE, o->lineorders[i].okey); - VRF_INT(LINE, o->lineorders[i].linenumber); - VRF_INT(LINE, o->lineorders[i].custkey); - VRF_INT(LINE, o->lineorders[i].partkey); - VRF_INT(LINE, o->lineorders[i].suppkey); - VRF_STR(LINE, o->lineorders[i].orderdate); - VRF_STR(LINE, o->lineorders[i].opriority); - VRF_INT(LINE, o->lineorders[i].ship_priority); - VRF_INT(LINE, o->lineorders[i].quantity); - VRF_INT(LINE, o->lineorders[i].extended_price); - VRF_INT(LINE, o->lineorders[i].order_totalprice); - VRF_INT(LINE, o->lineorders[i].discount); - VRF_INT(LINE, o->lineorders[i].revenue); - VRF_INT(LINE, o->lineorders[i].supp_cost); - VRF_INT(LINE, o->lineorders[i].tax); - VRF_STR(LINE, o->lineorders[i].commit_date); - VRF_STR(LINE, o->lineorders[i].shipmode); - VRF_END(LINE); - } - - return(0); -} - -#else -int -vrf_line(order_t *o, int mode) -{ - int i; - - for (i = 0; i < o->lines; i++) - { - VRF_STRT(LINE); - VRF_HUGE(LINE, o->l[i].okey); - VRF_INT(LINE, o->l[i].partkey); - VRF_INT(LINE, o->l[i].suppkey); - VRF_INT(LINE, o->l[i].lcnt); - VRF_INT(LINE, o->l[i].quantity); - VRF_MONEY(LINE, o->l[i].eprice); - VRF_MONEY(LINE, o->l[i].discount); - VRF_MONEY(LINE, o->l[i].tax); - VRF_CHR(LINE, o->l[i].rflag[0]); - VRF_CHR(LINE, 
o->l[i].lstatus[0]); - VRF_STR(LINE, o->l[i].sdate); - VRF_STR(LINE, o->l[i].cdate); - VRF_STR(LINE, o->l[i].rdate); - VRF_STR(LINE, o->l[i].shipinstruct); - VRF_STR(LINE, o->l[i].shipmode); - VRF_STR(LINE, o->l[i].comment); - VRF_END(LINE); - } - - return(0); -} -#endif - -/* - * print the numbered order *and* its associated lineitems - */ -#ifdef SSBM -#else -int -vrf_order_line(order_t *o, int mode) -{ - vrf_order(o, mode); - vrf_line(o, mode); - - return(0); -} -#endif - -/* - * print the given part - */ -#ifdef SSBM -int -vrf_part(part_t *part, int mode) -{ - - VRF_STRT(PART); - VRF_INT(PART, part->partkey); - VRF_STR(PART, part->name); - VRF_STR(PART, part->mfgr); - VRF_STR(PART, part->brand); - VRF_STR(PART, part->type); - VRF_INT(PART, part->size); - VRF_STR(PART, part->container); - VRF_STR(PART, part->category); - VRF_END(PART); - - return(0); -} - -#else -int -vrf_part(part_t *part, int mode) -{ - - VRF_STRT(PART); - VRF_INT(PART, part->partkey); - VRF_STR(PART, part->name); - VRF_STR(PART, part->mfgr); - VRF_STR(PART, part->brand); - VRF_STR(PART, part->type); - VRF_INT(PART, part->size); - VRF_STR(PART, part->container); - VRF_MONEY(PART, part->retailprice); - VRF_STR(PART, part->comment); - VRF_END(PART); - - return(0); -} -#endif - -/* - * print the given part's suppliers - */ -#ifdef SSBM -#else -int -vrf_psupp(part_t *part, int mode) -{ - long i; - - for (i = 0; i < SUPP_PER_PART; i++) - { - VRF_STRT(PSUPP); - VRF_INT(PSUPP, part->s[i].partkey); - VRF_INT(PSUPP, part->s[i].suppkey); - VRF_INT(PSUPP, part->s[i].qty); - VRF_MONEY(PSUPP, part->s[i].scost); - VRF_STR(PSUPP, part->s[i].comment); - VRF_END(PSUPP); - } - - return(0); -} -#endif - -/* - * print the given part *and* its suppliers - */ -#ifdef SSBM -#else -int -vrf_part_psupp(part_t *part, int mode) -{ - vrf_part(part, mode); - vrf_psupp(part, mode); - - return(0); -} -#endif - -#ifdef SSBM -int -vrf_supp(supplier_t *supp, int mode) -{ - VRF_STRT(SUPP); - VRF_INT(SUPP, supp->suppkey); - VRF_STR(SUPP, supp->name); - - VRF_STR(CUST, supp->address); - VRF_INT(CUST, supp->nation_key); - VRF_STR(CUST, supp->nation_name); - VRF_INT(CUST, supp->region_key); - VRF_STR(CUST, supp->region_name); - VRF_STR(CUST, supp->phone); - VRF_END(SUPP); - - return(0); -} - -#else -int -vrf_supp(supplier_t *supp, int mode) -{ - VRF_STRT(SUPP); - VRF_INT(SUPP, supp->suppkey); - VRF_STR(SUPP, supp->name); - VRF_STR(SUPP, supp->address); - VRF_INT(SUPP, supp->nation_code); - VRF_STR(SUPP, supp->phone); - VRF_MONEY(SUPP, supp->acctbal); - VRF_STR(SUPP, supp->comment); - VRF_END(SUPP); - - return(0); -} -#endif - -#ifdef SSBM -#else -int -vrf_nation(code_t *c, int mode) -{ - VRF_STRT(NATION); - VRF_INT(NATION, c->code); - VRF_STR(NATION, c->text); - VRF_INT(NATION, c->join); - VRF_STR(NATION, c->comment); - VRF_END(NATION); - - return(0); -} - -int -vrf_region(code_t *c, int mode) -{ - VRF_STRT(REGION); - VRF_INT(REGION, c->code); - VRF_STR(REGION, c->text); - VRF_STR(REGION, c->comment); - VRF_END(fp); - - return(0); -} -#endif - - -#ifdef SSBM -int vrf_date(date_t * d, int mode) -{ - VRF_STRT(DATE); - VRF_INT(DATE, d->datekey); - VRF_STR(DATE, d->date); - VRF_STR(DATE, d->dayofweek); - VRF_STR(DATE, d->month); - VRF_INT(DATE, d->year); - VRF_INT(DATE, d->yearmonthnum); - VRF_STR(DATE, d->yearmonth); - VRF_INT(DATE, d->daynuminweek); - VRF_INT(DATE, d->daynuminmonth); - VRF_INT(DATE, d->daynuminyear); - VRF_INT(DATE, d->monthnuminyear); - VRF_INT(DATE, d->weeknuminyear); - VRF_STR(DATE, d->sellingseason); - VRF_STR(DATE, d->lastdayinweekfl); 
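- /*
-  * The VRF_* macros above fold every generated field into
-  * tdefs[DATE].vtotal, so a validation run prints one checksum per
-  * table (see "Validation checksum" in driver.c) instead of writing
-  * flat files. A minimal sketch of the accumulator, assuming a
-  * stand-alone vtotal rather than the tdefs[] slot the real macros use:
-  *
-  *   unsigned long vtotal = 0;
-  *   void vrf_str(const char *s) { while (*s) vtotal += *s++; }
-  *   void vrf_int(long v)        { vtotal += v; }
-  */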
- VRF_STR(DATE, d->lastdayinmonthfl); - VRF_STR(DATE, d->weekdayfl); - VRF_END(DATE); - return(0); - -} -#endif - diff --git a/utils/ssbm/rnd.c b/utils/ssbm/rnd.c deleted file mode 100644 index 4fabf7184..000000000 --- a/utils/ssbm/rnd.c +++ /dev/null @@ -1,262 +0,0 @@ -/* @(#)rnd.c 2.1.8.2 - * - * - * RANDOM.C -- Implements Park & Miller's "Minimum Standard" RNG - * - * (Reference: CACM, Oct 1988, pp 1192-1201) - * - * NextRand: Computes next random integer - * UnifInt: Yields an long uniformly distributed between given bounds - * UnifReal: ields a real uniformly distributed between given bounds - * Exponential: Yields a real exponentially distributed with given mean - * - */ - -#include "config.h" -#include -#include -#include "dss.h" -#include "rnd.h" - -char *env_config PROTO((char *tag, char *dflt)); -void NthElement(long, long *); - -void -dss_random(long *tgt, long lower, long upper, long stream) -{ - *tgt = UnifInt((long)lower, (long)upper, (long)stream); - Seed[stream].usage += 1; - - return; -} - -void -row_start(int t) \ -{ - int i; - for (i=0; i <= MAX_STREAM; i++) - Seed[i].usage = 0 ; - - return; -} - -void -row_stop(int t) \ - { - int i; - - /* need to allow for handling the master and detail together */ - if (t == ORDER_LINE) - t = ORDER; - if (t == PART_PSUPP) - t = PART; - - for (i=0; i <= MAX_STREAM; i++) - if ((Seed[i].table == t) || (Seed[i].table == tdefs[t].child)) - { - if (set_seeds && (Seed[i].usage > Seed[i].boundary)) - { - fprintf(stderr, "\nSEED CHANGE: seed[%d].usage = %ld\n", - i, Seed[i].usage); - Seed[i].boundary = Seed[i].usage; - } - else - { - NthElement((Seed[i].boundary - Seed[i].usage), &Seed[i].value); - } - } - return; - } - -void -dump_seeds(int tbl) -{ - int i; - - for (i=0; i <= MAX_STREAM; i++) - if (Seed[i].table == tbl) - printf("%d:\t%ld\n", i, Seed[i].value); - return; -} - -/****************************************************************** - - NextRand: Computes next random integer - -*******************************************************************/ - -/* - * long NextRand( long nSeed ) - */ -long -NextRand(long nSeed) - -/* - * nSeed is the previous random number; the returned value is the - * next random number. The routine generates all numbers in the - * range 1 .. nM-1. - */ - -{ - - /* - * The routine returns (nSeed * nA) mod nM, where nA (the - * multiplier) is 16807, and nM (the modulus) is - * 2147483647 = 2^31 - 1. - * - * nM is prime and nA is a primitive element of the range 1..nM-1. - * This * means that the map nSeed = (nSeed*nA) mod nM, starting - * from any nSeed in 1..nM-1, runs through all elements of 1..nM-1 - * before repeating. It never hits 0 or nM. - * - * To compute (nSeed * nA) mod nM without overflow, use the - * following trick. Write nM as nQ * nA + nR, where nQ = nM / nA - * and nR = nM % nA. (For nM = 2147483647 and nA = 16807, - * get nQ = 127773 and nR = 2836.) Write nSeed as nU * nQ + nV, - * where nU = nSeed / nQ and nV = nSeed % nQ. Then we have: - * - * nM = nA * nQ + nR nQ = nM / nA nR < nA < nQ - * - * nSeed = nU * nQ + nV nU = nSeed / nQ nV < nU - * - * Since nA < nQ, we have nA*nQ < nM < nA*nQ + nA < nA*nQ + nQ, - * i.e., nM/nQ = nA. This gives bounds on nU and nV as well: - * nM > nSeed => nM/nQ * >= nSeed/nQ => nA >= nU ( > nV ). 
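- *
- * (Concrete instance, added for illustration: nQ = 127773 and
- * nR = 2836; for nSeed = 1000000, nU = 7 and nV = 105589. The
- * algebra below then yields 16807*105589 - 7*2836 = 1774614471,
- * which is exactly (16807 * 1000000) mod 2147483647, computed
- * with no intermediate overflow.)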
- * - * Using ~ to mean "congruent mod nM" this gives: - * - * nA * nSeed ~ nA * (nU*nQ + nV) - * - * ~ nA*nU*nQ + nA*nV - * - * ~ nU * (-nR) + nA*nV (as nA*nQ ~ -nR) - * - * Both products in the last sum can be computed without overflow - * (i.e., both have absolute value < nM) since nU*nR < nA*nQ < nM, - * and nA*nV < nA*nQ < nM. Since the two products have opposite - * sign, their sum lies between -(nM-1) and +(nM-1). If - * non-negative, it is the answer (i.e., it's congruent to - * nA*nSeed and lies between 0 and nM-1). Otherwise adding nM - * yields a number still congruent to nA*nSeed, but now between - * 0 and nM-1, so that's the answer. - */ - - long nU, nV; - - nU = nSeed / nQ; - nV = nSeed - nQ * nU; /* i.e., nV = nSeed % nQ */ - nSeed = nA * nV - nU * nR; - if (nSeed < 0) - nSeed += nM; - return (nSeed); -} - -/****************************************************************** - - UnifInt: Yields an long uniformly distributed between given bounds - -*******************************************************************/ - -/* - * long UnifInt( long nLow, long nHigh, long nStream ) - */ -long -UnifInt(long nLow, long nHigh, long nStream) - -/* - * Returns an integer uniformly distributed between nLow and nHigh, - * including * the endpoints. nStream is the random number stream. - * Stream 0 is used if nStream is not in the range 0..MAX_STREAM. - */ - -{ - double dRange; - long nTemp; - - if (nStream < 0 || nStream > MAX_STREAM) - nStream = 0; - - if (nLow > nHigh) - { - nTemp = nLow; - nLow = nHigh; - nHigh = nTemp; - } - - dRange = DOUBLE_CAST (nHigh - nLow + 1); - Seed[nStream].value = NextRand(Seed[nStream].value); - nTemp = (long) (((double) Seed[nStream].value / dM) * (dRange)); - return (nLow + nTemp); -} - - - -/****************************************************************** - - UnifReal: Yields a real uniformly distributed between given bounds - -*******************************************************************/ - -/* - * double UnifReal( double dLow, double dHigh, long nStream ) - */ -double -UnifReal(double dLow, double dHigh, long nStream) - -/* - * Returns a double uniformly distributed between dLow and dHigh, - * excluding the endpoints. nStream is the random number stream. - * Stream 0 is used if nStream is not in the range 0..MAX_STREAM. - */ - -{ - double dTemp; - - if (nStream < 0 || nStream > MAX_STREAM) - nStream = 0; - if (dLow == dHigh) - return (dLow); - if (dLow > dHigh) - { - dTemp = dLow; - dLow = dHigh; - dHigh = dTemp; - } - Seed[nStream].value = NextRand(Seed[nStream].value); - dTemp = ((double) Seed[nStream].value / dM) * (dHigh - dLow); - return (dLow + dTemp); -} - - - -/******************************************************************% - - Exponential: Yields a real exponentially distributed with given mean - -*******************************************************************/ - -/* - * double Exponential( double dMean, long nStream ) - */ -double -Exponential(double dMean, long nStream) - -/* - * Returns a double uniformly distributed with mean dMean. - * 0.0 is returned iff dMean <= 0.0. nStream is the random number - * stream. Stream 0 is used if nStream is not in the range - * 0..MAX_STREAM. 
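- *
- * Illustration (added here; not in the original comment): the seed is
- * mapped linearly via dLow + (Seed/dM) * (dHigh - dLow), so a seed
- * value of 1073741824 (~2^30) with dLow = 0.0, dHigh = 1.0 returns
- * approximately 0.5.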
- */ - -{ - double dTemp; - - if (nStream < 0 || nStream > MAX_STREAM) - nStream = 0; - if (dMean <= 0.0) - return (0.0); - - Seed[nStream].value = NextRand(Seed[nStream].value); - dTemp = (double) Seed[nStream].value / dM; /* unif between 0..1 */ - return (-dMean * log(1.0 - dTemp)); -} diff --git a/utils/ssbm/rnd.h b/utils/ssbm/rnd.h deleted file mode 100644 index a8e8d36b7..000000000 --- a/utils/ssbm/rnd.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Sccsid: @(#)rnd.h 2.1.8.1 - * - * rnd.h -- header file for use withthe portable random number generator - * provided by Frank Stephens of Unisys - */ - -/* function protypes */ -long NextRand PROTO((long)); -long UnifInt PROTO((long, long, long)); -double UnifReal PROTO((double, double, long)); -double Exponential PROTO((double, long)); - -static long nA = 16807; /* the multiplier */ -static long nM = 2147483647;/* the modulus == 2^31 - 1 */ -static long nQ = 127773; /* the quotient nM / nA */ -static long nR = 2836; /* the remainder nM % nA */ - -static double dM = 2147483647.0; - -/* - * macros to control RNG and assure reproducible multi-stream - * runs without the need for seed files. Keep track of invocations of RNG - * and always round-up to a known per-row boundary. - */ -/* - * preferred solution, but not initializing correctly - */ -#define VSTR_MAX(len) (long)(len / 5 + (len % 5 == 0)?0:1 + 1) -seed_t Seed[MAX_STREAM + 1] = -{ - {PART, 1, 0, 1}, /* P_MFG_SD 0 */ - {PART, 46831694, 0, 1}, /* P_BRND_SD 1 */ - {PART, 1841581359, 0, 1}, /* P_TYPE_SD 2 */ - {PART, 1193163244, 0, 1}, /* P_SIZE_SD 3 */ - {PART, 727633698, 0, 1}, /* P_CNTR_SD 4 */ - {NONE, 933588178, 0, 1}, /* P_RCST_SD 5 UNUSED 2-4-98 */ - {PART, 804159733, 0, RNG_PER_SENT * 3}, /* P_CMNT_SD 6 */ - {PSUPP, 1671059989, 0, SUPP_PER_PART}, /* PS_QTY_SD 7 */ - {PSUPP, 1051288424, 0, SUPP_PER_PART}, /* PS_SCST_SD 8 */ - {PSUPP, 1961692154, 0, SUPP_PER_PART * RNG_PER_SENT * 20}, /* PS_CMNT_SD 9 */ - {ORDER, 1227283347, 0, 1}, /* O_SUPP_SD 10 */ - {ORDER, 1171034773, 0, 1}, /* O_CLRK_SD 11 */ - {ORDER, 276090261, 0, RNG_PER_SENT * 8}, /* O_CMNT_SD 12 */ - {ORDER, 1066728069, 0, 1}, /* O_ODATE_SD 13 */ - {LINE, 209208115, 0, O_LCNT_MAX}, /* L_QTY_SD 14 */ - {LINE, 554590007, 0, O_LCNT_MAX}, /* L_DCNT_SD 15 */ - {LINE, 721958466, 0, O_LCNT_MAX}, /* L_TAX_SD 16 */ - {LINE, 1371272478, 0, O_LCNT_MAX}, /* L_SHIP_SD 17 */ - {LINE, 675466456, 0, O_LCNT_MAX}, /* L_SMODE_SD 18 */ - {LINE, 1808217256, 0, O_LCNT_MAX}, /* L_PKEY_SD 19 */ - {LINE, 2095021727, 0, O_LCNT_MAX}, /* L_SKEY_SD 20 */ - {LINE, 1769349045, 0, O_LCNT_MAX}, /* L_SDTE_SD 21 */ - {LINE, 904914315, 0, O_LCNT_MAX}, /* L_CDTE_SD 22 */ - {LINE, 373135028, 0, O_LCNT_MAX}, /* L_RDTE_SD 23 */ - {LINE, 717419739, 0, O_LCNT_MAX}, /* L_RFLG_SD 24 */ - {LINE, 1095462486, 0, O_LCNT_MAX * RNG_PER_SENT * 5}, /* L_CMNT_SD 25 */ - {CUST, 881155353, 0, 9}, /* C_ADDR_SD 26 */ - {CUST, 1489529863, 0, 1}, /* C_NTRG_SD 27 */ - {CUST, 1521138112, 0, 3}, /* C_PHNE_SD 28 */ - {CUST, 298370230, 0, 1}, /* C_ABAL_SD 29 */ - {CUST, 1140279430, 0, 1}, /* C_MSEG_SD 30 */ - {CUST, 1335826707, 0, RNG_PER_SENT * 12}, /* C_CMNT_SD 31 */ - {SUPP, 706178559, 0, 9}, /* S_ADDR_SD 32 */ - {SUPP, 110356601, 0, 1}, /* S_NTRG_SD 33 */ - {SUPP, 884434366, 0, 3}, /* S_PHNE_SD 34 */ - {SUPP, 962338209, 0, 1}, /* S_ABAL_SD 35 */ - {SUPP, 1341315363, 0, RNG_PER_SENT * 11}, /* S_CMNT_SD 36 */ - {PART, 709314158, 0, 92}, /* P_NAME_SD 37 */ - {ORDER, 591449447, 0, 1}, /* O_PRIO_SD 38 */ - {LINE, 431918286, 0, 1}, /* HVAR_SD 39 */ - {ORDER, 851767375, 0, 1}, /* O_CKEY_SD 40 
*/ - {NATION, 606179079, 0, RNG_PER_SENT * 16}, /* N_CMNT_SD 41 */ - {REGION, 1500869201, 0, RNG_PER_SENT * 16}, /* R_CMNT_SD 42 */ - {ORDER, 1434868289, 0, 1}, /* O_LCNT_SD 43 */ - {SUPP, 263032577, 0, 1}, /* BBB offset 44 */ - {SUPP, 753643799, 0, 1}, /* BBB type 45 */ - {SUPP, 202794285, 0, 1}, /* BBB comment 46 */ - {SUPP, 715851524, 0, 1} /* BBB junk 47 */ -}; diff --git a/utils/ssbm/shared.h b/utils/ssbm/shared.h deleted file mode 100644 index c1c18ce44..000000000 --- a/utils/ssbm/shared.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Sccsid: @(#)shared.h 2.1.8.1 - * Modified for SSBM - */ -#define N_CMNT_LEN 72 -#define N_CMNT_MAX 152 -#define R_CMNT_LEN 72 -#define R_CMNT_MAX 152 -#define MONEY_SCL 0.01 -#define V_STR_HGH 1.6 - -#ifdef SSBM -#define P_NAME_LEN 22 -#define P_MFG_LEN 6 -#define P_COLOR_LEN 3 -#define P_COLOR_MAX 11 -#define P_TYPE_MAX 25 -#define P_CAT_LEN 7 -#define P_CAT_MIN 1 -#define P_CAT_MAX 5 -#define P_CAT_SD 97 -#define S_NATION_NAME_LEN 15 -#define S_REGION_NAME_LEN 12 -#define C_NATION_NAME_LEN 15 -#define C_REGION_NAME_LEN 12 -#define C_NAT_SD 16 -#define C_REG_SD 3 -#define O_SHIP_STRU_LEN 25 -#define O_SHIP_MODE_LEN 10 -#define O_SHIP_PRIO_LEN 1 -#define D_DATE_LEN 18 -#define D_DAYWEEK_LEN 9 -#define D_YEARMONTH_LEN 7 -#define D_SEASON_LEN 12 -#define D_MONTH_LEN 9 -#define D_STARTDATE 694245661 /*corresponding to 1/1/1992 1:1:1*/ -#define NAMTION_BRIEF_LEN 9 -#define CITY_CODE_SEED 15 -#define NUM_DAYS 2556 -#define NUM_SEASONS 5 -#define NUM_HOLIDAYS 10 -#define CITY_FIX 10 -#else - -#define P_NAME_LEN 55 -#define P_MFG_LEN 25 - -#endif - -#define P_BRND_LEN 10 - -#ifdef SSBM -#define P_TYPE_LEN 12 - -#else - -#define P_TYPE_LEN 25 - -#endif - -#define P_CNTR_LEN 10 -#define P_CMNT_LEN 14 -#define P_CMNT_MAX 23 -#define P_CAT_SEED 25 - -#define S_NAME_LEN 25 - -#ifdef SSBM -#define S_ADDR_LEN 15 -#define S_ADDR_MAX 25 -#else - -#define S_ADDR_LEN 25 -#define S_ADDR_MAX 40 -#endif - -#define S_CMNT_LEN 63 -#define S_CMNT_MAX 101 -#define PS_CMNT_LEN 124 -#define PS_CMNT_MAX 199 - -#ifdef SSBM -#define C_NAME_LEN 25 -#define C_MSEG_MIN 1 -#define C_MSEG_MAX 5 -#define C_ADDR_LEN 15 -#define C_ADDR_MAX 25 -#else -#define C_NAME_LEN 18 -#define C_ADDR_LEN 25 -#define C_ADDR_MAX 40 -#endif - -#define C_MSEG_LEN 10 -#define C_CMNT_LEN 73 -#define C_CMNT_MAX 117 - -#ifdef SSBM -#define O_OPRIO_LEN 8 - -#else -#define O_OPRIO_LEN 15 - -#endif - -#define O_CLRK_LEN 15 -#define O_CMNT_LEN 49 -#define O_CMNT_MAX 79 -#define L_CMNT_LEN 27 -#define L_CMNT_MAX 44 -#define L_INST_LEN 25 -#define L_SMODE_LEN 10 -#define T_ALPHA_LEN 10 -#define DATE_LEN 13 /* long enough to hold either date format */ -#define NATION_LEN 25 -#define REGION_LEN 25 -#define PHONE_LEN 15 - -#ifdef SSBM -#define MAXAGG_LEN 10 /* max component length for a agg str */ - -#else -#define MAXAGG_LEN 20 /* max component length for a agg str */ - -#endif - -#define P_CMNT_SD 6 -#define PS_CMNT_SD 9 -#define O_CMNT_SD 12 -#define C_ADDR_SD 26 -#define C_CMNT_SD 31 -#define S_ADDR_SD 32 -#define S_CMNT_SD 36 -#define L_CMNT_SD 25 - - - - - - diff --git a/utils/ssbm/speed_seed.c b/utils/ssbm/speed_seed.c deleted file mode 100644 index 402b7de6b..000000000 --- a/utils/ssbm/speed_seed.c +++ /dev/null @@ -1,325 +0,0 @@ -/* @(#)speed_seed.c 2.1.8.2 */ -#include -#include -#include "dss.h" - -/* _tal long RandSeed = "Random^SeedFromTimestamp" (void); */ - -#define FAKE_V_STR(avg, sd, cnt) \ - ADVANCE_STREAM(sd, \ - (long)(Seed[sd].boundary*cnt)) -#define ADVANCE_STREAM(stream_id, num_calls) \ - 
diff --git a/utils/ssbm/speed_seed.c b/utils/ssbm/speed_seed.c
deleted file mode 100644
index 402b7de6b..000000000
--- a/utils/ssbm/speed_seed.c
+++ /dev/null
@@ -1,325 +0,0 @@
-/* @(#)speed_seed.c 2.1.8.2 */
-#include <stdio.h>
-#include <stdlib.h>
-#include "dss.h"
-
-/* _tal long RandSeed = "Random^SeedFromTimestamp" (void); */
-
-#define FAKE_V_STR(avg, sd, cnt) \
-    ADVANCE_STREAM(sd, \
-    (long)(Seed[sd].boundary*cnt))
-#define ADVANCE_STREAM(stream_id, num_calls) \
-    NthElement(num_calls, &Seed[stream_id].value)
-
-#define MAX_COLOR 92
-long name_bits[MAX_COLOR / BITS_PER_LONG];
-extern seed_t Seed[];
-
-/* WARNING!  This routine assumes the existence of 64-bit               */
-/* integers.  The notation used here- "HUGE" is *not* ANSI standard.    */
-/* Hopefully, you have this extension as well.  If not, use whatever    */
-/* nonstandard trick you need to in order to get 64 bit integers.       */
-/* The book says that this will work if MAXINT for the type you choose  */
-/* is at least 2**46 - 1, so 64 bits is more than you *really* need     */
-
-static DSS_HUGE Multiplier = 16807;   /* or whatever nonstandard */
-static DSS_HUGE Modulus = 2147483647; /* trick you use to get 64 bit int */
-
-/* Advances value of Seed after N applications of the random number generator
-   with multiplier Mult and given Modulus.
-   NthElement(Seed[],count);
-
-   Theory:  We are using a generator of the form
-        X_n = [Mult * X_(n-1)] mod Modulus.  It turns out that
-        X_n = [(Mult ** n) X_0] mod Modulus.
-   This can be computed using a divide-and-conquer technique, see
-   the code below.
-
-   In words, this means that if you want the value of the Seed after n
-   applications of the generator, you multiply the initial value of the
-   Seed by the "super multiplier" which is the basic multiplier raised
-   to the nth power, and then take mod Modulus.
-*/
-
-/* Nth Element of sequence starting with StartSeed */
-/* Warning, needs 64-bit integers */
-#ifdef SUPPORT_64BITS
-void NthElement (long N, long *StartSeed)
-{
-    DSS_HUGE   Z;
-    DSS_HUGE   Mult;
-    static int ln = -1;
-    int        i;
-
-    if ((verbose > 0) && ++ln % 1000 == 0)
-    {
-        i = ln % LN_CNT;
-        fprintf(stderr, "%c\b", lnoise[i]);
-    }
-    Mult = Multiplier;
-    Z = (DSS_HUGE) *StartSeed;
-    while (N > 0)
-    {
-        if (N % 2 != 0)   /* testing for oddness, this seems portable */
-            Z = (Mult * Z) % Modulus;
-        N = N / 2;        /* integer division, truncates */
-        Mult = (Mult * Mult) % Modulus;
-    }
-    *StartSeed = (long)Z;
-
-    return;
-}
-#else
-/* add 32 bit version of NthElement HERE */
-/*
- * MODMULT.C
- * R. M. Shelton -- Unisys
- * July 26, 1995
- *
- * RND_seed:    Computes the nth seed in the total sequence
- * RND_shift:   Shifts a random number by a given number of seeds
- * RND_ModMult: Multiplies two numbers mod (2^31 - 1)
- *
- */
-
-
-
-#include <math.h>
-#include <stdio.h>  /* required only for F_FatalError */
-
-typedef signed long RND;
-typedef unsigned long URND;
-
-#define FatalError(e) F_FatalError( (e), __FILE__, __LINE__ )
-void F_FatalError( int x, char *y, int z ) { fprintf(stderr, "Bang!\n"); }
-
-
-/* Prototypes */
-RND RND_seed( RND );
-RND RND_shift( RND, RND );
-static RND RND_ModMult( RND, RND );
-
-
-
-RND
-RND_seed ( RND Order )
-{
-    static const RND TopMask = 0x40000000;
-    RND Mask;
-    RND Result;
-
-
-    if (Order <= -Modulus || Order >= Modulus)
-        FatalError(1023);
-
-    if (Order < 0) Order = Modulus - 1L + Order;
-
-    Mask = TopMask;
-    Result = 1L;
-
-    while (Mask > Order) Mask >>= 1;
-
-    while (Mask > 0)
-    {
-        if (Mask & Order)
-        {
-            Result = RND_ModMult( Result, Result );
-            Result = RND_ModMult( Result, Multiplier );
-        }
-        else
-        {
-            Result = RND_ModMult( Result, Result );
-        }
-        Mask >>= 1;
-    }
-
-    return (Result);
-
-} /* RND_seed */
-
-
-
-/***********************************************************************
-
-  RND_shift:  Shifts a random number by a given number of seeds
-
-***********************************************************************/
-
-void
-NthElement ( long Shift, long *Seed)
-
-{
-    RND Power;
-    static int ln = -1;
-    int i;
-
-    if ((verbose > 0) && ++ln % 100 == 0)
-    {
-        i = (ln/100) % LN_CNT;
-        fprintf(stderr, "%c\b", lnoise[i]);
-    }
-
-
-    if (*Seed <= 0 || *Seed >= Modulus)
-        FatalError(1023);
-    if (Shift <= -Modulus || Shift >= Modulus)
-        FatalError(1023);
-
-    Power = RND_seed( Shift );
-
-    *Seed = RND_ModMult( *Seed, Power );
-
-    return;
-} /* RND_shift */
-
-
-
-/*********************************************************************
-
-  RND_ModMult:  Multiplies two numbers mod (2^31 - 1)
-
-*********************************************************************/
-
-static RND
-RND_ModMult ( RND nA, RND nB)
-
-{
-
-    static const double dTwoPowPlus31  = 2147483648.;
-    static const double dTwoPowMinus31 = 1./2147483648.;
-    static const double dTwoPowPlus15  = 32768.;
-    static const double dTwoPowMinus15 = 1./32768.;
-    static const RND  nLowMask = 0xFFFFL;
-    static const URND ulBit31 = 1uL << 31;
-
-    double dAH, dAL, dX, dY, dZ, dW;
-    RND nH, nL;
-    URND ulP, ulQ, ulResult;
-
-    nL = nB & nLowMask;
-    nH = (nB - nL) >> 16;
-    dAH = (double)nA * (double)nH;
-    dAL = (double)nA * (double)nL;
-    dX = floor( dAH * dTwoPowMinus15 );
-    dY = dAH - dX*dTwoPowPlus15;
-    dZ = floor( dAL * dTwoPowMinus31 );
-    dW = dAL - dZ*dTwoPowPlus31;
-
-    ulQ = (URND)dW + ((URND)dY << 16);
-    ulP = (URND)dX + (URND)dZ;
-    if (ulQ & ulBit31) { ulQ -= ulBit31; ulP++; }
-
-    ulResult = ulP + ulQ;
-    if (ulResult & ulBit31) { ulResult -= ulBit31; ulResult++; }
-
-    return (RND)ulResult;
-}
-#endif /* SUPPORT_64BITS */
-
-/* updates Seed[column] using the a_rnd algorithm */
-void
-fake_a_rnd(int min, int max, int column)
-{
-    long len, itcount;
-    RANDOM(len, (long)min, (long)max, (long)column);
-    if (len % 5L == 0)
-        itcount = len/5;
-    else itcount = len/5 + 1L;
-    NthElement(itcount, &Seed[column].usage);
-    return;
-}
-
-
-long
-sd_part(int child, long skip_count)
-{
-    int i;
-
-    for (i = P_MFG_SD; i <= P_CNTR_SD; i++)
-        ADVANCE_STREAM(i, skip_count);
-
-    FAKE_V_STR(P_CMNT_LEN, P_CMNT_SD, skip_count);
-    ADVANCE_STREAM(P_NAME_SD, skip_count * 92);
-
-    return(0L);
-}
-
-long
-sd_line(int child, long skip_count)
-{
-    int i, j;
-
-    for (j = 0; j < O_LCNT_MAX; j++)
-    {
-        for (i = L_QTY_SD; i <= L_RFLG_SD; i++)
-            ADVANCE_STREAM(i, skip_count);
-    }
-
-    FAKE_V_STR(L_CMNT_LEN, L_CMNT_SD, skip_count);
-    /* need to special case this as the link between master and detail */
-    if (child == 1)
-    {
-        ADVANCE_STREAM(O_ODATE_SD, skip_count);
-        ADVANCE_STREAM(O_LCNT_SD, skip_count);
-    }
-
-    return(0L);
-}
-
-long
-sd_order(int child, long skip_count)
-{
-    ADVANCE_STREAM(O_LCNT_SD, skip_count);
-    ADVANCE_STREAM(O_CKEY_SD, skip_count);
-    FAKE_V_STR(O_CMNT_LEN, O_CMNT_SD, skip_count);
-    ADVANCE_STREAM(O_SUPP_SD, skip_count);
-    ADVANCE_STREAM(O_CLRK_SD, skip_count);
-    ADVANCE_STREAM(O_PRIO_SD, skip_count);
-    ADVANCE_STREAM(O_ODATE_SD, skip_count);
-
-    return (0L);
-}
-
-long
-sd_psupp(int child, long skip_count)
-{
-    int j;
-
-    for (j = 0; j < SUPP_PER_PART; j++)
-    {
-        ADVANCE_STREAM(PS_QTY_SD, skip_count);
-        ADVANCE_STREAM(PS_SCST_SD, skip_count);
-    }
-    FAKE_V_STR(PS_CMNT_LEN, PS_CMNT_SD, skip_count);
-
-    return(0L);
-}
-
-long
-sd_cust(int child, long skip_count)
-{
-
-    FAKE_V_STR(C_ADDR_LEN, C_ADDR_SD, skip_count);
-    FAKE_V_STR(C_CMNT_LEN, C_CMNT_SD, skip_count);
-    ADVANCE_STREAM(C_NTRG_SD, skip_count);
-    ADVANCE_STREAM(C_PHNE_SD, 3L * skip_count);
-    ADVANCE_STREAM(C_ABAL_SD, skip_count);
-    ADVANCE_STREAM(C_MSEG_SD, skip_count);
-    return(0L);
-}
-
-long
-sd_supp(int child, long skip_count)
-{
-    ADVANCE_STREAM(S_NTRG_SD, skip_count);
-    ADVANCE_STREAM(S_PHNE_SD, 3L * skip_count);
-    ADVANCE_STREAM(S_ABAL_SD, skip_count);
-    FAKE_V_STR(S_ADDR_LEN, S_ADDR_SD, skip_count);
-    FAKE_V_STR(S_CMNT_LEN, S_CMNT_SD, skip_count);
-    ADVANCE_STREAM(BBB_CMNT_SD, skip_count);
-    ADVANCE_STREAM(BBB_JNK_SD, skip_count);
-    ADVANCE_STREAM(BBB_OFFSET_SD, skip_count);
-    ADVANCE_STREAM(BBB_TYPE_SD, skip_count);  /* avoid one trudge */
-
-    return(0L);
-}
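The essential trick in speed_seed.c is the skip-ahead identity spelled out in its comment block: X_n = [(Mult ** n) X_0] mod Modulus, so NthElement() can jump a seed forward n steps in O(log n) multiplications by repeatedly squaring the multiplier. Below is a self-contained sketch of the same square-and-multiply idea, assuming plain 64-bit long long in place of DSS_HUGE and using nth_element as a stand-in name:

    #include <stdio.h>

    #define MULTIPLIER 16807LL        /* same generator as rnd.h */
    #define MODULUS    2147483647LL   /* 2^31 - 1                */

    /* returns MULTIPLIER^n * seed0 mod MODULUS in O(log n) steps */
    static long
    nth_element(long n, long seed0)
    {
        long long mult = MULTIPLIER;
        long long z = seed0;

        while (n > 0)
        {
            if (n % 2 != 0)                 /* odd: fold one factor into z */
                z = (mult * z) % MODULUS;
            n /= 2;                         /* integer division truncates  */
            mult = (mult * mult) % MODULUS; /* square the multiplier       */
        }
        return (long) z;
    }

    int
    main(void)
    {
        long seed = 1;
        int  i;

        for (i = 0; i < 3; i++)             /* three single steps ...      */
            seed = (long) ((MULTIPLIER * seed) % MODULUS);

        /* ... must equal a single jump of three */
        printf("%ld == %ld\n", seed, nth_element(3, 1));
        return 0;
    }

This is why the sd_*() functions above only ever call ADVANCE_STREAM(): skipping a stream past the rows generated elsewhere costs logarithmic, not linear, time.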
diff --git a/utils/ssbm/ssbm-ddl.sql b/utils/ssbm/ssbm-ddl.sql
deleted file mode 100644
index 8f331dbb1..000000000
--- a/utils/ssbm/ssbm-ddl.sql
+++ /dev/null
@@ -1,175 +0,0 @@
---
--- PostgreSQL database dump
---
-
--- Dumped from database version 9.5.3
--- Dumped by pg_dump version 9.5.3
-
-SET statement_timeout = 0;
-SET lock_timeout = 0;
-SET client_encoding = 'UTF8';
-SET standard_conforming_strings = on;
-SET check_function_bodies = false;
-SET client_min_messages = warning;
-SET row_security = off;
-
-SET search_path = public, pg_catalog;
-
-SET default_tablespace = '';
-
-SET default_with_oids = false;
-
---
--- Name: customer; Type: TABLE; Schema: public; Owner: postgres
---
-
-CREATE TABLE customer (
-    c_custkey numeric NOT NULL,
-    c_name character varying(25),
-    c_address character varying(25),
-    c_city character(10),
-    c_nation character(15),
-    c_region character(12),
-    c_phone character(15),
-    c_mktsegment character(10)
-);
-
-
-ALTER TABLE customer OWNER TO postgres;
-
---
--- Name: date1; Type: TABLE; Schema: public; Owner: postgres
---
-
-CREATE TABLE date1 (
-    d_datekey integer NOT NULL,
-    d_date character(18),
-    d_dayofweek character(12),
-    d_month character(9),
-    d_year integer,
-    d_yearmonthnum numeric,
-    d_yearmonth character(7),
-    d_daynuminweek numeric,
-    d_daynuminmonth numeric,
-    d_daynuminyear numeric,
-    d_monthnuminyear numeric,
-    d_weeknuminyear numeric,
-    d_sellingseason character(12),
-    d_lastdayinweekfl character(1),
-    d_lastdayinmonthfl character(1),
-    d_holidayfl character(1),
-    d_weekdayfl character(1)
-);
-
-
-ALTER TABLE date1 OWNER TO postgres;
-
---
--- Name: lineorder; Type: TABLE; Schema: public; Owner: postgres
---
-
-CREATE TABLE lineorder (
-    lo_orderkey numeric,
-    lo_linenumber integer,
-    lo_custkey numeric,
-    lo_partkey integer,
-    lo_suppkey numeric,
-    lo_orderdate integer,
-    lo_orderpriority character(15),
-    lo_shippriority character(1),
-    lo_quantity numeric,
-    lo_extendedprice numeric,
-    lo_ordertotalprice numeric,
-    lo_discount numeric,
-    lo_revenue numeric,
-    lo_supplycost numeric,
-    lo_tax numeric,
-    lo_commit_date character(8),
-    lo_shipmode character(10)
-);
-
-
-ALTER TABLE lineorder OWNER TO postgres;
-
---
--- Name: part; Type: TABLE; Schema: public; Owner: postgres
---
-
-CREATE TABLE part (
-    p_partkey integer NOT NULL,
-    p_name character varying(22),
-    p_mfgr character(6),
-    p_category character(7),
-    p_brand1 character(9),
-    p_color character varying(11),
-    p_type character varying(25),
-    p_size numeric,
-    p_container character(10)
-);
-
-
-ALTER TABLE part OWNER TO postgres;
-
---
--- Name: supplier; Type: TABLE; Schema: public; Owner: postgres
---
-
-CREATE TABLE supplier (
-    s_suppkey numeric NOT NULL,
-    s_name character(25),
-    s_address character varying(25),
-    s_city character(10),
-    s_nation character(15),
-    s_region character(12),
-    s_phone character(15)
-);
-
-
-ALTER TABLE supplier OWNER TO postgres;
-
---
--- Name: customer_pkey; Type: CONSTRAINT; Schema: public; Owner: postgres
---
-
-ALTER TABLE ONLY customer
-    ADD CONSTRAINT customer_pkey PRIMARY KEY (c_custkey);
-
-
---
--- Name: date1_pkey; Type: CONSTRAINT; Schema: public; Owner: postgres
---
-
-ALTER TABLE ONLY date1
-    ADD CONSTRAINT date1_pkey PRIMARY KEY (d_datekey);
-
-
---
--- Name: part_pkey; Type: CONSTRAINT; Schema: public; Owner: postgres
---
-
-ALTER TABLE ONLY part
-    ADD CONSTRAINT part_pkey PRIMARY KEY (p_partkey);
-
-
---
--- Name: supplier_pkey; Type: CONSTRAINT; Schema: public; Owner: postgres
---
-
-ALTER TABLE ONLY supplier
-    ADD CONSTRAINT supplier_pkey PRIMARY KEY (s_suppkey);
-
-
---
--- Name: public; Type: ACL; Schema: -; Owner: postgres
---
-
-REVOKE ALL ON SCHEMA public FROM PUBLIC;
-REVOKE ALL ON SCHEMA public FROM postgres;
-GRANT ALL ON SCHEMA public TO postgres;
-GRANT ALL ON SCHEMA public TO PUBLIC;
-
-
---
--- PostgreSQL database dump complete
---
-
diff --git a/utils/ssbm/text.c b/utils/ssbm/text.c
deleted file mode 100644
index ef4df3c73..000000000
--- a/utils/ssbm/text.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/* @(#)text.c 2.1.8.1 */
-/*
- * text.c --- pseudo text generator for use in DBGEN 2.0
- *
- * Defined Routines:
- *      dbg_text() -- select and translate a sentence form
- */
-
-#ifdef TEST
-#define DECLARER
-#endif /* TEST */
-
-#include "config.h"
-#include <stdlib.h>
-#if (defined(_POSIX_)||!defined(WIN32))  /* Change for Windows NT */
-/*#include <unistd.h>
-#include <sys/wait.h>*/
-#endif /* WIN32 */
-#include <stdio.h>  /* */
-#include <limits.h>
-#include <math.h>
-#include <ctype.h>
-#include <signal.h>
-#include <string.h>
-#ifdef HP
-#include <strings.h>
-#endif
-#if (defined(WIN32)&&!defined(_POSIX_))
-#include <process.h>
-#pragma warning(disable:4201)
-#pragma warning(disable:4214)
-#pragma warning(disable:4514)
-#define WIN32_LEAN_AND_MEAN
-#define NOATOM
-#define NOGDICAPMASKS
-#define NOMETAFILE
-#define NOMINMAX
-#define NOMSG
-#define NOOPENFILE
-#define NORASTEROPS
-#define NOSCROLL
-#define NOSOUND
-#define NOSYSMETRICS
-#define NOTEXTMETRIC
-#define NOWH
-#define NOCOMM
-#define NOKANJI
-#define NOMCX
-#include <windows.h>
-#pragma warning(default:4201)
-#pragma warning(default:4214)
-#endif
-
-#include "dss.h"
-#include "dsstypes.h"
-
-/*
- * txt_vp() --
- *      generate a verb phrase by
- *      1) selecting a verb phrase form
- *      2) parsing it to select parts of speech
- *      3) selecting appropriate words
- *      4) adding punctuation as required
- *
- * Returns: length of generated phrase
- * Called By: txt_sentence()
- * Calls: pick_str()
- */
-static int
-txt_vp(char *dest, int sd)
-{
-    char syntax[MAX_GRAMMAR_LEN + 1],
-        *cptr,
-        *parse_target;
-    distribution *src;
-    int i,
-        res = 0;
-
-
-    pick_str(&vp, sd, &syntax[0]);
-    parse_target = syntax;
-    while ((cptr = strtok(parse_target, " ")) != NULL)
-    {
-        src = NULL;
-        switch (*cptr)
-        {
-        case 'D':
-            src = &adverbs;
-            break;
-        case 'V':
-            src = &verbs;
-            break;
-        case 'X':
-            src = &auxillaries;
-            break;
-        }  /* end of POS switch statement */
-        i = pick_str(src, sd, dest);
-        i = strlen(DIST_MEMBER(src, i));
-        dest += i;
-        res += i;
-        if (*(++cptr))  /* miscellaneous filigree, like punctuation */
-        {
-            *dest = *cptr;
-            dest += 1;
-            res += 1;
-        }
-        *dest = ' ';
-        dest++;
-        res++;
-        parse_target = NULL;
-    }  /* end of while loop */
-
-    return(res);
-}
-
-/*
- * txt_np() --
- *      generate a noun phrase by
- *      1) selecting a noun phrase form
- *      2) parsing it to select parts of speech
- *      3) selecting appropriate words
- *      4) adding punctuation as required
- *
- * Returns: length of generated phrase
- * Called By: txt_sentence()
- * Calls: pick_str()
- */
-static int
-txt_np(char *dest, int sd)
-{
-    char syntax[MAX_GRAMMAR_LEN + 1],
-        *cptr,
-        *parse_target;
-    distribution *src;
-    int i,
-        res = 0;
-
-
-    pick_str(&np, sd, &syntax[0]);
-    parse_target = syntax;
-    while ((cptr = strtok(parse_target, " ")) != NULL)
-    {
-        src = NULL;
-        switch (*cptr)
-        {
-        case 'A':
-            src = &articles;
-            break;
-        case 'J':
-            src = &adjectives;
-            break;
-        case 'D':
-            src = &adverbs;
-            break;
-        case 'N':
-            src = &nouns;
-            break;
-        }  /* end of POS switch statement */
-        i = pick_str(src, sd, dest);
-        i = strlen(DIST_MEMBER(src, i));
-        dest += i;
-        res += i;
-        if (*(++cptr))  /* miscellaneous filigree, like punctuation */
-        {
-            *dest = *cptr;
-            dest += 1;
-            res += 1;
-        }
-        *dest = ' ';
-        dest++;
-        res++;
-        parse_target = NULL;
-    }  /* end of while loop */
-
-    return(res);
-}
-
-/*
- * txt_sentence() --
- *      generate a sentence by
- *      1) selecting a sentence form
- *      2) parsing it to select parts of speech or phrase types
- *      3) selecting appropriate words
- *      4) adding punctuation as required
- *
- * Returns: length of generated sentence
- * Called By: dbg_text()
- * Calls: pick_str(), txt_np(), txt_vp()
- */
-static int
-txt_sentence(char *dest, int sd)
-{
-    char syntax[MAX_GRAMMAR_LEN + 1],
-        *cptr;
-    int i,
-        res = 0,
-        len = 0;
-
-
-    pick_str(&grammar, sd, syntax);
-    cptr = syntax;
-
-next_token:  /* I hate goto's, but can't seem to have parent and child use strtok() */
-    while (*cptr && *cptr == ' ')
-        cptr++;
-    if (*cptr == '\0')
-        goto done;
-    switch (*cptr)
-    {
-    case 'V':
-        len = txt_vp(dest, sd);
-        break;
-    case 'N':
-        len = txt_np(dest, sd);
-        break;
-    case 'P':
-        i = pick_str(&prepositions, sd, dest);
-        len = strlen(DIST_MEMBER(&prepositions, i));
-        strcpy((dest + len), " the ");
-        len += 5;
-        len += txt_np(dest + len, sd);
-        break;
-    case 'T':
-        i = pick_str(&terminators, sd, --dest);  /* terminators should abut previous word */
-        len = strlen(DIST_MEMBER(&terminators, i));
-        break;
-    }  /* end of POS switch statement */
-    dest += len;
-    res += len;
-    cptr++;
-    if (*cptr && *cptr != ' ')  /* miscellaneous filigree, like punctuation */
-    {
-        *dest = *cptr;
-        dest += 1;
-        res += 1;
-    }
-    goto next_token;
-done:
-    *dest = '\0';
-    return(--res);
-}
-
-/*
- * dbg_text() --
- *      produce ELIZA-like text of random, bounded length, truncating the last
- *      generated sentence as required
- */
-int
-dbg_text(char *tgt, int min, int max, int sd)
-{
-    long length = 0;
-    int wordlen = 0,
-        needed,
-        s_len;
-    char sentence[MAX_SENT_LEN + 1];
-
-    RANDOM(length, min, max, sd);
-
-    while (wordlen < length)
-    {
-        s_len = txt_sentence(sentence, sd);
-        if (s_len < 0)
-            INTERNAL_ERROR("Bad sentence formation");
-        needed = length - wordlen;
-        if (needed >= s_len + 1)  /* need the entire sentence */
-        {
-            strcpy(tgt, sentence);
-            tgt += s_len;
-            wordlen += s_len + 1;
-            *(tgt++) = ' ';
-        }
-        else  /* chop the new sentence off to match the length target */
-        {
-            sentence[needed] = '\0';
-            strcpy(tgt, sentence);
-            wordlen += needed;
-            tgt += needed;
-        }
-    }
-    *tgt = '\0';
-
-    return(wordlen);
-}
-
-#ifdef TEST
-tdef tdefs = { NULL };
-
-main()
-{
-    char prattle[401];
-
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "nouns", &nouns);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "verbs", &verbs);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "adjectives", &adjectives);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "adverbs", &adverbs);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "auxillaries", &auxillaries);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "terminators", &terminators);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "articles", &articles);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "prepositions", &prepositions);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "grammar", &grammar);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "np", &np);
-    read_dist (env_config (DIST_TAG, DIST_DFLT), "vp", &vp);
-
-    while (1)
-    {
-        dbg_text(&prattle[0], 300, 400, 0);
-        printf("<%s>\n", prattle);
-    }
-
-    return(0);
-}
-#endif /* TEST */
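For reference, text.c builds its ELIZA-like comments by expanding a randomly chosen sentence form (a string of part-of-speech codes), replacing each code with a word from the matching distribution, and letting the terminator abut the previous word. A toy sketch of that expansion follows; the word lists, the round-robin pick() and the fixed "J N V T" form are illustrative stand-ins for dbgen's pick_str()/distribution machinery, not the deleted code:

    #include <stdio.h>
    #include <string.h>

    static const char *nouns[] = {"foxes", "packages", "theodolites"};
    static const char *verbs[] = {"sleep", "nag", "integrate"};
    static const char *adjs[]  = {"quick", "final", "bold"};

    /* stand-in for pick_str(): cycle through the list deterministically */
    static const char *
    pick(const char **words, int n, int *state)
    {
        return words[(*state)++ % n];
    }

    int
    main(void)
    {
        const char *form = "J N V T";   /* adjective noun verb terminator */
        char sentence[128] = "";
        int  state = 0;
        const char *cp;

        for (cp = form; *cp; cp++)
        {
            switch (*cp)
            {
            case 'J': strcat(sentence, pick(adjs,  3, &state)); break;
            case 'N': strcat(sentence, pick(nouns, 3, &state)); break;
            case 'V': strcat(sentence, pick(verbs, 3, &state)); break;
            case 'T': sentence[strlen(sentence) - 1] = '.';     break;
            case ' ': strcat(sentence, " ");                    break;
            }
        }
        printf("%s\n", sentence);   /* -> "quick packages integrate." */
        return 0;
    }

dbg_text() then strings such sentences together until it reaches the requested length, chopping the final sentence to fit, which is why comment columns in the generated data can end mid-sentence.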