diff --git a/src/dpu_join.c b/src/dpu_join.c index 7831d0329..a41850b7c 100644 --- a/src/dpu_join.c +++ b/src/dpu_join.c @@ -13,7 +13,7 @@ /* static variables */ static set_join_pathlist_hook_type set_join_pathlist_next = NULL; -static CustomPathMethods dpujoin_path_methods; + CustomPathMethods dpujoin_path_methods; static CustomScanMethods dpujoin_plan_methods; static CustomExecMethods dpujoin_exec_methods; bool pgstrom_enable_dpujoin; /* GUC */ diff --git a/src/dpu_scan.c b/src/dpu_scan.c index 3c7268b2a..c1453a668 100644 --- a/src/dpu_scan.c +++ b/src/dpu_scan.c @@ -46,15 +46,12 @@ DpuScanAddScanPath(PlannerInfo *root, cpath = buildXpuScanPath(root, baserel, + TASK_KIND__DPUSCAN, (try_parallel > 0), true, /* allow host quals */ false, /* disallow no device quals */ - TASK_KIND__DPUSCAN); - if (cpath && custom_path_remember(root, - baserel, - (try_parallel > 0), - TASK_KIND__DPUSCAN, - cpath)) + &dpuscan_path_methods); + if (cpath) { if (try_parallel == 0) add_path(baserel, &cpath->path); diff --git a/src/gpu_join.c b/src/gpu_join.c index acb8de8e4..a148a3885 100644 --- a/src/gpu_join.c +++ b/src/gpu_join.c @@ -48,6 +48,9 @@ form_pgstrom_plan_info(CustomScan *cscan, pgstromPlanInfo *pp_info) privs = lappend(privs, pp_info->scan_quals_fallback); privs = lappend(privs, __makeFloat(pp_info->scan_tuples)); privs = lappend(privs, __makeFloat(pp_info->scan_rows)); + privs = lappend(privs, __makeFloat(pp_info->scan_startup_cost)); + privs = lappend(privs, __makeFloat(pp_info->scan_run_cost)); + privs = lappend(privs, makeInteger(pp_info->parallel_nworkers)); privs = lappend(privs, __makeFloat(pp_info->parallel_divisor)); privs = lappend(privs, __makeFloat(pp_info->final_cost)); privs = lappend(privs, makeBoolean(pp_info->scan_needs_ctid)); @@ -86,6 +89,8 @@ form_pgstrom_plan_info(CustomScan *cscan, pgstromPlanInfo *pp_info) __privs = lappend(__privs, makeInteger(pp_inner->join_type)); __privs = lappend(__privs, __makeFloat(pp_inner->join_nrows)); + __privs = lappend(__privs, __makeFloat(pp_inner->join_startup_cost)); + __privs = lappend(__privs, __makeFloat(pp_inner->join_run_cost)); __exprs = lappend(__exprs, pp_inner->hash_outer_keys); __privs = lappend(__privs, pp_inner->hash_outer_keys_fallback); __exprs = lappend(__exprs, pp_inner->hash_inner_keys); @@ -141,6 +146,9 @@ deform_pgstrom_plan_info(CustomScan *cscan) pp_data.scan_quals_fallback = list_nth(privs, pindex++); pp_data.scan_tuples = floatVal(list_nth(privs, pindex++)); pp_data.scan_rows = floatVal(list_nth(privs, pindex++)); + pp_data.scan_startup_cost = floatVal(list_nth(privs, pindex++)); + pp_data.scan_run_cost = floatVal(list_nth(privs, pindex++)); + pp_data.parallel_nworkers = intVal(list_nth(privs, pindex++)); pp_data.parallel_divisor = floatVal(list_nth(privs, pindex++)); pp_data.final_cost = floatVal(list_nth(privs, pindex++)); pp_data.scan_needs_ctid = boolVal(list_nth(privs, pindex++)); @@ -183,6 +191,8 @@ deform_pgstrom_plan_info(CustomScan *cscan) pp_inner->join_type = intVal(list_nth(__privs, __pindex++)); pp_inner->join_nrows = floatVal(list_nth(__privs, __pindex++)); + pp_inner->join_startup_cost = floatVal(list_nth(__privs, __pindex++)); + pp_inner->join_run_cost = floatVal(list_nth(__privs, __pindex++)); pp_inner->hash_outer_keys = list_nth(__exprs, __eindex++); pp_inner->hash_outer_keys_fallback = list_nth(__privs, __pindex++); pp_inner->hash_inner_keys = list_nth(__exprs, __eindex++); @@ -208,7 +218,7 @@ deform_pgstrom_plan_info(CustomScan *cscan) * copy_pgstrom_plan_info */ static pgstromPlanInfo * 
-copy_pgstrom_plan_info(pgstromPlanInfo *pp_orig) +copy_pgstrom_plan_info(const pgstromPlanInfo *pp_orig) { pgstromPlanInfo *pp_dest; @@ -250,181 +260,89 @@ copy_pgstrom_plan_info(pgstromPlanInfo *pp_orig) } /* - * extract_input_path_params - * - * centralized point to extract the information from the input path + * try_fetch_xpujoin_planinfo */ -void -extract_input_path_params(const Path *input_path, - const Path *inner_path, /* optional */ - pgstromPlanInfo **p_pp_info, - List **p_input_rels_tlist, - List **p_inner_paths_list) +pgstromPlanInfo * +try_fetch_xpujoin_planinfo(const Path *path) { - const CustomPath *input_cpath = (const CustomPath *)input_path; - pgstromPlanInfo *pp_info; - List *input_rels_tlist; - List *inner_paths_list; - ListCell *lc; + const CustomPath *cpath = (const CustomPath *)path; - Assert(IsA(input_cpath, CustomPath)); - pp_info = linitial(input_cpath->custom_private); - input_rels_tlist = list_make1(makeInteger(pp_info->scan_relid)); - inner_paths_list = list_copy(input_cpath->custom_paths); - foreach (lc, inner_paths_list) - { - Path *i_path = lfirst(lc); - input_rels_tlist = lappend(input_rels_tlist, i_path->pathtarget); - } - if (inner_path) - input_rels_tlist = lappend(input_rels_tlist, inner_path->pathtarget); - - if (p_pp_info) - *p_pp_info = copy_pgstrom_plan_info(pp_info); - if (p_input_rels_tlist) - *p_input_rels_tlist = input_rels_tlist; - if (p_inner_paths_list) - *p_inner_paths_list = inner_paths_list; + if (IsA(cpath, CustomPath) && + (cpath->methods == &gpujoin_path_methods || + cpath->methods == &dpujoin_path_methods)) + return (pgstromPlanInfo *)linitial(cpath->custom_private); + return NULL; } /* - * try_add_simple_xpujoin_path + * __buildXpuJoinPlanInfo */ -static bool -try_add_simple_xpujoin_path(PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outer_rel, - Path *inner_path, - JoinType join_type, - JoinPathExtraData *extra, - bool try_parallel_path, - uint32_t xpu_task_flags, - const CustomPathMethods *xpujoin_path_methods) +static pgstromPlanInfo * +__buildXpuJoinPlanInfo(PlannerInfo *root, + RelOptInfo *joinrel, + JoinType join_type, + List *restrict_clauses, + RelOptInfo *outer_rel, + const pgstromPlanInfo *pp_prev, + List *inner_paths_list) { - Path *outer_path; - RelOptInfo *inner_rel = inner_path->parent; - List *inner_paths_list = NIL; - List *restrict_clauses = extra->restrictlist; - Relids required_outer = NULL; - ParamPathInfo *param_info; - CustomPath *cpath; - pgstromPlanInfo *pp_prev; - pgstromPlanInfo *pp_info; + pgstromPlanInfo *pp_info; pgstromPlanInnerInfo *pp_inner; - List *join_quals = NIL; - List *other_quals = NIL; - List *hash_outer_keys = NIL; - List *hash_inner_keys = NIL; - List *input_rels_tlist = NIL; - bool enable_xpuhashjoin; - bool enable_xpugistindex; - double xpu_ratio; - Cost xpu_tuple_cost; - Cost startup_cost = 0.0; - Cost run_cost = 0.0; - Cost comp_cost = 0.0; - Cost final_cost = 0.0; - QualCost join_quals_cost; - ListCell *lc; + Path *inner_path = llast(inner_paths_list); + RelOptInfo *inner_rel = inner_path->parent; + Cardinality outer_nrows; + Cost startup_cost; + Cost run_cost; + bool enable_xpuhashjoin; + bool enable_xpugistindex; + Cost xpu_tuple_cost; + double xpu_ratio; + Cost comp_cost = 0.0; + Cost final_cost = 0.0; + QualCost join_quals_cost; + List *join_quals = NIL; + List *other_quals = NIL; + List *hash_outer_keys = NIL; + List *hash_inner_keys = NIL; + List *input_rels_tlist = NIL; + ListCell *lc; + + if (!restrict_clauses) + return NULL; /* cross join is not welcome */ - /* sanity
checks */ - Assert(join_type == JOIN_INNER || join_type == JOIN_FULL || - join_type == JOIN_LEFT || join_type == JOIN_RIGHT); /* - * Parameters related to devices + * device-specific parameters */ - if ((xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU) + if ((pp_prev->xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU) { enable_xpuhashjoin = pgstrom_enable_gpuhashjoin; enable_xpugistindex = pgstrom_enable_gpugistindex; xpu_tuple_cost = pgstrom_gpu_tuple_cost; xpu_ratio = pgstrom_gpu_operator_ratio(); } - else if ((xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU) + else if ((pp_prev->xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU) { enable_xpuhashjoin = pgstrom_enable_dpuhashjoin; enable_xpugistindex = pgstrom_enable_dpugistindex; - xpu_tuple_cost = pgstrom_dpu_tuple_cost; + xpu_tuple_cost = pgstrom_dpu_tuple_cost; xpu_ratio = pgstrom_dpu_operator_ratio(); } else { - elog(ERROR, "Bug? unexpected xpu_task_flags: %08x", xpu_task_flags); + elog(ERROR, "Bug? unexpected xpu_task_flags: %08x", + pp_prev->xpu_task_flags); } - /* - * Setup Outer Path - */ - if (IS_SIMPLE_REL(outer_rel)) - { - outer_path = (Path *) buildXpuScanPath(root, - outer_rel, - try_parallel_path, - false, - true, - xpu_task_flags); - if (!outer_path) - return false; - } - else + /* setup input_rels_tlist */ + input_rels_tlist = list_make1(makeInteger(pp_prev->scan_relid)); + foreach (lc, inner_paths_list) { - outer_path = (Path *) custom_path_find_cheapest(root, - outer_rel, - try_parallel_path, - xpu_task_flags); - if (!outer_path) - return false; - } - if (bms_overlap(PATH_REQ_OUTER(outer_path), inner_rel->relids)) - return false; - /* extract the parameters of outer_path */ - extract_input_path_params(outer_path, - inner_path, - &pp_prev, - &input_rels_tlist, - &inner_paths_list); - startup_cost = outer_path->startup_cost; - run_cost = (outer_path->total_cost - - outer_path->startup_cost - pp_prev->final_cost); + Path *i_path = lfirst(lc); + PathTarget *i_target = i_path->pathtarget; - /* - * Check to see if proposed path is still parameterized, and reject - * if the parameterization wouldn't be sensible. - * Note that GpuNestLoop does not support parameterized nest-loop, - * only cross-join or non-symmetric join are supported, therefore, - * calc_non_nestloop_required_outer() is sufficient. - */ - required_outer = calc_non_nestloop_required_outer(outer_path, - inner_path); - if (required_outer && !bms_overlap(required_outer, - extra->param_source_rels)) - { - bms_free(required_outer); - return false; + input_rels_tlist = lappend(input_rels_tlist, i_target); } - /* - * Get param info - */ - param_info = get_joinrel_parampathinfo(root, - joinrel, - outer_path, - inner_path, - extra->sjinfo, - required_outer, - &restrict_clauses); - if (!restrict_clauses) - return false; /* cross join is not welcome */ - - /* - * Setup pgstromPlanInfo - */ - pp_info = palloc0(offsetof(pgstromPlanInfo, inners[pp_prev->num_rels+1])); - memcpy(pp_info, pp_prev, offsetof(pgstromPlanInfo, inners[pp_prev->num_rels])); - pp_info->xpu_task_flags = xpu_task_flags; - pp_info->num_rels = pp_prev->num_rels + 1; - pp_inner = &pp_info->inners[pp_prev->num_rels]; - /* * All the join-clauses must be executable on GPU device.
* Even though older version supports HostQuals to be @@ -438,11 +356,11 @@ try_add_simple_xpujoin_path(PlannerInfo *root, RestrictInfo *rinfo = lfirst(lc); if (!pgstrom_xpu_expression(rinfo->clause, - xpu_task_flags, + pp_prev->xpu_task_flags, input_rels_tlist, NULL)) { - return false; + return NULL; } /* @@ -510,12 +428,18 @@ try_add_simple_xpujoin_path(PlannerInfo *root, bms_free(relids2); } } + /* + * Setup pgstromPlanInfo + */ + pp_info = copy_pgstrom_plan_info(pp_prev); + pp_inner = &pp_info->inners[pp_info->num_rels++]; pp_inner->join_type = join_type; pp_inner->join_nrows = joinrel->rows; pp_inner->hash_outer_keys = hash_outer_keys; pp_inner->hash_inner_keys = hash_inner_keys; pp_inner->join_quals = join_quals; pp_inner->other_quals = other_quals; + /* GiST-Index availability checks */ if (enable_xpugistindex && pp_inner->hash_outer_keys == NIL && pp_inner->hash_inner_keys == NIL) @@ -523,16 +447,29 @@ try_add_simple_xpujoin_path(PlannerInfo *root, inner_path = pgstromTryFindGistIndex(root, inner_path, restrict_clauses, - xpu_task_flags, + pp_info->xpu_task_flags, input_rels_tlist, pp_inner); } /* * Cost estimation */ + if (pp_prev->num_rels == 0) + { + outer_nrows = pp_prev->scan_rows; + startup_cost = pp_prev->scan_startup_cost; + run_cost = pp_prev->scan_run_cost; + } + else + { + const pgstromPlanInnerInfo *__pp_inner = &pp_prev->inners[pp_prev->num_rels-1]; + + outer_nrows = __pp_inner->join_nrows; + startup_cost = __pp_inner->join_startup_cost; + run_cost = __pp_inner->join_run_cost; + } startup_cost += (inner_path->total_cost + inner_path->rows * cpu_tuple_cost); - /* cost for join_quals */ cost_qual_eval(&join_quals_cost, join_quals, root); startup_cost += join_quals_cost.startup; @@ -550,9 +487,9 @@ try_add_simple_xpujoin_path(PlannerInfo *root, /* cost to compute hash value by GPU */ comp_cost += (cpu_operator_cost * xpu_ratio * num_hashkeys * - outer_path->rows); + outer_nrows); /* cost to evaluate join qualifiers */ - comp_cost += join_quals_cost.per_tuple * xpu_ratio * outer_path->rows; + comp_cost += join_quals_cost.per_tuple * xpu_ratio * outer_nrows; } else if (OidIsValid(pp_inner->gist_index_oid)) { @@ -569,10 +506,10 @@ try_add_simple_xpujoin_path(PlannerInfo *root, startup_cost += seq_page_cost * pp_inner->gist_npages; /* cost to evaluate GiST index by GPU */ cost_qual_eval_node(&gist_clause_cost, (Node *)gist_clause, root); - comp_cost += gist_clause_cost.per_tuple * xpu_ratio * outer_path->rows; + comp_cost += gist_clause_cost.per_tuple * xpu_ratio * outer_nrows; /* cost to evaluate join qualifiers by GPU */ comp_cost += (join_quals_cost.per_tuple * xpu_ratio * - outer_path->rows * + outer_nrows * gist_selectivity * inner_path->rows); } @@ -590,18 +527,220 @@ try_add_simple_xpujoin_path(PlannerInfo *root, /* cost to evaluate join qualifiers by GPU */ run_cost += (join_quals_cost.per_tuple * xpu_ratio * inner_path->rows * - outer_path->rows); + outer_nrows); } /* discount if CPU parallel is enabled */ run_cost += (comp_cost / pp_info->parallel_divisor); - /* cost for DMA receive (xPU --> Host) */ final_cost += xpu_tuple_cost * joinrel->rows; - /* cost for host projection */ final_cost += joinrel->reltarget->cost.per_tuple * joinrel->rows; + final_cost += (joinrel->reltarget->cost.per_tuple * + joinrel->rows / pp_info->parallel_divisor); pp_info->final_cost = final_cost; + pp_inner->join_nrows = (joinrel->rows / pp_info->parallel_divisor); + pp_inner->join_startup_cost = startup_cost; + pp_inner->join_run_cost = run_cost; + + return pp_info; +} + +/* + *
buildOuterJoinPlanInfo */ +pgstromPlanInfo * +buildOuterJoinPlanInfo(PlannerInfo *root, + RelOptInfo *outer_rel, + uint32_t xpu_task_flags, + bool try_parallel_path, + ParamPathInfo **p_param_info, + List **p_inner_paths_list) +{ + const pgstromPlanInfo *pp_prev; + pgstromPlanInfo *pp_info; + ParamPathInfo *param_info; /* dummy */ + List *pathlist; + JoinPath *jpath = NULL; + ListCell *lc; + + if (IS_SIMPLE_REL(outer_rel)) + { + pp_info = buildOuterScanPlanInfo(root, + outer_rel, + xpu_task_flags, + try_parallel_path, + false, + true, + &param_info); + if (pp_info) + { + *p_param_info = param_info; + *p_inner_paths_list = NIL; + } + return pp_info; + } + else if (IS_JOIN_REL(outer_rel)) + { + if (!try_parallel_path) + pathlist = outer_rel->pathlist; + else + pathlist = outer_rel->partial_pathlist; + foreach (lc, pathlist) + { + Path *path = lfirst(lc); + + if ((pp_prev = try_fetch_xpuscan_planinfo(path)) != NULL || + (pp_prev = try_fetch_xpujoin_planinfo(path)) != NULL) + { + const CustomPath *cpath = (const CustomPath *)path; + + pp_info = copy_pgstrom_plan_info(pp_prev); + *p_param_info = path->param_info; + *p_inner_paths_list = list_copy(cpath->custom_paths); + return pp_info; + } + else if ((path->pathtype == T_NestLoop || + path->pathtype == T_MergeJoin || + path->pathtype == T_HashJoin) && + (!jpath || jpath->path.total_cost > path->total_cost)) + { + jpath = (JoinPath *)path; + } + } + + /* + * Even if no GpuJoin/GpuScan path exists on the outer relation, + * we try to build the pgstromPlanInfo according to the built-in + * join order. + */ + if (jpath) + { + Path *i_path = jpath->innerjoinpath; + Path *o_path = jpath->outerjoinpath; + List *inner_paths_list = NIL; + + /* only supported join type */ + if (jpath->jointype != JOIN_INNER && + jpath->jointype != JOIN_LEFT && + jpath->jointype != JOIN_FULL && + jpath->jointype != JOIN_RIGHT) + return NULL; + + pp_prev = buildOuterJoinPlanInfo(root, + o_path->parent, + xpu_task_flags, + try_parallel_path, + &param_info, /* dummy */ + &inner_paths_list); + if (!pp_prev) + return NULL; + inner_paths_list = lappend(inner_paths_list, i_path); + pp_info = __buildXpuJoinPlanInfo(root, + jpath->path.parent, + jpath->jointype, + jpath->joinrestrictinfo, + o_path->parent, + pp_prev, + inner_paths_list); + if (pp_info) + { + *p_param_info = jpath->path.param_info; + *p_inner_paths_list = inner_paths_list; + } + return pp_info; + } + } + return NULL; +} + +/* + * try_add_simple_xpujoin_path + */ +static bool +try_add_simple_xpujoin_path(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outer_rel, + Path *inner_path, + JoinType join_type, + JoinPathExtraData *extra, + bool try_parallel_path, + uint32_t xpu_task_flags, + const CustomPathMethods *xpujoin_path_methods) +{ + List *inner_paths_list = NIL; + List *restrict_clauses = extra->restrictlist; + Relids required_outer = NULL; + ParamPathInfo *param_info; + Path outer_path; /* dummy path */ + CustomPath *cpath; + pgstromPlanInfo *pp_prev; + pgstromPlanInfo *pp_info; + pgstromPlanInnerInfo *pp_inner; + + /* sanity checks */ + Assert(join_type == JOIN_INNER || join_type == JOIN_FULL || + join_type == JOIN_LEFT || join_type == JOIN_RIGHT); + /* + * Setup a dummy outer-path node + * + * MEMO: This dummy outer-path node is only used to carry 'parent', + * 'param_info' and 'rows' fields to get_joinrel_parampathinfo(), + * but other fields are not referenced at all. + * So, we set up a simplified dummy outer-path node, not an actual + * outer path.
+ */ + memset(&outer_path, 0, sizeof(Path)); + outer_path.parent = outer_rel; + + pp_prev = buildOuterJoinPlanInfo(root, + outer_rel, + xpu_task_flags, + try_parallel_path, + &outer_path.param_info, + &inner_paths_list); + if (!pp_prev) + return false; + inner_paths_list = lappend(inner_paths_list, inner_path); + if (pp_prev->num_rels == 0) + outer_path.rows = pp_prev->scan_rows; + else + outer_path.rows = pp_prev->inners[pp_prev->num_rels-1].join_nrows; + + /* + * Get param info + */ + required_outer = calc_non_nestloop_required_outer(&outer_path, + inner_path); + if (required_outer && !bms_overlap(required_outer, + extra->param_source_rels)) + { + bms_free(required_outer); + return false; + } + + param_info = get_joinrel_parampathinfo(root, + joinrel, + &outer_path, /* dummy path */ + inner_path, + extra->sjinfo, + required_outer, + &restrict_clauses); + if (!restrict_clauses) + return false; /* cross join is not welcome */ + + /* + * Build a new pgstromPlanInfo + */ + pp_info = __buildXpuJoinPlanInfo(root, + joinrel, + join_type, + restrict_clauses, + outer_rel, + pp_prev, + inner_paths_list); + if (!pp_info) + return false; + pp_inner = &pp_info->inners[pp_info->num_rels-1]; /* * Build the CustomPath @@ -613,27 +752,22 @@ try_add_simple_xpujoin_path(PlannerInfo *root, cpath->path.param_info = param_info; cpath->path.parallel_aware = try_parallel_path; cpath->path.parallel_safe = joinrel->consider_parallel; - cpath->path.parallel_workers = outer_path->parallel_workers; + cpath->path.parallel_workers = pp_info->parallel_nworkers; cpath->path.pathkeys = NIL; - cpath->path.rows = joinrel->rows; - cpath->path.startup_cost = startup_cost; - cpath->path.total_cost = startup_cost + run_cost + final_cost; + cpath->path.rows = pp_inner->join_nrows; + cpath->path.startup_cost = pp_inner->join_startup_cost; + cpath->path.total_cost = (pp_inner->join_startup_cost + + pp_inner->join_run_cost + + pp_info->final_cost); cpath->flags = CUSTOMPATH_SUPPORT_PROJECTION; cpath->methods = xpujoin_path_methods; - cpath->custom_paths = lappend(inner_paths_list, inner_path); + cpath->custom_paths = inner_paths_list; cpath->custom_private = list_make1(pp_info); - - if (custom_path_remember(root, - joinrel, - try_parallel_path, - xpu_task_flags, - cpath)) - { - if (!try_parallel_path) - add_path(joinrel, &cpath->path); - else - add_partial_path(joinrel, &cpath->path); - } + + if (!try_parallel_path) + add_path(joinrel, &cpath->path); + else + add_partial_path(joinrel, &cpath->path); return true; } @@ -960,13 +1094,20 @@ __pgstrom_build_tlist_dev_walker(Node *node, void *__priv) return expression_tree_walker(node, __pgstrom_build_tlist_dev_walker, __priv); } +/* + * __build_explain_tlist_junks + * + * it builds junk TLEs for EXPLAIN output only + */ static void -__build_explain_tlist_junks(build_tlist_dev_context *context) +__build_explain_tlist_junks(codegen_context *context, + PlannerInfo *root, + List *input_rels_tlist, + const Bitmapset *outer_refs) { - PlannerInfo *root = context->root; ListCell *cell; - foreach (cell, context->input_rels_tlist) + foreach (cell, input_rels_tlist) { Node *node = lfirst(cell); @@ -975,23 +1116,25 @@ __build_explain_tlist_junks(build_tlist_dev_context *context) Index relid = intVal(node); RelOptInfo *baserel = root->simple_rel_array[relid]; RangeTblEntry *rte = root->simple_rte_array[relid]; + int j, k; Assert(IS_SIMPLE_REL(baserel) && rte->rtekind == RTE_RELATION); - for (int j=baserel->min_attr; j <= baserel->max_attr; j++) + for (j = bms_next_member(outer_refs, -1); + j >= 0; 
+ j = bms_next_member(outer_refs, j)) { Form_pg_attribute attr; HeapTuple htup; Var *var; ListCell *lc; - if (bms_is_empty(baserel->attr_needed[j-baserel->min_attr])) - continue; + k = j + FirstLowInvalidHeapAttributeNumber; htup = SearchSysCache2(ATTNUM, ObjectIdGetDatum(rte->relid), - Int16GetDatum(j)); + Int16GetDatum(k)); if (!HeapTupleIsValid(htup)) elog(ERROR, "cache lookup failed for attribute %d of relation %u", - j, rte->relid); + k, rte->relid); attr = (Form_pg_attribute) GETSTRUCT(htup); var = makeVar(baserel->relid, attr->attnum, @@ -999,7 +1142,6 @@ __build_explain_tlist_junks(build_tlist_dev_context *context) attr->atttypmod, attr->attcollation, 0); - foreach (lc, context->tlist_dev) { TargetEntry *tle = lfirst(lc); @@ -1108,8 +1250,6 @@ pgstrom_build_tlist_dev(PlannerInfo *root, context.only_vars = true; __pgstrom_build_tlist_dev_walker((Node *)host_quals, &context); - context.resjunk = true; - __build_explain_tlist_junks(&context); return context.tlist_dev; } @@ -1152,7 +1292,6 @@ pgstrom_build_groupby_dev(PlannerInfo *root, } } } - __build_explain_tlist_junks(&context); return context.tlist_dev; } @@ -1178,7 +1317,7 @@ PlanXpuJoinPathCommon(PlannerInfo *root, List *input_rels_tlist; List *fallback_tlist = NIL; ListCell *lc; - + Assert(pp_info->num_rels == list_length(custom_plans)); codegen_context_init(&context, pp_info->xpu_task_flags); input_rels_tlist = list_make1(makeInteger(pp_info->scan_relid)); @@ -1249,10 +1388,10 @@ PlanXpuJoinPathCommon(PlannerInfo *root, */ if ((pp_info->xpu_task_flags & DEVTASK__MASK) == DEVTASK__PREAGG) { - context.tlist_dev = pgstrom_build_groupby_dev(root, - tlist, - NIL, - input_rels_tlist); + context.tlist_dev = pgstrom_build_groupby_dev(root, + tlist, + NIL, + input_rels_tlist); codegen_build_groupby_actions(&context, pp_info); } else @@ -1269,6 +1408,8 @@ PlanXpuJoinPathCommon(PlannerInfo *root, pull_varattnos((Node *)context.tlist_dev, pp_info->scan_relid, &outer_refs); + __build_explain_tlist_junks(&context, root, input_rels_tlist, outer_refs); + /* assign remaining PlanInfo members */ pp_info->kexp_join_quals_packed = codegen_build_packed_joinquals(&context, diff --git a/src/gpu_preagg.c b/src/gpu_preagg.c index 1fcfeeed1..9ce270468 100644 --- a/src/gpu_preagg.c +++ b/src/gpu_preagg.c @@ -871,11 +871,13 @@ aggfunc_catalog_lookup_by_oid(Oid aggfn_oid) */ typedef struct { - bool device_executable; + bool device_executable; PlannerInfo *root; RelOptInfo *group_rel; + RelOptInfo *input_rel; + ParamPathInfo *param_info; double num_groups; - Path *input_path; + bool try_parallel; PathTarget *target_upper; PathTarget *target_partial; PathTarget *target_final; @@ -884,7 +886,6 @@ typedef struct List *input_rels_tlist; List *inner_paths_list; Node *havingQual; - uint32_t xpu_task_flags; const CustomPathMethods *custom_path_methods; } xpugroupby_build_path_context; @@ -1012,7 +1013,7 @@ make_alternative_aggref(xpugroupby_build_path_context *con, Aggref *aggref) if (type_oid != dest_oid) expr = make_expr_typecast(expr, dest_oid); if (!pgstrom_xpu_expression(expr, - con->xpu_task_flags, + pp_info->xpu_task_flags, con->input_rels_tlist, NULL)) { @@ -1176,7 +1177,7 @@ xpugroupby_build_path_target(xpugroupby_build_path_context *con) } /* grouping-key must be device executable. 
*/ if (!pgstrom_xpu_expression(expr, - con->xpu_task_flags, + pp_info->xpu_task_flags, con->input_rels_tlist, NULL)) { @@ -1263,27 +1264,27 @@ prepend_partial_groupby_custompath(xpugroupby_build_path_context *con) { Query *parse = con->root->parse; CustomPath *cpath = makeNode(CustomPath); - Path *input_path = con->input_path; PathTarget *target_partial = con->target_partial; pgstromPlanInfo *pp_info = con->pp_info; + double input_nrows = PP_INFO_NUM_ROWS(pp_info); double num_group_keys; double xpu_ratio; Cost xpu_operator_cost; Cost xpu_tuple_cost; - Cost startup_cost = 0.0; - Cost run_cost = 0.0; - Cost final_cost = 0.0; + Cost startup_cost = PP_INFO_STARTUP_COST(pp_info); + Cost run_cost = PP_INFO_RUN_COST(pp_info); + Cost final_cost; /* * Parameters related to devices */ - if ((con->xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU) + if ((pp_info->xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU) { xpu_operator_cost = pgstrom_gpu_operator_cost; xpu_tuple_cost = pgstrom_gpu_tuple_cost; xpu_ratio = pgstrom_gpu_operator_ratio(); } - else if ((con->xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU) + else if ((pp_info->xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU) { xpu_operator_cost = pgstrom_dpu_operator_cost; xpu_tuple_cost = pgstrom_dpu_tuple_cost; @@ -1291,31 +1292,26 @@ prepend_partial_groupby_custompath(xpugroupby_build_path_context *con) } else { - elog(ERROR, "Bug? unexpected task_kind: %08x", con->xpu_task_flags); + elog(ERROR, "Bug? unexpected task_kind: %08x", pp_info->xpu_task_flags); } - startup_cost = input_path->startup_cost; - run_cost = (input_path->total_cost - - input_path->startup_cost - pp_info->final_cost); /* Cost estimation for grouping */ num_group_keys = list_length(parse->groupClause); startup_cost += (xpu_operator_cost * num_group_keys * - input_path->rows); + input_nrows); /* Cost estimation for aggregate function */ - startup_cost += (target_partial->cost.per_tuple * input_path->rows + + startup_cost += (target_partial->cost.per_tuple * input_nrows + target_partial->cost.startup) * xpu_ratio; /* Cost estimation to fetch results */ final_cost = xpu_tuple_cost * con->num_groups; - if (input_path->parallel_workers > 0) - final_cost *= (0.5 + (double)input_path->parallel_workers); cpath->path.pathtype = T_CustomScan; - cpath->path.parent = input_path->parent; + cpath->path.parent = con->input_rel; cpath->path.pathtarget = con->target_partial; - cpath->path.param_info = input_path->param_info; - cpath->path.parallel_safe = input_path->parallel_safe; - cpath->path.parallel_aware = input_path->parallel_aware; - cpath->path.parallel_workers = input_path->parallel_workers; + cpath->path.param_info = con->param_info; + cpath->path.parallel_aware = con->try_parallel; + cpath->path.parallel_safe = con->input_rel->consider_parallel; + cpath->path.parallel_workers = pp_info->parallel_nworkers; cpath->path.rows = con->num_groups; cpath->path.startup_cost = startup_cost; cpath->path.total_cost = startup_cost + run_cost + final_cost; @@ -1381,40 +1377,47 @@ try_add_final_groupby_paths(xpugroupby_build_path_context *con, static void __xpupreagg_add_custompath(PlannerInfo *root, - Path *input_path, RelOptInfo *group_rel, + RelOptInfo *input_rel, + pgstromPlanInfo *pp_info, + ParamPathInfo *param_info, + List *inner_paths_list, void *extra, bool try_parallel, double num_groups, - uint32_t xpu_task_flags, const CustomPathMethods *custom_path_methods) { xpugroupby_build_path_context con; Path *part_path; + List *input_rels_tlist; + 
ListCell *lc; + + input_rels_tlist = list_make1(makeInteger(pp_info->scan_relid)); + foreach (lc, inner_paths_list) + { + Path *i_path = (Path *)lfirst(lc); + input_rels_tlist = lappend(input_rels_tlist, i_path->pathtarget); + } /* setup context */ memset(&con, 0, sizeof(con)); con.device_executable = true; con.root = root; con.group_rel = group_rel; + con.input_rel = input_rel; + con.param_info = param_info; con.num_groups = num_groups; - con.input_path = input_path; + con.try_parallel = try_parallel; con.target_upper = root->upper_targets[UPPERREL_GROUP_AGG]; con.target_partial = create_empty_pathtarget(); con.target_final = create_empty_pathtarget(); - con.xpu_task_flags = xpu_task_flags; + con.pp_info = pp_info; + con.input_rels_tlist = input_rels_tlist; + con.inner_paths_list = inner_paths_list; con.custom_path_methods = custom_path_methods; - extract_input_path_params(input_path, - NULL, - &con.pp_info, - &con.input_rels_tlist, - &con.inner_paths_list); /* construction of the target-list for each level */ if (!xpugroupby_build_path_target(&con)) return; - - con.pp_info->xpu_task_flags = xpu_task_flags; - /* build partial groupby custom-path */ part_path = prepend_partial_groupby_custompath(&con); @@ -1445,14 +1448,13 @@ __xpupreagg_add_custompath(PlannerInfo *root, void xpupreagg_add_custompath(PlannerInfo *root, - RelOptInfo *input_rel, - RelOptInfo *group_rel, - void *extra, - uint32_t xpu_task_flags, - const CustomPathMethods *custom_path_methods) + RelOptInfo *input_rel, + RelOptInfo *group_rel, + void *extra, + uint32_t xpu_task_flags, + const CustomPathMethods *custom_path_methods) { Query *parse = root->parse; - Path *input_path; /* quick bailout if not supported */ if (parse->groupingSets != NIL || @@ -1464,24 +1466,17 @@ xpupreagg_add_custompath(PlannerInfo *root, for (int try_parallel=0; try_parallel < 2; try_parallel++) { - if (IS_SIMPLE_REL(input_rel)) - { - input_path = (Path *)buildXpuScanPath(root, - input_rel, - (try_parallel > 0), - false, - true, - xpu_task_flags); - } - else - { - input_path = (Path *)custom_path_find_cheapest(root, - input_rel, - (try_parallel > 0), - xpu_task_flags); - } - - if (input_path) + pgstromPlanInfo *pp_info; + ParamPathInfo *param_info = NULL; + List *inner_paths_list = NIL; + + pp_info = buildOuterJoinPlanInfo(root, + input_rel, + xpu_task_flags, + (try_parallel > 0), + &param_info, + &inner_paths_list); + if (pp_info) { double num_groups = 1.0; @@ -1490,22 +1485,26 @@ xpupreagg_add_custompath(PlannerInfo *root, { GroupPathExtraData *gp_extra = extra; List *groupExprs; + double input_nrows = PP_INFO_NUM_ROWS(pp_info); /* see get_number_of_groups() */ groupExprs = get_sortgrouplist_exprs(parse->groupClause, gp_extra->targetList); num_groups = estimate_num_groups(root, groupExprs, - input_path->rows, + input_nrows, NULL, NULL); } __xpupreagg_add_custompath(root, - input_path, group_rel, + input_rel, + pp_info, + param_info, + inner_paths_list, extra, (try_parallel > 0), num_groups, - xpu_task_flags, custom_path_methods); + } } } @@ -1532,11 +1531,11 @@ gpupreagg_add_custompath(PlannerInfo *root, return; /* add custom-paths */ xpupreagg_add_custompath(root, - input_rel, - group_rel, - extra, - TASK_KIND__GPUPREAGG, - &gpupreagg_path_methods); + input_rel, + group_rel, + extra, + TASK_KIND__GPUPREAGG, + &gpupreagg_path_methods); } /* diff --git a/src/gpu_scan.c b/src/gpu_scan.c index 33a640fcf..c6b6dfffc 100644 --- a/src/gpu_scan.c +++ b/src/gpu_scan.c @@ -20,20 +20,67 @@ static bool enable_gpuscan; /* GUC */ static bool 
enable_pullup_outer_scan; /* GUC */ /* - * __setupXpuScanPath + * sort_device_qualifiers */ -static CustomPath * -__setupXpuScanPath(PlannerInfo *root, - RelOptInfo *baserel, - ParamPathInfo *param_info, - bool parallel_path, - uint32_t xpu_task_flags, - List *dev_quals, - List *host_quals) +void +sort_device_qualifiers(List *dev_quals_list, List *dev_costs_list) +{ + int nitems = list_length(dev_quals_list); + ListCell **dev_quals = alloca(sizeof(ListCell *) * nitems); + int *dev_costs = alloca(sizeof(int) * nitems); + int i, j, k; + ListCell *lc1, *lc2; + + i = 0; + forboth (lc1, dev_quals_list, + lc2, dev_costs_list) + { + dev_quals[i] = lc1; + dev_costs[i] = lfirst_int(lc2); + i++; + } + Assert(i == nitems); + + for (i=0; i < nitems; i++) + { + int dcost = dev_costs[i]; + void *dqual = dev_quals[i]->ptr_value; + + k = i; + for (j=i+1; j < nitems; j++) + { + if (dcost > dev_costs[j]) + { + dcost = dev_costs[j]; + dqual = dev_quals[j]->ptr_value; + k = j; + } + } + + if (i != k) + { + dev_costs[k] = dev_costs[i]; + dev_costs[i] = dcost; + dev_quals[k]->ptr_value = dev_quals[i]->ptr_value; + dev_quals[i]->ptr_value = dqual; + } + } +} + +/* + * buildOuterScanPlanInfo + */ +static pgstromPlanInfo * +__buildOuterScanPlanInfo(PlannerInfo *root, + RelOptInfo *baserel, + uint32_t xpu_task_flags, + bool parallel_path, + List *dev_quals, + List *host_quals, + Cardinality scan_nrows) { RangeTblEntry *rte = root->simple_rte_array[baserel->relid]; - CustomPath *cpath = makeNode(CustomPath); - pgstromPlanInfo *pp_info = palloc0(sizeof(pgstromPlanInfo)); + pgstromPlanInfo *pp_info; int gpu_cache_dindex = -1; const Bitmapset *gpu_direct_devs = NULL; const DpuStorageEntry *ds_entry = NULL; @@ -54,7 +101,7 @@ __setupXpuScanPath(PlannerInfo *root, double xpu_ratio; double xpu_tuple_cost; QualCost qcost; - double ntuples; + double ntuples = baserel->tuples; double selectivity; /* @@ -68,7 +115,7 @@ __setupXpuScanPath(PlannerInfo *root, baserel->pages, -1, max_parallel_workers_per_gather); if (parallel_nworkers <= 0) - return false; + return NULL; parallel_divisor = (double)parallel_nworkers; if (parallel_leader_participation) { @@ -76,6 +123,9 @@ __setupXpuScanPath(PlannerInfo *root, if (leader_contribution > 0.0) parallel_divisor += leader_contribution; } + /* discount # of rows to be produced per backend */ + ntuples /= parallel_divisor; + scan_nrows /= parallel_divisor; } /* @@ -100,7 +150,6 @@ __setupXpuScanPath(PlannerInfo *root, pgstrom_gpu_direct_seq_page_cost * baserel->allvisfrac; else avg_seq_page_cost = spc_seq_page_cost; - cpath->methods = &gpuscan_path_methods; } else if ((xpu_task_flags & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU) { @@ -113,10 +162,9 @@ __setupXpuScanPath(PlannerInfo *root, else ds_entry = GetOptimalDpuForBaseRel(root, baserel); if (!ds_entry) - return false; + return NULL; avg_seq_page_cost = (spc_seq_page_cost * (1.0 - baserel->allvisfrac) + pgstrom_dpu_seq_page_cost * baserel->allvisfrac); - cpath->methods = &dpuscan_path_methods; } else { @@ -131,7 +179,6 @@ __setupXpuScanPath(PlannerInfo *root, disk_cost = avg_seq_page_cost * baserel->pages; if (parallel_path) disk_cost /= parallel_divisor; - ntuples = baserel->tuples; /* * Is BRIN-index available? 
@@ -200,9 +247,10 @@ __setupXpuScanPath(PlannerInfo *root, * Cost for host projection */ startup_cost += baserel->reltarget->cost.startup; - final_cost += baserel->reltarget->cost.per_tuple * baserel->rows; + final_cost += baserel->reltarget->cost.per_tuple * scan_nrows; /* Setup the result */ + pp_info = palloc0(sizeof(pgstromPlanInfo)); pp_info->xpu_task_flags = xpu_task_flags; pp_info->gpu_cache_dindex = gpu_cache_dindex; pp_info->gpu_direct_devs = gpu_direct_devs; @@ -211,8 +259,11 @@ __setupXpuScanPath(PlannerInfo *root, pp_info->host_quals = extract_actual_clauses(host_quals, false); pp_info->scan_quals = extract_actual_clauses(dev_quals, false); pp_info->scan_tuples = baserel->tuples; - pp_info->scan_rows = baserel->rows; + pp_info->scan_rows = scan_nrows; + pp_info->parallel_nworkers = parallel_nworkers; pp_info->parallel_divisor = parallel_divisor; + pp_info->scan_startup_cost = startup_cost; + pp_info->scan_run_cost = run_cost; pp_info->final_cost = final_cost; if (indexOpt) { @@ -224,83 +275,17 @@ __setupXpuScanPath(PlannerInfo *root, pull_varattnos((Node *)pp_info->host_quals, baserel->relid, &outer_refs); pull_varattnos((Node *)pp_info->scan_quals, baserel->relid, &outer_refs); pp_info->outer_refs = outer_refs; - - cpath->path.pathtype = T_CustomScan; - cpath->path.parent = baserel; - cpath->path.pathtarget = baserel->reltarget; - cpath->path.param_info = param_info; - cpath->path.parallel_aware = (parallel_nworkers > 0); - cpath->path.parallel_safe = baserel->consider_parallel; - cpath->path.parallel_workers = parallel_nworkers; - cpath->path.rows = (param_info ? param_info->ppi_rows : baserel->rows); - cpath->path.startup_cost = startup_cost; - cpath->path.total_cost = startup_cost + run_cost + pp_info->final_cost; - cpath->path.pathkeys = NIL; /* unsorted results */ - cpath->flags = CUSTOMPATH_SUPPORT_PROJECTION; - cpath->custom_paths = NIL; - cpath->custom_private = list_make1(pp_info); - Assert(cpath->methods != NULL); - return cpath; + return pp_info; } -/* - * sort_device_qualifiers - */ -void -sort_device_qualifiers(List *dev_quals_list, List *dev_costs_list) -{ - int nitems = list_length(dev_quals_list); - ListCell **dev_quals = alloca(sizeof(ListCell *) * nitems); - int *dev_costs = alloca(sizeof(int) * nitems); - int i, j, k; - ListCell *lc1, *lc2; - - i = 0; - forboth (lc1, dev_quals_list, - lc2, dev_costs_list) - { - dev_quals[i] = lc1; - dev_costs[i] = lfirst_int(lc2); - i++; - } - Assert(i == nitems); - - for (i=0; i < nitems; i++) - { - int dcost = dev_costs[i]; - void *dqual = dev_quals[i]->ptr_value; - - k = i; - for (j=i+1; j < nitems; j++) - { - if (dcost > dev_costs[j]) - { - dcost = dev_costs[j]; - dqual = dev_quals[j]->ptr_value; - k = j; - } - } - - if (i != k) - { - dev_costs[k] = dev_costs[i]; - dev_costs[i] = dcost; - dev_quals[k]->ptr_value = dev_quals[i]->ptr_value; - dev_quals[i]->ptr_value = dqual; - } - } -} - -/* - * buildXpuScanPath - */ -CustomPath * -buildXpuScanPath(PlannerInfo *root, - RelOptInfo *baserel, - bool parallel_path, - bool allow_host_quals, - bool allow_no_device_quals, - uint32_t xpu_task_flags) +pgstromPlanInfo * +buildOuterScanPlanInfo(PlannerInfo *root, + RelOptInfo *baserel, + uint32_t xpu_task_flags, + bool parallel_path, + bool allow_host_quals, + bool allow_no_device_quals, + ParamPathInfo **p_param_info) { RangeTblEntry *rte = root->simple_rte_array[baserel->relid]; List *input_rels_tlist = list_make1(makeInteger(baserel->relid)); @@ -308,6 +293,7 @@ buildXpuScanPath(PlannerInfo *root, List *dev_costs = NIL; List 
*host_quals = NIL; ParamPathInfo *param_info; + Cardinality scan_nrows = baserel->rows; ListCell *lc; Assert(IS_SIMPLE_REL(baserel)); @@ -345,10 +331,14 @@ buildXpuScanPath(PlannerInfo *root, dev_quals = lappend(dev_quals, rinfo); dev_costs = lappend_int(dev_costs, devcost); } - else + else if (allow_host_quals) { host_quals = lappend(host_quals, rinfo); } + else + { + return NULL; + } } /* also checks parameterized qualifiers */ param_info = get_baserel_parampathinfo(root, baserel, @@ -367,22 +357,75 @@ buildXpuScanPath(PlannerInfo *root, dev_quals = lappend(dev_quals, rinfo); dev_costs = lappend_int(dev_costs, devcost); } - else + else if (allow_host_quals) + { host_quals = lappend(host_quals, rinfo); + } + else + { + return NULL; + } } + scan_nrows = param_info->ppi_rows; } - sort_device_qualifiers(dev_quals, dev_costs); - if (!allow_host_quals && host_quals != NIL) - return NULL; + *p_param_info = param_info; if (!allow_no_device_quals && dev_quals == NIL) return NULL; - return __setupXpuScanPath(root, - baserel, - param_info, - parallel_path, - xpu_task_flags, - dev_quals, - host_quals); + sort_device_qualifiers(dev_quals, dev_costs); + return __buildOuterScanPlanInfo(root, + baserel, + xpu_task_flags, + parallel_path, + dev_quals, + host_quals, + scan_nrows); +} + +/* + * buildXpuScanPath + */ +CustomPath * +buildXpuScanPath(PlannerInfo *root, + RelOptInfo *baserel, + uint32_t xpu_task_flags, + bool parallel_path, + bool allow_host_quals, + bool allow_no_device_quals, + const CustomPathMethods *xpuscan_path_methods) +{ + pgstromPlanInfo *pp_info; + CustomPath *cpath; + ParamPathInfo *param_info; + + pp_info = buildOuterScanPlanInfo(root, + baserel, + xpu_task_flags, + parallel_path, + allow_host_quals, + allow_no_device_quals, + &param_info); + if (!pp_info) + return NULL; + + cpath = makeNode(CustomPath); + cpath->path.pathtype = T_CustomScan; + cpath->path.parent = baserel; + cpath->path.pathtarget = baserel->reltarget; + cpath->path.param_info = param_info; + cpath->path.parallel_aware = (pp_info->parallel_nworkers > 0); + cpath->path.parallel_safe = baserel->consider_parallel; + cpath->path.parallel_workers = pp_info->parallel_nworkers; + cpath->path.rows = pp_info->scan_rows; + cpath->path.startup_cost = pp_info->scan_startup_cost; + cpath->path.total_cost = (pp_info->scan_startup_cost + + pp_info->scan_run_cost + + pp_info->final_cost); + cpath->path.pathkeys = NIL; /* unsorted results */ + cpath->flags = CUSTOMPATH_SUPPORT_PROJECTION; + cpath->custom_paths = NIL; + cpath->custom_private = list_make1(pp_info); + cpath->methods = xpuscan_path_methods; + return cpath; } /* @@ -413,15 +456,12 @@ GpuScanAddScanPath(PlannerInfo *root, cpath = buildXpuScanPath(root, baserel, + TASK_KIND__GPUSCAN, (try_parallel > 0), true, /* allow host quals */ false, /* disallow no device quals */ - TASK_KIND__GPUSCAN); - if (cpath && custom_path_remember(root, - baserel, - (try_parallel > 0), - TASK_KIND__GPUSCAN, - cpath)) + &gpuscan_path_methods); + if (cpath) { if (try_parallel == 0) add_path(baserel, &cpath->path); diff --git a/src/main.c b/src/main.c index d8045648f..a33bb802d 100644 --- a/src/main.c +++ b/src/main.c @@ -107,108 +107,6 @@ pgstrom_init_gucs(void) NULL, NULL, NULL); } -/* - * xPU-aware path tracker - * - * motivation: add_path() and add_partial_path() keeps only cheapest paths. - * Once some other dominates GpuXXX paths, it shall be wiped out, even if - * it potentially has a chance for more optimization (e.g, GpuJoin outer - * pull-up, GpuPreAgg + GpuJoin combined mode).
- * So, we preserve PG-Strom related Path-nodes for the later referenced. - */ -typedef struct -{ - PlannerInfo *root; - Relids relids; - bool parallel_path; - uint32_t devkind; /* one of DEVKIND_* */ - CustomPath *cpath; -} custom_path_entry; - -static HTAB *custom_path_htable = NULL; - -static uint32 -custom_path_entry_hashvalue(const void *key, Size keysize) -{ - custom_path_entry *cent = (custom_path_entry *)key; - uint32 hash; - - hash = hash_bytes((unsigned char *)¢->root, sizeof(PlannerInfo *)); - hash ^= bms_hash_value(cent->relids); - if (cent->parallel_path) - hash ^= 0x9e3779b9U; - hash ^= hash_uint32(cent->devkind); - - return hash; -} - -static int -custom_path_entry_compare(const void *key1, const void *key2, Size keysize) -{ - custom_path_entry *cent1 = (custom_path_entry *)key1; - custom_path_entry *cent2 = (custom_path_entry *)key2; - - if (cent1->root == cent2->root && - bms_equal(cent1->relids, cent2->relids) && - cent1->parallel_path == cent2->parallel_path && - cent1->devkind == cent2->devkind) - return 0; - /* not equal */ - return 1; -} - -CustomPath * -custom_path_find_cheapest(PlannerInfo *root, - RelOptInfo *rel, - bool parallel_path, - uint32_t devkind) -{ - custom_path_entry hkey; - custom_path_entry *cent; - - memset(&hkey, 0, sizeof(custom_path_entry)); - hkey.root = root; - hkey.relids = rel->relids; - hkey.parallel_path = (parallel_path ? true : false); - hkey.devkind = (devkind & DEVKIND__ANY); - - cent = hash_search(custom_path_htable, &hkey, HASH_FIND, NULL); - if (!cent) - return NULL; - return cent->cpath; -} - -bool -custom_path_remember(PlannerInfo *root, - RelOptInfo *rel, - bool parallel_path, - uint32_t devkind, - const CustomPath *cpath) -{ - custom_path_entry hkey; - custom_path_entry *cent; - bool found; - - Assert((devkind & DEVKIND__ANY) == DEVKIND__NVIDIA_GPU || - (devkind & DEVKIND__ANY) == DEVKIND__NVIDIA_DPU); - memset(&hkey, 0, sizeof(custom_path_entry)); - hkey.root = root; - hkey.relids = rel->relids; - hkey.parallel_path = (parallel_path ? true : false); - hkey.devkind = (devkind & DEVKIND__ANY); - - cent = hash_search(custom_path_htable, &hkey, HASH_ENTER, &found); - if (found) - { - /* new path is more expensive than prior one! 
*/ - if (cent->cpath->path.total_cost <= cpath->path.total_cost) - return false; - } - cent->cpath = (CustomPath *)pgstrom_copy_pathnode(&cpath->path); - - return true; -} - /* -------------------------------------------------------------------------------- * * add/remove dummy plan node @@ -413,41 +311,13 @@ pgstrom_post_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) { - HTAB *custom_path_htable_saved = custom_path_htable; - HASHCTL hctl; PlannedStmt *pstmt; ListCell *lc; - PG_TRY(); - { - memset(&hctl, 0, sizeof(HASHCTL)); - hctl.hcxt = CurrentMemoryContext; - hctl.keysize = offsetof(custom_path_entry, cpath); - hctl.entrysize = sizeof(custom_path_entry); - hctl.hash = custom_path_entry_hashvalue; - hctl.match = custom_path_entry_compare; - custom_path_htable = hash_create("HTable to preserve Custom-Paths", - 512, - &hctl, - HASH_CONTEXT | - HASH_ELEM | - HASH_FUNCTION | - HASH_COMPARE); - pstmt = planner_hook_next(parse, - query_string, - cursorOptions, - boundParams); - } - PG_CATCH(); - { - hash_destroy(custom_path_htable); - custom_path_htable = custom_path_htable_saved; - PG_RE_THROW(); - } - PG_END_TRY(); - hash_destroy(custom_path_htable); - custom_path_htable = custom_path_htable_saved; - + pstmt = planner_hook_next(parse, + query_string, + cursorOptions, + boundParams); /* remove dummy plan */ pgstrom_removal_dummy_plans(pstmt, &pstmt->planTree); foreach (lc, pstmt->subplans) diff --git a/src/pg_strom.h b/src/pg_strom.h index 7b393e167..65358917e 100644 --- a/src/pg_strom.h +++ b/src/pg_strom.h @@ -237,6 +237,8 @@ typedef struct { JoinType join_type; /* one of JOIN_* */ double join_nrows; /* estimated nrows in this depth */ + Cost join_startup_cost; /* estimated startup cost */ + Cost join_run_cost; /* estimated run cost */ List *hash_outer_keys;/* hash-keys for outer-side */ List *hash_outer_keys_fallback; List *hash_inner_keys;/* hash-keys for inner-side */ @@ -272,6 +274,9 @@ typedef struct List *scan_quals_fallback;/* 'scan_quals' for CPU fallback */ double scan_tuples; /* copy of baserel->tuples */ double scan_rows; /* copy of baserel->rows */ + Cost scan_startup_cost; /* estimated startup cost to scan baserel */ + Cost scan_run_cost; /* estimated run cost to scan baserel */ + int parallel_nworkers; /* # of parallel workers */ double parallel_divisor; /* parallel divisor */ Cost final_cost; /* cost for sendback and host-side tasks */ bool scan_needs_ctid; /* FIXME: true, if ctid is referenced */ @@ -307,6 +312,19 @@ typedef struct pgstromPlanInnerInfo inners[FLEXIBLE_ARRAY_MEMBER]; } pgstromPlanInfo; +#define PP_INFO_NUM_ROWS(pp_info) \ + ((pp_info)->num_rels == 0 \ + ? (pp_info)->scan_rows \ + : (pp_info)->inners[(pp_info)->num_rels - 1].join_nrows) +#define PP_INFO_STARTUP_COST(pp_info) \ + ((pp_info)->num_rels == 0 \ + ? (pp_info)->scan_startup_cost \ + : (pp_info)->inners[(pp_info)->num_rels - 1].join_startup_cost) +#define PP_INFO_RUN_COST(pp_info) \ + ((pp_info)->num_rels == 0 \ + ? 
(pp_info)->scan_run_cost \ + : (pp_info)->inners[(pp_info)->num_rels - 1].join_run_cost) + /* * pgstromSharedState */ @@ -805,12 +823,20 @@ extern void gpuCachePutDeviceBuffer(void *gc_lmap); extern void sort_device_qualifiers(List *dev_quals_list, List *dev_costs_list); extern pgstromPlanInfo *try_fetch_xpuscan_planinfo(const Path *path); +extern pgstromPlanInfo *buildOuterScanPlanInfo(PlannerInfo *root, + RelOptInfo *baserel, + uint32_t xpu_task_flags, + bool parallel_path, + bool allow_host_quals, + bool allow_no_device_quals, + ParamPathInfo **p_param_info); extern CustomPath *buildXpuScanPath(PlannerInfo *root, RelOptInfo *baserel, + uint32_t task_kind, bool parallel_path, bool allow_host_quals, bool allow_no_device_quals, - uint32_t task_kind); + const CustomPathMethods *methods); extern CustomScan *PlanXpuScanPathCommon(PlannerInfo *root, RelOptInfo *baserel, CustomPath *best_path, @@ -830,11 +856,13 @@ extern void pgstrom_init_gpu_scan(void); extern void form_pgstrom_plan_info(CustomScan *cscan, pgstromPlanInfo *pp_info); extern pgstromPlanInfo *deform_pgstrom_plan_info(CustomScan *cscan); -extern void extract_input_path_params(const Path *input_path, - const Path *inner_path, /* optional */ - pgstromPlanInfo **p_pp_info, - List **p_input_paths_tlist, - List **p_inner_paths_list); +extern pgstromPlanInfo *try_fetch_xpujoin_planinfo(const Path *path); +extern pgstromPlanInfo *buildOuterJoinPlanInfo(PlannerInfo *root, + RelOptInfo *outer_rel, + uint32_t xpu_task_flags, + bool try_parallel_path, + ParamPathInfo **p_param_info, + List **p_inner_paths_list); extern void xpujoin_add_custompath(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *outerrel, @@ -946,6 +974,7 @@ extern void pgstrom_init_dpu_scan(void); /* * dpu_join.c */ +extern CustomPathMethods dpujoin_path_methods; extern bool pgstrom_enable_dpujoin; extern bool pgstrom_enable_dpuhashjoin; extern bool pgstrom_enable_dpugistindex; @@ -994,15 +1023,6 @@ extern bool pgstrom_enabled; extern bool pgstrom_cpu_fallback_enabled; extern bool pgstrom_regression_test_mode; extern int pgstrom_max_async_tasks; -extern CustomPath *custom_path_find_cheapest(PlannerInfo *root, - RelOptInfo *rel, - bool parallel_aware, - uint32_t devkind); -extern bool custom_path_remember(PlannerInfo *root, - RelOptInfo *rel, - bool parallel_aware, - uint32_t devkind, - const CustomPath *cpath); extern Path *pgstrom_create_dummy_path(PlannerInfo *root, Path *subpath); extern void _PG_init(void);
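
Note on the new PP_INFO_* accessors (pg_strom.h hunk above): they always report the row count and cost of the deepest pulled-up level, so callers no longer need the outer Path node at all. The following standalone sketch uses simplified stand-in types (not the real pg_strom.h layout) to show the arithmetic that buildXpuScanPath() and try_add_simple_xpujoin_path() apply on top of them:

/* Standalone sketch (stand-in types, not the real pg_strom.h layout):
 * how the PP_INFO_* accessors feed a path's row count and total cost. */
#include <stdio.h>

typedef double Cost;

typedef struct {
    double join_nrows;          /* rows produced at this join depth */
    Cost   join_startup_cost;
    Cost   join_run_cost;
} InnerInfo;

typedef struct {
    int       num_rels;         /* number of pulled-up inner relations */
    double    scan_rows;
    Cost      scan_startup_cost;
    Cost      scan_run_cost;
    Cost      final_cost;       /* sendback and host-side tasks */
    InnerInfo inners[8];
} PlanInfo;

/* depth 0 reads the scan_* fields; otherwise the deepest inners[] entry */
#define PP_INFO_NUM_ROWS(pp)     ((pp)->num_rels == 0 ? (pp)->scan_rows \
    : (pp)->inners[(pp)->num_rels - 1].join_nrows)
#define PP_INFO_STARTUP_COST(pp) ((pp)->num_rels == 0 ? (pp)->scan_startup_cost \
    : (pp)->inners[(pp)->num_rels - 1].join_startup_cost)
#define PP_INFO_RUN_COST(pp)     ((pp)->num_rels == 0 ? (pp)->scan_run_cost \
    : (pp)->inners[(pp)->num_rels - 1].join_run_cost)

int main(void)
{
    /* one scan level plus one join depth */
    PlanInfo pp = { .num_rels = 1,
                    .scan_rows = 1000000.0,
                    .scan_startup_cost = 10.0,
                    .scan_run_cost = 500.0,
                    .final_cost = 25.0 };
    pp.inners[0] = (InnerInfo){ .join_nrows = 200000.0,
                                .join_startup_cost = 40.0,
                                .join_run_cost = 800.0 };
    /* same arithmetic as buildXpuScanPath()/try_add_simple_xpujoin_path():
     * total = startup + run + final, where final is kept separate because
     * it disappears when the path is pulled up into a deeper xPU node */
    Cost total = PP_INFO_STARTUP_COST(&pp) + PP_INFO_RUN_COST(&pp) + pp.final_cost;
    printf("rows=%.0f total_cost=%.1f\n", PP_INFO_NUM_ROWS(&pp), total);
    return 0;
}

Keeping final_cost out of the per-depth run cost is what lets the preagg and join pull-up code reuse PP_INFO_RUN_COST() directly, adding its own sendback cost instead.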
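buildOuterJoinPlanInfo() recurses along the outer side of the cheapest built-in join (NestLoop/MergeJoin/HashJoin) until it reaches a base relation, appending one inner path per depth, so an N-way join flattens into a single pgstromPlanInfo with N inners[] entries. A toy sketch of that flattening over a hypothetical binary join tree (the node type and names below are illustrative only; in the real code the inner side is an arbitrary Path):

#include <stdio.h>

typedef struct Node {
    const char  *relname;   /* non-NULL for a base relation */
    struct Node *outer;     /* join children, NULL for base rels */
    struct Node *inner;
} Node;

/* Recurse along the outer edge, collecting inner relations in join order */
static int
flatten_outer_join_tree(const Node *n, const Node *inners[], int depth)
{
    if (n->relname)
        return depth;               /* base relation: recursion floor */
    depth = flatten_outer_join_tree(n->outer, inners, depth);
    inners[depth++] = n->inner;     /* one pgstromPlanInnerInfo per depth */
    return depth;
}

int main(void)
{
    Node a = { "t0" }, b = { "t1" }, c = { "t2" };
    Node j1 = { NULL, &a, &b };     /* (t0 JOIN t1) */
    Node j2 = { NULL, &j1, &c };    /* ((t0 JOIN t1) JOIN t2) */
    const Node *inners[8];
    int nrels = flatten_outer_join_tree(&j2, inners, 0);

    for (int i = 0; i < nrels; i++)
        printf("depth %d: inner = %s\n", i + 1, inners[i]->relname);
    return 0;
}

This is why the hash-table path tracker in main.c could be dropped: instead of remembering previously built CustomPaths, the planner info is rebuilt on demand by walking whatever join tree the core planner kept.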
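sort_device_qualifiers() (moved verbatim within gpu_scan.c above) is a selection sort that swaps the ptr_value payloads of the two parallel Lists in place, so the cheapest device qualifiers are evaluated first. A plain-array sketch of the same idea, with hypothetical names:

/* Sketch of the selection sort behind sort_device_qualifiers():
 * order qualifiers ascending by device cost, swapping payloads in place. */
#include <stdio.h>

static void
sort_quals_by_cost(const char *quals[], int costs[], int nitems)
{
    for (int i = 0; i < nitems; i++)
    {
        int k = i;
        for (int j = i + 1; j < nitems; j++)
        {
            if (costs[j] < costs[k])
                k = j;              /* remember the cheapest remaining entry */
        }
        if (k != i)
        {
            int tmp_cost = costs[i];
            const char *tmp_qual = quals[i];
            costs[i] = costs[k];  quals[i] = quals[k];
            costs[k] = tmp_cost;  quals[k] = tmp_qual;
        }
    }
}

int main(void)
{
    const char *quals[] = { "a = 1", "st_contains(g1,g2)", "b > 0" };
    int costs[] = { 1, 100, 1 };

    sort_quals_by_cost(quals, costs, 3);
    for (int i = 0; i < 3; i++)
        printf("%d: %s (cost=%d)\n", i, quals[i], costs[i]);
    return 0;
}

O(n^2) is fine here because a scan rarely has more than a handful of device qualifiers, and swapping payloads avoids rebuilding the Lists.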