From 1b59447ec058755838c4c10352a0e99dbe96f500 Mon Sep 17 00:00:00 2001 From: Weizhen Wang Date: Thu, 19 Sep 2024 12:11:30 +0800 Subject: [PATCH 1/2] This is an automated cherry-pick of #56117 Signed-off-by: ti-chi-bot --- pkg/planner/cardinality/row_count_index.go | 585 ++++++++++++++++++ pkg/planner/core/casetest/index/index_test.go | 232 +++++++ pkg/util/ranger/types.go | 567 +++++++++++++++++ 3 files changed, 1384 insertions(+) create mode 100644 pkg/planner/cardinality/row_count_index.go create mode 100644 pkg/planner/core/casetest/index/index_test.go create mode 100644 pkg/util/ranger/types.go diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go new file mode 100644 index 0000000000000..68ea1dc13d3e5 --- /dev/null +++ b/pkg/planner/cardinality/row_count_index.go @@ -0,0 +1,585 @@ +// Copyright 2023 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cardinality + +import ( + "bytes" + "math" + "slices" + "strings" + "time" + + "github.com/pingcap/errors" + "github.com/pingcap/failpoint" + "github.com/pingcap/tidb/pkg/kv" + "github.com/pingcap/tidb/pkg/planner/context" + "github.com/pingcap/tidb/pkg/planner/util/debugtrace" + "github.com/pingcap/tidb/pkg/planner/util/fixcontrol" + "github.com/pingcap/tidb/pkg/sessionctx/stmtctx" + "github.com/pingcap/tidb/pkg/statistics" + "github.com/pingcap/tidb/pkg/types" + "github.com/pingcap/tidb/pkg/util/chunk" + "github.com/pingcap/tidb/pkg/util/codec" + "github.com/pingcap/tidb/pkg/util/collate" + "github.com/pingcap/tidb/pkg/util/mathutil" + "github.com/pingcap/tidb/pkg/util/ranger" +) + +// GetRowCountByIndexRanges estimates the row count by a slice of Range. +func GetRowCountByIndexRanges(sctx context.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, err error) { + var name string + if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { + debugtrace.EnterContextCommon(sctx) + debugTraceGetRowCountInput(sctx, idxID, indexRanges) + defer func() { + debugtrace.RecordAnyValuesWithNames(sctx, "Name", name, "Result", result) + debugtrace.LeaveContextCommon(sctx) + }() + } + sc := sctx.GetSessionVars().StmtCtx + idx := coll.GetIdx(idxID) + colNames := make([]string, 0, 8) + if idx != nil { + if idx.Info != nil { + name = idx.Info.Name.O + for _, col := range idx.Info.Columns { + colNames = append(colNames, col.Name.O) + } + } + } + recordUsedItemStatsStatus(sctx, idx, coll.PhysicalID, idxID) + if statistics.IndexStatsIsInvalid(sctx, idx, coll, idxID) { + colsLen := -1 + if idx != nil && idx.Info.Unique { + colsLen = len(idx.Info.Columns) + } + result, err = getPseudoRowCountByIndexRanges(sc.TypeCtx(), indexRanges, float64(coll.RealtimeCount), colsLen) + if err == nil && sc.EnableOptimizerCETrace && idx != nil { + ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats-Pseudo", uint64(result)) 
+ } + return result, err + } + realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx) + if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { + debugtrace.RecordAnyValuesWithNames(sctx, + "Histogram NotNull Count", idx.Histogram.NotNullCount(), + "TopN total count", idx.TopN.TotalCount(), + "Increase Factor", idx.GetIncreaseFactor(realtimeCnt), + ) + } + if idx.CMSketch != nil && idx.StatsVer == statistics.Version1 { + result, err = getIndexRowCountForStatsV1(sctx, coll, idxID, indexRanges) + } else { + result, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount) + } + if sc.EnableOptimizerCETrace { + ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result)) + } + return result, errors.Trace(err) +} + +func getIndexRowCountForStatsV1(sctx context.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (float64, error) { + sc := sctx.GetSessionVars().StmtCtx + debugTrace := sc.EnableOptimizerDebugTrace + if debugTrace { + debugtrace.EnterContextCommon(sctx) + defer debugtrace.LeaveContextCommon(sctx) + } + idx := coll.GetIdx(idxID) + totalCount := float64(0) + for _, ran := range indexRanges { + if debugTrace { + debugTraceStartEstimateRange(sctx, ran, nil, nil, totalCount) + } + rangePosition := getOrdinalOfRangeCond(sc, ran) + var rangeVals []types.Datum + // Try to enum the last range values. + if rangePosition != len(ran.LowVal) { + rangeVals = statistics.EnumRangeValues(ran.LowVal[rangePosition], ran.HighVal[rangePosition], ran.LowExclude, ran.HighExclude) + if rangeVals != nil { + rangePosition++ + } + } + // If first one is range, just use the previous way to estimate; if it is [NULL, NULL] range + // on single-column index, use previous way as well, because CMSketch does not contain null + // values in this case. 
+ if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) { + realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx) + count, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount) + if err != nil { + return 0, errors.Trace(err) + } + if debugTrace { + debugTraceEndEstimateRange(sctx, count, debugTraceRange) + } + totalCount += count + continue + } + var selectivity float64 + // use CM Sketch to estimate the equal conditions + if rangeVals == nil { + bytes, err := codec.EncodeKey(sc.TimeZone(), nil, ran.LowVal[:rangePosition]...) + err = sc.HandleError(err) + if err != nil { + return 0, errors.Trace(err) + } + selectivity, err = getEqualCondSelectivity(sctx, coll, idx, bytes, rangePosition, ran) + if err != nil { + return 0, errors.Trace(err) + } + } else { + bytes, err := codec.EncodeKey(sc.TimeZone(), nil, ran.LowVal[:rangePosition-1]...) + err = sc.HandleError(err) + if err != nil { + return 0, errors.Trace(err) + } + prefixLen := len(bytes) + for _, val := range rangeVals { + bytes = bytes[:prefixLen] + bytes, err = codec.EncodeKey(sc.TimeZone(), bytes, val) + err = sc.HandleError(err) + if err != nil { + return 0, err + } + res, err := getEqualCondSelectivity(sctx, coll, idx, bytes, rangePosition, ran) + if err != nil { + return 0, errors.Trace(err) + } + selectivity += res + } + } + // use histogram to estimate the range condition + if rangePosition != len(ran.LowVal) { + rang := ranger.Range{ + LowVal: []types.Datum{ran.LowVal[rangePosition]}, + LowExclude: ran.LowExclude, + HighVal: []types.Datum{ran.HighVal[rangePosition]}, + HighExclude: ran.HighExclude, + Collators: []collate.Collator{ran.Collators[rangePosition]}, + } + var count float64 + var err error + colUniqueIDs := coll.Idx2ColUniqueIDs[idxID] + var colUniqueID int64 + if rangePosition >= len(colUniqueIDs) { + colUniqueID = -1 + } else { + colUniqueID = colUniqueIDs[rangePosition] + } + // prefer index stats over column stats + if idxIDs, 
ok := coll.ColUniqueID2IdxIDs[colUniqueID]; ok && len(idxIDs) > 0 { + idxID := idxIDs[0] + count, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang}) + } else { + count, err = GetRowCountByColumnRanges(sctx, coll, colUniqueID, []*ranger.Range{&rang}) + } + if err != nil { + return 0, errors.Trace(err) + } + selectivity = selectivity * count / idx.TotalRowCount() + } + count := selectivity * idx.TotalRowCount() + if debugTrace { + debugTraceEndEstimateRange(sctx, count, debugTraceRange) + } + totalCount += count + } + if totalCount > idx.TotalRowCount() { + totalCount = idx.TotalRowCount() + } + return totalCount, nil +} + +// isSingleColIdxNullRange checks if a range is [NULL, NULL] on a single-column index. +func isSingleColIdxNullRange(idx *statistics.Index, ran *ranger.Range) bool { + if len(idx.Info.Columns) > 1 { + return false + } + l, h := ran.LowVal[0], ran.HighVal[0] + if l.IsNull() && h.IsNull() { + return true + } + return false +} + +// It uses the modifyCount to adjust the influence of modifications on the table. +func getIndexRowCountForStatsV2(sctx context.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, error) { + sc := sctx.GetSessionVars().StmtCtx + debugTrace := sc.EnableOptimizerDebugTrace + if debugTrace { + debugtrace.EnterContextCommon(sctx) + defer debugtrace.LeaveContextCommon(sctx) + } + totalCount := float64(0) + isSingleColIdx := len(idx.Info.Columns) == 1 + for _, indexRange := range indexRanges { + var count float64 + lb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...) + err = sc.HandleError(err) + if err != nil { + return 0, err + } + rb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.HighVal...) 
+ err = sc.HandleError(err) + if err != nil { + return 0, err + } + if debugTrace { + debugTraceStartEstimateRange(sctx, indexRange, lb, rb, totalCount) + } + fullLen := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == len(idx.Info.Columns) + if bytes.Equal(lb, rb) { + // case 1: it's a point + if indexRange.LowExclude || indexRange.HighExclude { + if debugTrace { + debugTraceEndEstimateRange(sctx, 0, debugTraceImpossible) + } + continue + } + if fullLen { + // At most 1 in this case. + if idx.Info.Unique { + if !indexRange.IsOnlyNull() { + totalCount++ + if debugTrace { + debugTraceEndEstimateRange(sctx, 1, debugTraceUniquePoint) + } + continue + } + totalCount = float64(idx.NullCount) + if debugTrace { + debugTraceEndEstimateRange(sctx, float64(idx.NullCount), debugTraceUniquePoint) + } + continue + } + count = equalRowCountOnIndex(sctx, idx, lb, realtimeRowCount, modifyCount) + // If the current table row count has changed, we should scale the row count accordingly. + count *= idx.GetIncreaseFactor(realtimeRowCount) + if debugTrace { + debugTraceEndEstimateRange(sctx, count, debugTracePoint) + } + totalCount += count + continue + } + } + + // case 2: it's an interval + // The final interval is [low, high) + if indexRange.LowExclude { + lb = kv.Key(lb).PrefixNext() + } + if !indexRange.HighExclude { + rb = kv.Key(rb).PrefixNext() + } + l := types.NewBytesDatum(lb) + r := types.NewBytesDatum(rb) + lowIsNull := bytes.Equal(lb, nullKeyBytes) + if isSingleColIdx && lowIsNull { + count += float64(idx.Histogram.NullCount) + } + expBackoffSuccess := false + // Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything. + // If the first column's range is point. 
+ if rangePosition := getOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer >= statistics.Version2 && coll != nil { + var expBackoffSel float64 + expBackoffSel, expBackoffSuccess, err = expBackoffEstimation(sctx, idx, coll, indexRange) + if err != nil { + return 0, err + } + if expBackoffSuccess { + expBackoffCnt := expBackoffSel * idx.TotalRowCount() + + upperLimit := expBackoffCnt + // Use the multi-column stats to calculate the max possible row count of [l, r) + if idx.Histogram.Len() > 0 { + _, lowerBkt, _, _ := idx.Histogram.LocateBucket(sctx, l) + _, upperBkt, _, _ := idx.Histogram.LocateBucket(sctx, r) + if debugTrace { + statistics.DebugTraceBuckets(sctx, &idx.Histogram, []int{lowerBkt - 1, upperBkt}) + } + // Use Count of the Bucket before l as the lower bound. + preCount := float64(0) + if lowerBkt > 0 { + preCount = float64(idx.Histogram.Buckets[lowerBkt-1].Count) + } + // Use Count of the Bucket where r exists as the upper bound. + upperCnt := float64(idx.Histogram.Buckets[upperBkt].Count) + upperLimit = upperCnt - preCount + upperLimit += float64(idx.TopN.BetweenCount(sctx, lb, rb)) + } + + // If the result of exponential backoff strategy is larger than the result from multi-column stats, + // use the upper limit from multi-column histogram instead. + if expBackoffCnt > upperLimit { + expBackoffCnt = upperLimit + } + count += expBackoffCnt + } + } + if !expBackoffSuccess { + count += betweenRowCountOnIndex(sctx, idx, l, r) + } + + // If the current table row count has changed, we should scale the row count accordingly. 
+ increaseFactor := idx.GetIncreaseFactor(realtimeRowCount) + count *= increaseFactor + + // handling the out-of-range part + if (outOfRangeOnIndex(idx, l) && !(isSingleColIdx && lowIsNull)) || outOfRangeOnIndex(idx, r) { + histNDV := idx.NDV + // Exclude the TopN in Stats Version 2 + if idx.StatsVer == statistics.Version2 { + c := coll.GetCol(idx.Histogram.ID) + // If this is single column of a multi-column index - use the column's NDV rather than index NDV + isSingleColRange := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == 1 + if isSingleColRange && !isSingleColIdx && c != nil && c.Histogram.NDV > 0 { + histNDV = c.Histogram.NDV - int64(c.TopN.Num()) + } else { + histNDV -= int64(idx.TopN.Num()) + } + } + count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount, histNDV, increaseFactor) + } + + if debugTrace { + debugTraceEndEstimateRange(sctx, count, debugTraceRange) + } + totalCount += count + } + allowZeroEst := fixcontrol.GetBoolWithDefault( + sctx.GetSessionVars().GetOptimizerFixControlMap(), + fixcontrol.Fix47400, + false, + ) + if allowZeroEst { + totalCount = mathutil.Clamp(totalCount, 0, float64(realtimeRowCount)) + } else { + // Don't allow the final result to go below 1 row + totalCount = mathutil.Clamp(totalCount, 1, float64(realtimeRowCount)) + } + return totalCount, nil +} + +var nullKeyBytes, _ = codec.EncodeKey(time.UTC, nil, types.NewDatum(nil)) + +func equalRowCountOnIndex(sctx context.PlanContext, idx *statistics.Index, b []byte, realtimeRowCount, modifyCount int64) (result float64) { + if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { + debugtrace.EnterContextCommon(sctx) + debugtrace.RecordAnyValuesWithNames(sctx, "Encoded Value", b) + defer func() { + debugtrace.RecordAnyValuesWithNames(sctx, "Result", result) + debugtrace.LeaveContextCommon(sctx) + }() + } + if len(idx.Info.Columns) == 1 { + if bytes.Equal(b, nullKeyBytes) { + return float64(idx.Histogram.NullCount) + } + } + val := 
types.NewBytesDatum(b) + if idx.StatsVer < statistics.Version2 { + if idx.Histogram.NDV > 0 && outOfRangeOnIndex(idx, val) { + return outOfRangeEQSelectivity(sctx, idx.Histogram.NDV, realtimeRowCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() + } + if idx.CMSketch != nil { + return float64(idx.QueryBytes(sctx, b)) + } + histRowCount, _ := idx.Histogram.EqualRowCount(sctx, val, false) + return histRowCount + } + // stats version == 2 + // 1. try to find this value in TopN + if idx.TopN != nil { + count, found := idx.TopN.QueryTopN(sctx, b) + if found { + return float64(count) + } + } + // 2. try to find this value in bucket.Repeat(the last value in every bucket) + histCnt, matched := idx.Histogram.EqualRowCount(sctx, val, true) + if matched { + return histCnt + } + // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) + histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num())) + if histNDV <= 0 { + // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. + if modifyCount == 0 { + return 0 + } + return 1 + } + return idx.Histogram.NotNullCount() / histNDV +} + +// expBackoffEstimation estimate the multi-col cases following the Exponential Backoff. See comment below for details. 
+func expBackoffEstimation(sctx context.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRange *ranger.Range) (sel float64, success bool, err error) {
+	// Contract:
+	//   - sel is the combined selectivity of the per-column conditions of indexRange.
+	//   - success is false when no usable single-column estimate was found; the caller
+	//     must then fall back to another estimation method.
+	//   - err is the error returned by the underlying column/index estimation, if any.
+	if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
+		debugtrace.EnterContextCommon(sctx)
+		defer func() {
+			debugtrace.RecordAnyValuesWithNames(sctx,
+				"Result", sel,
+				"Success", success,
+				"error", err,
+			)
+			debugtrace.LeaveContextCommon(sctx)
+		}()
+	}
+	// A single reusable one-column range; it is refilled for every column of indexRange.
+	tmpRan := []*ranger.Range{
+		{
+			LowVal:    make([]types.Datum, 1),
+			HighVal:   make([]types.Datum, 1),
+			Collators: make([]collate.Collator, 1),
+		},
+	}
+	colsIDs := coll.Idx2ColUniqueIDs[idx.Histogram.ID]
+	singleColumnEstResults := make([]float64, 0, len(indexRange.LowVal))
+	// The following code uses Exponential Backoff to reduce the impact of the independence assumption. It works like:
+	//   1. Calc the selectivity of each column.
+	//   2. Sort them and choose the first 4 most selective filters; the corresponding selectivities are
+	//      sel_1, sel_2, sel_3, sel_4 where i < j => sel_i < sel_j.
+	//   3. The final selectivity is sel_1 * sel_2^{1/2} * sel_3^{1/4} * sel_4^{1/8}.
+	// This weakens the independence assumption and usually works better than a plain product.
+	for i := range indexRange.LowVal {
+		tmpRan[0].LowVal[0] = indexRange.LowVal[i]
+		tmpRan[0].HighVal[0] = indexRange.HighVal[i]
+		// Every index column may have its own collation, so use the collator that belongs to
+		// column i. Fall back to the first collator when the range does not carry one for it.
+		collator := indexRange.Collators[0]
+		if i < len(indexRange.Collators) && indexRange.Collators[i] != nil {
+			collator = indexRange.Collators[i]
+		}
+		tmpRan[0].Collators[0] = collator
+		// Only the last column of a range can be exclusive; earlier columns are points.
+		if i == len(indexRange.LowVal)-1 {
+			tmpRan[0].LowExclude = indexRange.LowExclude
+			tmpRan[0].HighExclude = indexRange.HighExclude
+		}
+		colID := colsIDs[i]
+		var (
+			count       float64
+			selectivity float64
+			err         error
+			foundStats  bool
+		)
+		// Prefer the column's own statistics.
+		if !statistics.ColumnStatsIsInvalid(coll.GetCol(colID), sctx, coll, colID) {
+			foundStats = true
+			count, err = GetRowCountByColumnRanges(sctx, coll, colID, tmpRan)
+			selectivity = count / float64(coll.RealtimeCount)
+		}
+		if idxIDs, ok := coll.ColUniqueID2IdxIDs[colID]; ok && !foundStats && len(indexRange.LowVal) > 1 {
+			// Note the `len(indexRange.LowVal) > 1` condition here, it means we only recursively call
+			// `GetRowCountByIndexRanges()` when the input `indexRange` is a multi-column range. This
+			// check avoids infinite recursion.
+			for _, idxID := range idxIDs {
+				if idxID == idx.Histogram.ID {
+					continue
+				}
+				idxStats := coll.GetIdx(idxID)
+				if idxStats == nil || statistics.IndexStatsIsInvalid(sctx, idxStats, coll, idxID) {
+					continue
+				}
+				foundStats = true
+				count, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan)
+				if err != nil {
+					// This index could not be estimated; try the next candidate. The error is
+					// kept and reported below if no later index succeeds.
+					continue
+				}
+				// The selectivity must be derived from the successful estimate before leaving the
+				// loop, otherwise it would stay at its zero value.
+				realtimeCnt, _ := coll.GetScaledRealtimeAndModifyCnt(idxStats)
+				selectivity = count / float64(realtimeCnt)
+				break
+			}
+		}
+		if !foundStats {
+			continue
+		}
+		if err != nil {
+			return 0, false, err
+		}
+		singleColumnEstResults = append(singleColumnEstResults, selectivity)
+	}
+	// Sort them so that the most selective filter comes first.
+	slices.Sort(singleColumnEstResults)
+	l := len(singleColumnEstResults)
+	failpoint.Inject("cleanEstResults", func() {
+		singleColumnEstResults = singleColumnEstResults[:0]
+		l = 0
+	})
+	if l == 0 {
+		return 0, false, nil
+	}
+	if l == 1 {
+		return singleColumnEstResults[0], true, nil
+	}
+	// Do not allow the exponential backoff to go below the available index bound. If the number of predicates
+	// is less than the number of index columns - use 90% of the bound to differentiate a subset from full index match.
+	// If there is an individual column selectivity that goes below this bound, use that selectivity only.
+	histNDV := coll.RealtimeCount
+	if idx.NDV > 0 {
+		histNDV = idx.NDV
+	}
+	idxLowBound := 1 / float64(min(histNDV, coll.RealtimeCount))
+	if l < len(idx.Info.Columns) {
+		idxLowBound /= 0.9
+	}
+	minTwoCol := min(singleColumnEstResults[0], singleColumnEstResults[1], idxLowBound)
+	multTwoCol := singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1])
+	if l == 2 {
+		return max(minTwoCol, multTwoCol), true, nil
+	}
+	minThreeCol := min(minTwoCol, singleColumnEstResults[2])
+	multThreeCol := multTwoCol * math.Sqrt(math.Sqrt(singleColumnEstResults[2]))
+	if l == 3 {
+		return max(minThreeCol, multThreeCol), true, nil
+	}
+	// Only the four most selective filters take part in the estimation.
+	minFourCol := min(minThreeCol, singleColumnEstResults[3])
+	multFourCol := multThreeCol * math.Sqrt(math.Sqrt(math.Sqrt(singleColumnEstResults[3])))
+	return max(minFourCol, multFourCol), true, nil
+}
+
+// outOfRangeOnIndex checks if the datum is out of the range.
+// It returns false when the histogram does not report val as out of range, and also when
+// the histogram is non-empty and val is a string/bytes prefix of its first bound.
+func outOfRangeOnIndex(idx *statistics.Index, val types.Datum) bool {
+	if !idx.Histogram.OutOfRange(val) {
+		return false
+	}
+	if idx.Histogram.Len() > 0 && matchPrefix(idx.Histogram.Bounds.GetRow(0), 0, &val) {
+		return false
+	}
+	return true
+}
+
+// matchPrefix checks whether ad is the prefix of value.
+// Only string-like datum kinds are compared; every other kind yields false.
+func matchPrefix(row chunk.Row, colIdx int, ad *types.Datum) bool {
+	switch ad.Kind() {
+	case types.KindString, types.KindBytes, types.KindBinaryLiteral, types.KindMysqlBit:
+		return strings.HasPrefix(row.GetString(colIdx), ad.GetString())
+	}
+	return false
+}
+
+// betweenRowCountOnIndex estimates the row count for interval [l, r).
+// The input sctx is just for debug trace, you can pass nil safely if that's not needed.
+func betweenRowCountOnIndex(sctx context.PlanContext, idx *statistics.Index, l, r types.Datum) float64 { + histBetweenCnt := idx.Histogram.BetweenRowCount(sctx, l, r) + if idx.StatsVer == statistics.Version1 { + return histBetweenCnt + } + return float64(idx.TopN.BetweenCount(sctx, l.GetBytes(), r.GetBytes())) + histBetweenCnt +} + +// getOrdinalOfRangeCond gets the ordinal of the position range condition, +// if not exist, it returns the end position. +func getOrdinalOfRangeCond(sc *stmtctx.StatementContext, ran *ranger.Range) int { + for i := range ran.LowVal { + a, b := ran.LowVal[i], ran.HighVal[i] + cmp, err := a.Compare(sc.TypeCtx(), &b, ran.Collators[0]) + if err != nil { + return 0 + } + if cmp != 0 { + return i + } + } + return len(ran.LowVal) +} diff --git a/pkg/planner/core/casetest/index/index_test.go b/pkg/planner/core/casetest/index/index_test.go new file mode 100644 index 0000000000000..ac4c77b31b930 --- /dev/null +++ b/pkg/planner/core/casetest/index/index_test.go @@ -0,0 +1,232 @@ +// Copyright 2023 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package index + +import ( + "fmt" + "testing" + + "github.com/pingcap/tidb/pkg/testkit" + "github.com/pingcap/tidb/pkg/testkit/testdata" + "github.com/pingcap/tidb/pkg/util" +) + +func TestNullConditionForPrefixIndex(t *testing.T) { + store := testkit.CreateMockStore(t) + tk := testkit.NewTestKit(t, store) + tk.MustExec("use test") + tk.MustExec(`CREATE TABLE t1 ( + id char(1) DEFAULT NULL, + c1 varchar(255) DEFAULT NULL, + c2 text DEFAULT NULL, + KEY idx1 (c1), + KEY idx2 (c1,c2(5)) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin`) + tk.MustExec("set tidb_cost_model_version=2") + tk.MustExec("create table t2(a int, b varchar(10), index idx(b(5)))") + tk.MustExec("create table t3(a int, b varchar(10), c int, primary key (a, b(5)) clustered)") + tk.MustExec("set tidb_opt_prefix_index_single_scan = 1") + tk.MustExec("insert into t1 values ('a', '0xfff', '111111'), ('b', '0xfff', '22 '), ('c', '0xfff', ''), ('d', '0xfff', null)") + tk.MustExec("insert into t2 values (1, 'aaaaaa'), (2, 'bb '), (3, ''), (4, null)") + tk.MustExec("insert into t3 values (1, 'aaaaaa', 2), (1, 'bb ', 3), (1, '', 4)") + + var input []string + var output []struct { + SQL string + Plan []string + Result []string + } + integrationSuiteData := GetIntegrationSuiteData() + integrationSuiteData.LoadTestCases(t, &input, &output) + for i, tt := range input { + testdata.OnRecord(func() { + output[i].SQL = tt + output[i].Plan = testdata.ConvertRowsToStrings(tk.MustQuery("explain format='brief' " + tt).Rows()) + output[i].Result = testdata.ConvertRowsToStrings(tk.MustQuery(tt).Sort().Rows()) + }) + tk.MustQuery("explain format='brief' " + tt).Check(testkit.Rows(output[i].Plan...)) + tk.MustQuery(tt).Sort().Check(testkit.Rows(output[i].Result...)) + } + + // test plan cache + tk.MustExec(`set tidb_enable_prepared_plan_cache=1`) + tk.MustExec("set @@tidb_enable_collect_execution_info=0") + tk.MustExec("prepare stmt from 'select count(1) from t1 where c1 = ? 
and c2 is not null'") + tk.MustExec("set @a = '0xfff'") + tk.MustQuery("execute stmt using @a").Check(testkit.Rows("3")) + tk.MustQuery("execute stmt using @a").Check(testkit.Rows("3")) + tk.MustQuery(`select @@last_plan_from_cache`).Check(testkit.Rows("1")) + tk.MustQuery("execute stmt using @a").Check(testkit.Rows("3")) + tkProcess := tk.Session().ShowProcess() + ps := []*util.ProcessInfo{tkProcess} + tk.Session().SetSessionManager(&testkit.MockSessionManager{PS: ps}) + tk.MustQuery(fmt.Sprintf("explain for connection %d", tkProcess.ID)).Check(testkit.Rows( + "StreamAgg_17 1.00 root funcs:count(Column#7)->Column#5", + "└─IndexReader_18 1.00 root index:StreamAgg_9", + " └─StreamAgg_9 1.00 cop[tikv] funcs:count(1)->Column#7", + " └─IndexRangeScan_16 99.90 cop[tikv] table:t1, index:idx2(c1, c2) range:[\"0xfff\" -inf,\"0xfff\" +inf], keep order:false, stats:pseudo")) +} + +func TestInvisibleIndex(t *testing.T) { + store := testkit.CreateMockStore(t) + tk := testkit.NewTestKit(t, store) + tk.MustExec("use test") + tk.MustExec("CREATE TABLE t1 ( a INT, KEY( a ) INVISIBLE );") + tk.MustExec("INSERT INTO t1 VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);") + tk.MustQuery(`EXPLAIN SELECT a FROM t1;`).Check( + testkit.Rows( + `TableReader_5 10000.00 root data:TableFullScan_4`, + `└─TableFullScan_4 10000.00 cop[tikv] table:t1 keep order:false, stats:pseudo`)) + tk.MustExec("set session tidb_opt_use_invisible_indexes=on;") + tk.MustQuery(`EXPLAIN SELECT a FROM t1;`).Check( + testkit.Rows( + `IndexReader_7 10000.00 root index:IndexFullScan_6`, + `└─IndexFullScan_6 10000.00 cop[tikv] table:t1, index:a(a) keep order:false, stats:pseudo`)) +} + +func TestRangeDerivation(t *testing.T) { + store := testkit.CreateMockStore(t) + tk := testkit.NewTestKit(t, store) + tk.MustExec("use test") + tk.MustExec(`set @@tidb_opt_fix_control = "54337:ON"`) + tk.MustExec("create table t1 (a1 int, b1 int, c1 int, primary key pkx (a1,b1));") + tk.MustExec("create table t1char (a1 
char(5), b1 char(5), c1 int, primary key pkx (a1,b1));") + tk.MustExec("create table t(a int, b int, c int, primary key(a,b));") + tk.MustExec("create table tuk (a int, b int, c int, unique key (a, b, c));") + tk.MustExec("set @@session.tidb_regard_null_as_point=false;") + + var input []string + var output []struct { + SQL string + Plan []string + } + indexRangeSuiteData := GetIndexRangeSuiteData() + indexRangeSuiteData.LoadTestCases(t, &input, &output) + indexRangeSuiteData.LoadTestCases(t, &input, &output) + for i, sql := range input { + plan := tk.MustQuery("explain format = 'brief' " + sql) + testdata.OnRecord(func() { + output[i].SQL = sql + output[i].Plan = testdata.ConvertRowsToStrings(plan.Rows()) + }) + plan.Check(testkit.Rows(output[i].Plan...)) + } +} + +func TestRowFunctionMatchTheIndexRangeScan(t *testing.T) { + store := testkit.CreateMockStore(t) + tk := testkit.NewTestKit(t, store) + tk.MustExec("use test") + tk.MustExec(`set @@tidb_opt_fix_control = "54337:ON"`) + tk.MustExec(`CREATE TABLE t1 (k1 int , k2 int, k3 int, index pk1(k1, k2))`) + tk.MustExec(`create table t2 (k1 int, k2 int)`) + var input []string + var output []struct { + SQL string + Plan []string + Result []string + } + integrationSuiteData := GetIntegrationSuiteData() + integrationSuiteData.LoadTestCases(t, &input, &output) + for i, tt := range input { + testdata.OnRecord(func() { + output[i].SQL = tt + output[i].Plan = testdata.ConvertRowsToStrings(tk.MustQuery("explain format='brief' " + tt).Rows()) + output[i].Result = testdata.ConvertRowsToStrings(tk.MustQuery(tt).Sort().Rows()) + }) + tk.MustQuery("explain format='brief' " + tt).Check(testkit.Rows(output[i].Plan...)) + tk.MustQuery(tt).Sort().Check(testkit.Rows(output[i].Result...)) + } +} + +func TestRangeIntersection(t *testing.T) { + store := testkit.CreateMockStore(t) + tk := testkit.NewTestKit(t, store) + tk.MustExec("use test") + tk.MustExec(`set @@tidb_opt_fix_control = "54337:ON"`) + tk.MustExec("create table t1 (a1 int, 
b1 int, c1 int, key pkx (a1,b1));") + tk.MustExec("insert into t1 values (1,1,1);") + tk.MustExec("insert into t1 values (null,1,1);") + tk.MustExec("insert into t1 values (1,null,1);") + tk.MustExec("insert into t1 values (1,1,null);") + tk.MustExec("insert into t1 values (1,10,1);") + tk.MustExec("insert into t1 values (10,20,1);") + tk.MustExec("insert into t1 select a1+1,b1,c1+1 from t1;") + tk.MustExec("insert into t1 select a1,b1+1,c1+1 from t1;") + tk.MustExec("insert into t1 select a1-1,b1+1,c1+1 from t1;") + tk.MustExec("insert into t1 select a1+2,b1+2,c1+2 from t1;") + tk.MustExec("insert into t1 select a1+2,b1-2,c1+2 from t1;") + tk.MustExec("insert into t1 select a1+2,b1-1,c1+2 from t1;") + tk.MustExec("insert into t1 select null,b1,c1+1 from t1;") + tk.MustExec("insert into t1 select a1,null,c1+1 from t1;") + + tk.MustExec("create table t11 (a1 int, b1 int, c1 int);") + tk.MustExec("insert into t11 select * from t1;") + + tk.MustExec("CREATE TABLE `tablename` (`primary_key` varbinary(1024) NOT NULL,`secondary_key` varbinary(1024) NOT NULL,`timestamp` bigint(20) NOT NULL,`value` mediumblob DEFAULT NULL,PRIMARY KEY PKK (`primary_key`,`secondary_key`,`timestamp`));") + + tk.MustExec("create table t(a int, b int, c int, key PKK(a,b,c));") + tk.MustExec("create table tt(a int, b int, c int, primary key PKK(a,b,c));") + tk.MustExec("insert into t select * from t1;") + tk.MustExec("insert into tt select * from t1 where a1 is not null and b1 is not null and c1 is not null;") + tk.MustExec("CREATE TABLE tnull (a INT, KEY PK(a));") + tk.MustExec("create table tkey_string(id1 CHAR(16) not null, id2 VARCHAR(16) not null, id3 BINARY(16) not null, id4 VARBINARY(16) not null, id5 BLOB not null, id6 TEXT not null, id7 ENUM('x-small', 'small', 'medium', 'large', 'x-large') not null, id8 SET ('a', 'b', 'c', 'd') not null, name varchar(16), primary key(id1, id2, id3, id4, id7, id8)) PARTITION BY KEY(id7) partitions 4;") + tk.MustExec("INSERT INTO tkey_string 
VALUES('huaian','huaian','huaian','huaian','huaian','huaian','x-small','a','linpin');") + tk.MustExec("INSERT INTO tkey_string VALUES('nanjing','nanjing','nanjing','nanjing','nanjing','nanjing','small','b','linpin');") + tk.MustExec("INSERT INTO tkey_string VALUES('zhenjiang','zhenjiang','zhenjiang','zhenjiang','zhenjiang','zhenjiang','medium','c','linpin');") + tk.MustExec("INSERT INTO tkey_string VALUES('suzhou','suzhou','suzhou','suzhou','suzhou','suzhou','large','d','linpin');") + tk.MustExec("INSERT INTO tkey_string VALUES('wuxi','wuxi','wuxi','wuxi','wuxi','wuxi','x-large','a','linpin');") + + var input []string + var output []struct { + SQL string + Plan []string + Result []string + } + indexRangeSuiteData := GetIndexRangeSuiteData() + indexRangeSuiteData.LoadTestCases(t, &input, &output) + indexRangeSuiteData.LoadTestCases(t, &input, &output) + for i, sql := range input { + plan := tk.MustQuery("explain format = 'brief' " + sql) + testdata.OnRecord(func() { + output[i].SQL = sql + output[i].Plan = testdata.ConvertRowsToStrings(plan.Rows()) + output[i].Result = testdata.ConvertRowsToStrings(tk.MustQuery(sql).Sort().Rows()) + }) + plan.Check(testkit.Rows(output[i].Plan...)) + tk.MustQuery(sql).Sort().Check(testkit.Rows(output[i].Result...)) + } +} + +func TestOrderedIndexWithIsNull(t *testing.T) { + store := testkit.CreateMockStore(t) + tk := testkit.NewTestKit(t, store) + tk.MustExec("use test") + tk.MustExec("CREATE TABLE t1 (a int key, b int, c int, index (b, c));") + tk.MustQuery("explain select a from t1 where b is null order by c").Check(testkit.Rows( + "Projection_6 10.00 root test.t1.a", + "└─IndexReader_12 10.00 root index:IndexRangeScan_11", + " └─IndexRangeScan_11 10.00 cop[tikv] table:t1, index:b(b, c) range:[NULL,NULL], keep order:true, stats:pseudo", + )) + // https://github.com/pingcap/tidb/issues/56116 + tk.MustExec("create table t2(id bigint(20) DEFAULT NULL, UNIQUE KEY index_on_id (id))") + tk.MustExec("insert into t2 values (), (), ()") + 
tk.MustExec("analyze table t2") + tk.MustQuery("explain select count(*) from t2 where id is null;").Check(testkit.Rows( + "StreamAgg_17 1.00 root funcs:count(Column#5)->Column#3", + "└─IndexReader_18 1.00 root index:StreamAgg_9", + " └─StreamAgg_9 1.00 cop[tikv] funcs:count(1)->Column#5", + " └─IndexRangeScan_16 3.00 cop[tikv] table:t2, index:index_on_id(id) range:[NULL,NULL], keep order:false")) +} diff --git a/pkg/util/ranger/types.go b/pkg/util/ranger/types.go new file mode 100644 index 0000000000000..8d0920f0be5fc --- /dev/null +++ b/pkg/util/ranger/types.go @@ -0,0 +1,567 @@ +// Copyright 2017 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ranger + +import ( + "fmt" + "math" + "strings" + "time" + "unsafe" + + "github.com/pingcap/errors" + "github.com/pingcap/tidb/pkg/errctx" + "github.com/pingcap/tidb/pkg/kv" + "github.com/pingcap/tidb/pkg/planner/context" + "github.com/pingcap/tidb/pkg/types" + "github.com/pingcap/tidb/pkg/util/codec" + "github.com/pingcap/tidb/pkg/util/collate" + rangerctx "github.com/pingcap/tidb/pkg/util/ranger/context" +) + +// MutableRanges represents a range may change after it is created. +// It's mainly designed for plan-cache, since some ranges in a cached plan have to be rebuild when reusing. +type MutableRanges interface { + // Range returns the underlying range values. + Range() Ranges + // Rebuild rebuilds the underlying ranges again. 
+ Rebuild(sctx context.PlanContext) error + // CloneForPlanCache clones the MutableRanges for plan cache. + CloneForPlanCache() MutableRanges +} + +// Ranges implements the MutableRanges interface for range array. +type Ranges []*Range + +// Range returns the range array. +func (rs Ranges) Range() Ranges { + return rs +} + +// Rebuild rebuilds this range. +func (Ranges) Rebuild(context.PlanContext) error { + return nil +} + +// CloneForPlanCache clones the MutableRanges for plan cache. +func (rs Ranges) CloneForPlanCache() MutableRanges { + if rs == nil { + return nil + } + cloned := make([]*Range, 0, len(rs)) + for _, r := range rs { + cloned = append(cloned, r.Clone()) + } + return Ranges(cloned) +} + +// MemUsage gets the memory usage of ranges. +func (rs Ranges) MemUsage() (sum int64) { + for _, ran := range rs { + sum += ran.MemUsage() + } + return +} + +// Range represents a range generated in physical plan building phase. +type Range struct { + LowVal []types.Datum // Low value is exclusive. + HighVal []types.Datum // High value is exclusive. + Collators []collate.Collator + LowExclude bool + HighExclude bool +} + +// Width returns the width of this range. +func (ran *Range) Width() int { + return len(ran.LowVal) +} + +// Clone clones a Range. +func (ran *Range) Clone() *Range { + if ran == nil { + return nil + } + newRange := &Range{ + LowVal: make([]types.Datum, 0, len(ran.LowVal)), + HighVal: make([]types.Datum, 0, len(ran.HighVal)), + LowExclude: ran.LowExclude, + HighExclude: ran.HighExclude, + } + for i, length := 0, len(ran.LowVal); i < length; i++ { + newRange.LowVal = append(newRange.LowVal, ran.LowVal[i]) + } + for i, length := 0, len(ran.HighVal); i < length; i++ { + newRange.HighVal = append(newRange.HighVal, ran.HighVal[i]) + } + newRange.Collators = append(newRange.Collators, ran.Collators...) + return newRange +} + +// IsPoint returns if the range is a point. 
+func (ran *Range) IsPoint(sctx *rangerctx.RangerContext) bool { + return ran.isPoint(sctx.TypeCtx, sctx.RegardNULLAsPoint) +} + +func (ran *Range) isPoint(tc types.Context, regardNullAsPoint bool) bool { + if len(ran.LowVal) != len(ran.HighVal) { + return false + } + for i := range ran.LowVal { + a := ran.LowVal[i] + b := ran.HighVal[i] + if a.Kind() == types.KindMinNotNull || b.Kind() == types.KindMaxValue { + return false + } + cmp, err := a.Compare(tc, &b, ran.Collators[i]) + if err != nil { + return false + } + if cmp != 0 { + return false + } + + if a.IsNull() && b.IsNull() { // [NULL, NULL] + if !regardNullAsPoint { + return false + } + } + } + return !ran.LowExclude && !ran.HighExclude +} + +// IsOnlyNull checks if the range has [NULL, NULL] or [NULL NULL, NULL NULL] range. +func (ran *Range) IsOnlyNull() bool { + for i := range ran.LowVal { + a := ran.LowVal[i] + b := ran.HighVal[i] + if !(a.IsNull() && b.IsNull()) { + return false + } + } + return true +} + +// IsPointNonNullable returns if the range is a point without NULL. +func (ran *Range) IsPointNonNullable(tc types.Context) bool { + return ran.isPoint(tc, false) +} + +// IsPointNullable returns if the range is a point. 
+// TODO: unify the parameter type with IsPointNullable and IsPoint +func (ran *Range) IsPointNullable(tc types.Context) bool { + return ran.isPoint(tc, true) +} + +// IsFullRange check if the range is full scan range +func (ran *Range) IsFullRange(unsignedIntHandle bool) bool { + if unsignedIntHandle { + if len(ran.LowVal) != 1 || len(ran.HighVal) != 1 { + return false + } + lowValRawString := formatDatum(ran.LowVal[0], true) + highValRawString := formatDatum(ran.HighVal[0], false) + return lowValRawString == "0" && highValRawString == "+inf" + } + if len(ran.LowVal) != len(ran.HighVal) { + return false + } + for i := range ran.LowVal { + lowValRawString := formatDatum(ran.LowVal[i], true) + highValRawString := formatDatum(ran.HighVal[i], false) + if ("-inf" != lowValRawString && "NULL" != lowValRawString) || + ("+inf" != highValRawString && "NULL" != highValRawString) || + ("NULL" == lowValRawString && "NULL" == highValRawString) { + return false + } + } + return true +} + +// HasFullRange checks if any range in the slice is a full range. +func HasFullRange(ranges []*Range, unsignedIntHandle bool) bool { + for _, ran := range ranges { + if ran.IsFullRange(unsignedIntHandle) { + return true + } + } + return false +} + +func dealWithRedact(input string, redact string) string { + if input == "-inf" || input == "+inf" { + return input + } + if redact == errors.RedactLogDisable { + return input + } else if redact == errors.RedactLogEnable { + return "?" + } + return fmt.Sprintf("‹%s›", input) +} + +// String implements the Stringer interface. +// don't use it in the product. +func (ran *Range) String() string { + return ran.string(errors.RedactLogDisable) +} + +// Redact is to print the range with redacting sensitive data. +func (ran *Range) Redact(redact string) string { + return ran.string(redact) +} + +// String implements the Stringer interface. 
+func (ran *Range) string(redact string) string { + lowStrs := make([]string, 0, len(ran.LowVal)) + for _, d := range ran.LowVal { + lowStrs = append(lowStrs, dealWithRedact(formatDatum(d, true), redact)) + } + highStrs := make([]string, 0, len(ran.LowVal)) + for _, d := range ran.HighVal { + highStrs = append(highStrs, dealWithRedact(formatDatum(d, false), redact)) + } + l, r := "[", "]" + if ran.LowExclude { + l = "(" + } + if ran.HighExclude { + r = ")" + } + return l + strings.Join(lowStrs, " ") + "," + strings.Join(highStrs, " ") + r +} + +// Encode encodes the range to its encoded value. +func (ran *Range) Encode(ec errctx.Context, loc *time.Location, lowBuffer, highBuffer []byte) ([]byte, []byte, error) { + var err error + lowBuffer, err = codec.EncodeKey(loc, lowBuffer[:0], ran.LowVal...) + err = ec.HandleError(err) + if err != nil { + return nil, nil, err + } + if ran.LowExclude { + lowBuffer = kv.Key(lowBuffer).PrefixNext() + } + highBuffer, err = codec.EncodeKey(loc, highBuffer[:0], ran.HighVal...) + err = ec.HandleError(err) + if err != nil { + return nil, nil, err + } + if !ran.HighExclude { + highBuffer = kv.Key(highBuffer).PrefixNext() + } + return lowBuffer, highBuffer, nil +} + +// PrefixEqualLen tells you how long the prefix of the range is a point. +// e.g. If this range is (1 2 3, 1 2 +inf), then the return value is 2. +func (ran *Range) PrefixEqualLen(tc types.Context) (int, error) { + // Here, len(ran.LowVal) always equal to len(ran.HighVal) + for i := 0; i < len(ran.LowVal); i++ { + cmp, err := ran.LowVal[i].Compare(tc, &ran.HighVal[i], ran.Collators[i]) + if err != nil { + return 0, errors.Trace(err) + } + if cmp != 0 { + return i, nil + } + } + return len(ran.LowVal), nil +} + +// EmptyRangeSize is the size of empty range. +const EmptyRangeSize = int64(unsafe.Sizeof(Range{})) + +// MemUsage gets the memory usage of range. +func (ran *Range) MemUsage() (sum int64) { + // 16 is the size of Collator interface. 
+ sum = EmptyRangeSize + int64(len(ran.Collators))*16 + for _, val := range ran.LowVal { + sum += val.MemUsage() + } + for _, val := range ran.HighVal { + sum += val.MemUsage() + } + // We ignore size of collator currently. + return sum +} + +func formatDatum(d types.Datum, isLeftSide bool) string { + switch d.Kind() { + case types.KindNull: + return "NULL" + case types.KindMinNotNull: + return "-inf" + case types.KindMaxValue: + return "+inf" + case types.KindInt64: + switch d.GetInt64() { + case math.MinInt64: + if isLeftSide { + return "-inf" + } + case math.MaxInt64: + if !isLeftSide { + return "+inf" + } + } + case types.KindUint64: + if d.GetUint64() == math.MaxUint64 && !isLeftSide { + return "+inf" + } + case types.KindBytes: + return fmt.Sprintf("%q", d.GetValue()) + case types.KindString: + return fmt.Sprintf("%q", d.GetValue()) + case types.KindMysqlEnum, types.KindMysqlSet, + types.KindMysqlJSON, types.KindBinaryLiteral, types.KindMysqlBit: + return fmt.Sprintf("\"%v\"", d.GetValue()) + } + return fmt.Sprintf("%v", d.GetValue()) +} + +// compareLexicographically compares two bounds from two ranges and returns 0, 1, -1 +// for equal, greater than or less than respectively. It gets the two bounds, +// collations and if each bound is open (open1, open2) or closed. In addition, +// it also gets if each bound is lower or upper (low1, low2). +// Lower bounds logically can be extended with -infinity and upper bounds can be extended with +infinity. 
+func compareLexicographically(tc types.Context, bound1, bound2 []types.Datum, collators []collate.Collator, + open1, open2, low1, low2 bool) (int, error) { + n1 := len(bound1) + n2 := len(bound2) + n := min(n1, n2) + + for i := 0; i < n; i++ { + cmp, err := bound1[i].Compare(tc, &bound2[i], collators[i]) + if err != nil { + return 0, err + } + if cmp != 0 { + return cmp, nil + } + } + + // Handle interval types + if n1 == n2 { + switch { + case !open1 && !open2: + return 0, nil + case open1 == open2: + if low1 == low2 { + return 0, nil + } else if low1 { + return 1, nil + } else { + return -1, nil + } + case open1: + if low1 { + return 1, nil + } + return -1, nil + case open2: + if low2 { + return -1, nil + } + return 1, nil + } + } + + // Unequal length ranges. We use -infinity for lower bounds and +infinity for upper bounds. + if n1 < n2 { + if low1 { + // -infinity is less than anything + return -1, nil + } + // +infinity is higher than anything + return 1, nil + } + // n1 > n2 + if low2 { + // anything is larger than -infinity. + return 1, nil + } + // anything is less than +infinity + return -1, nil +} + +// Check if a list of Datum is a prefix of another list of Datum. This is useful for checking if +// lower/upper bound of a range is a subset of another. +func prefix(tc types.Context, superValue []types.Datum, supValue []types.Datum, length int, collators []collate.Collator) bool { + for i := 0; i < length; i++ { + cmp, err := superValue[i].Compare(tc, &supValue[i], collators[i]) + if (err != nil) || (cmp != 0) { + return false + } + } + return true +} + +// Subset checks if a list of ranges(rs) is a subset of another list of ranges(superRanges). +// This is true if every range in the first list is a subset of any +// range in the second list. Also, we check if all elements of superRanges are covered. 
+func (rs Ranges) Subset(tc types.Context, superRanges Ranges) bool { + var subset bool + superRangesCovered := make([]bool, len(superRanges)) + if len(rs) == 0 { + return len(superRanges) == 0 + } else if len(superRanges) == 0 { + // unrestricted superRanges and restricted rs + return true + } + + for _, subRange := range rs { + subset = false + for i, superRange := range superRanges { + if subRange.Subset(tc, superRange) { + subset = true + superRangesCovered[i] = true + break + } + } + if !subset { + return false + } + } + for i := 0; i < len(superRangesCovered); i++ { + if !superRangesCovered[i] { + return false + } + } + + return true +} + +func checkCollators(ran1 *Range, ran2 *Range, length int) bool { + // Make sure both ran and superRange have the same collations. + // The current code path for this function always will have same collation + // for ran and superRange. It is added here for future + // use of the function. + for i := 0; i < length; i++ { + if ran1.Collators[i] != ran2.Collators[i] { + return false + } + } + return true +} + +// Subset for Range type, check if range(ran) is a subset of another range(otherRange). +// This is done by: +// - Both ran and otherRange have the same collators. This is not needed for the current code path. +// But, it is used here for future use of the function. +// - Checking if the lower/upper bound of otherRange covers the corresponding lower/upper bound of ran. +// Thus include checking open/closed inetrvals. +func (ran *Range) Subset(tc types.Context, otherRange *Range) bool { + if len(ran.LowVal) < len(otherRange.LowVal) { + return false + } + + if !checkCollators(ran, otherRange, len(otherRange.LowVal)) { + return false + } + + // Either otherRange is closed or both ranges have the same open/close setting. 
+ lowExcludeOK := !otherRange.LowExclude || ran.LowExclude == otherRange.LowExclude + highExcludeOK := !otherRange.HighExclude || ran.HighExclude == otherRange.HighExclude + if !lowExcludeOK || !highExcludeOK { + return false + } + + return prefix(tc, otherRange.LowVal, ran.LowVal, len(otherRange.LowVal), ran.Collators) && + prefix(tc, otherRange.HighVal, ran.HighVal, len(otherRange.LowVal), ran.Collators) +} + +// IntersectRange computes intersection between two ranges. err is set of something went wrong +// during comparison. +func (ran *Range) IntersectRange(tc types.Context, otherRange *Range) (*Range, error) { + intersectLength := max(len(ran.LowVal), len(otherRange.LowVal)) + result := &Range{ + LowVal: make([]types.Datum, 0, intersectLength), + HighVal: make([]types.Datum, 0, intersectLength), + Collators: make([]collate.Collator, 0, intersectLength), + } + + if len(ran.LowVal) > len(otherRange.LowVal) { + result.Collators = ran.Collators + } else { + result.Collators = otherRange.Collators + } + + lowVsHigh, err := compareLexicographically(tc, ran.LowVal, otherRange.HighVal, ran.Collators, + ran.LowExclude, otherRange.HighExclude, true, false) + if err != nil { + return &Range{}, err + } + if lowVsHigh == 1 { + return nil, nil + } + + lowVsHigh, err = compareLexicographically(tc, otherRange.LowVal, ran.HighVal, ran.Collators, + otherRange.LowExclude, ran.HighExclude, true, false) + if err != nil { + return &Range{}, err + } + if lowVsHigh == 1 { + return nil, nil + } + + lowVsLow, err := compareLexicographically(tc, ran.LowVal, otherRange.LowVal, + ran.Collators, ran.LowExclude, otherRange.LowExclude, true, true) + if err != nil { + return &Range{}, err + } + if lowVsLow == -1 { + result.LowVal = otherRange.LowVal + result.LowExclude = otherRange.LowExclude + } else { + result.LowVal = ran.LowVal + result.LowExclude = ran.LowExclude + } + + highVsHigh, err := compareLexicographically(tc, ran.HighVal, otherRange.HighVal, + ran.Collators, ran.HighExclude, 
otherRange.HighExclude, false, false) + if err != nil { + return &Range{}, err + } + if highVsHigh == 1 { + result.HighVal = otherRange.HighVal + result.HighExclude = otherRange.HighExclude + } else { + result.HighVal = ran.HighVal + result.HighExclude = ran.HighExclude + } + return result, nil +} + +// IntersectRanges computes pairwise intersection between each element in rs and otherRangeList. +func (rs Ranges) IntersectRanges(tc types.Context, otherRanges Ranges) Ranges { + result := Ranges{} + for _, rsRange := range rs { + for _, otherRange := range otherRanges { + subsetLength := min(len(rsRange.LowVal), len(otherRange.LowVal)) + if !checkCollators(rsRange, otherRange, subsetLength) { + return nil + } + oneIntersection, err := rsRange.IntersectRange(tc, otherRange) + if err != nil { + return nil + } + if oneIntersection != nil { + result = append(result, oneIntersection) + } + } + } + return result +} From 8f0be2721429de559586e7bea848ffa6dae3e94e Mon Sep 17 00:00:00 2001 From: Weizhen Wang Date: Wed, 5 Feb 2025 17:22:04 +0800 Subject: [PATCH 2/2] update Signed-off-by: Weizhen Wang --- pkg/planner/cardinality/row_count_index.go | 585 ------------------ pkg/planner/core/casetest/index/index_test.go | 232 ------- pkg/util/ranger/types.go | 567 ----------------- statistics/index.go | 6 +- statistics/integration_test.go | 15 + util/ranger/types.go | 12 + 6 files changed, 31 insertions(+), 1386 deletions(-) delete mode 100644 pkg/planner/cardinality/row_count_index.go delete mode 100644 pkg/planner/core/casetest/index/index_test.go delete mode 100644 pkg/util/ranger/types.go diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go deleted file mode 100644 index 68ea1dc13d3e5..0000000000000 --- a/pkg/planner/cardinality/row_count_index.go +++ /dev/null @@ -1,585 +0,0 @@ -// Copyright 2023 PingCAP, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package cardinality - -import ( - "bytes" - "math" - "slices" - "strings" - "time" - - "github.com/pingcap/errors" - "github.com/pingcap/failpoint" - "github.com/pingcap/tidb/pkg/kv" - "github.com/pingcap/tidb/pkg/planner/context" - "github.com/pingcap/tidb/pkg/planner/util/debugtrace" - "github.com/pingcap/tidb/pkg/planner/util/fixcontrol" - "github.com/pingcap/tidb/pkg/sessionctx/stmtctx" - "github.com/pingcap/tidb/pkg/statistics" - "github.com/pingcap/tidb/pkg/types" - "github.com/pingcap/tidb/pkg/util/chunk" - "github.com/pingcap/tidb/pkg/util/codec" - "github.com/pingcap/tidb/pkg/util/collate" - "github.com/pingcap/tidb/pkg/util/mathutil" - "github.com/pingcap/tidb/pkg/util/ranger" -) - -// GetRowCountByIndexRanges estimates the row count by a slice of Range. 
-func GetRowCountByIndexRanges(sctx context.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, err error) { - var name string - if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { - debugtrace.EnterContextCommon(sctx) - debugTraceGetRowCountInput(sctx, idxID, indexRanges) - defer func() { - debugtrace.RecordAnyValuesWithNames(sctx, "Name", name, "Result", result) - debugtrace.LeaveContextCommon(sctx) - }() - } - sc := sctx.GetSessionVars().StmtCtx - idx := coll.GetIdx(idxID) - colNames := make([]string, 0, 8) - if idx != nil { - if idx.Info != nil { - name = idx.Info.Name.O - for _, col := range idx.Info.Columns { - colNames = append(colNames, col.Name.O) - } - } - } - recordUsedItemStatsStatus(sctx, idx, coll.PhysicalID, idxID) - if statistics.IndexStatsIsInvalid(sctx, idx, coll, idxID) { - colsLen := -1 - if idx != nil && idx.Info.Unique { - colsLen = len(idx.Info.Columns) - } - result, err = getPseudoRowCountByIndexRanges(sc.TypeCtx(), indexRanges, float64(coll.RealtimeCount), colsLen) - if err == nil && sc.EnableOptimizerCETrace && idx != nil { - ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats-Pseudo", uint64(result)) - } - return result, err - } - realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx) - if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { - debugtrace.RecordAnyValuesWithNames(sctx, - "Histogram NotNull Count", idx.Histogram.NotNullCount(), - "TopN total count", idx.TopN.TotalCount(), - "Increase Factor", idx.GetIncreaseFactor(realtimeCnt), - ) - } - if idx.CMSketch != nil && idx.StatsVer == statistics.Version1 { - result, err = getIndexRowCountForStatsV1(sctx, coll, idxID, indexRanges) - } else { - result, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount) - } - if sc.EnableOptimizerCETrace { - ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result)) - } - return result, 
errors.Trace(err) -} - -func getIndexRowCountForStatsV1(sctx context.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (float64, error) { - sc := sctx.GetSessionVars().StmtCtx - debugTrace := sc.EnableOptimizerDebugTrace - if debugTrace { - debugtrace.EnterContextCommon(sctx) - defer debugtrace.LeaveContextCommon(sctx) - } - idx := coll.GetIdx(idxID) - totalCount := float64(0) - for _, ran := range indexRanges { - if debugTrace { - debugTraceStartEstimateRange(sctx, ran, nil, nil, totalCount) - } - rangePosition := getOrdinalOfRangeCond(sc, ran) - var rangeVals []types.Datum - // Try to enum the last range values. - if rangePosition != len(ran.LowVal) { - rangeVals = statistics.EnumRangeValues(ran.LowVal[rangePosition], ran.HighVal[rangePosition], ran.LowExclude, ran.HighExclude) - if rangeVals != nil { - rangePosition++ - } - } - // If first one is range, just use the previous way to estimate; if it is [NULL, NULL] range - // on single-column index, use previous way as well, because CMSketch does not contain null - // values in this case. - if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) { - realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx) - count, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount) - if err != nil { - return 0, errors.Trace(err) - } - if debugTrace { - debugTraceEndEstimateRange(sctx, count, debugTraceRange) - } - totalCount += count - continue - } - var selectivity float64 - // use CM Sketch to estimate the equal conditions - if rangeVals == nil { - bytes, err := codec.EncodeKey(sc.TimeZone(), nil, ran.LowVal[:rangePosition]...) - err = sc.HandleError(err) - if err != nil { - return 0, errors.Trace(err) - } - selectivity, err = getEqualCondSelectivity(sctx, coll, idx, bytes, rangePosition, ran) - if err != nil { - return 0, errors.Trace(err) - } - } else { - bytes, err := codec.EncodeKey(sc.TimeZone(), nil, ran.LowVal[:rangePosition-1]...) 
- err = sc.HandleError(err) - if err != nil { - return 0, errors.Trace(err) - } - prefixLen := len(bytes) - for _, val := range rangeVals { - bytes = bytes[:prefixLen] - bytes, err = codec.EncodeKey(sc.TimeZone(), bytes, val) - err = sc.HandleError(err) - if err != nil { - return 0, err - } - res, err := getEqualCondSelectivity(sctx, coll, idx, bytes, rangePosition, ran) - if err != nil { - return 0, errors.Trace(err) - } - selectivity += res - } - } - // use histogram to estimate the range condition - if rangePosition != len(ran.LowVal) { - rang := ranger.Range{ - LowVal: []types.Datum{ran.LowVal[rangePosition]}, - LowExclude: ran.LowExclude, - HighVal: []types.Datum{ran.HighVal[rangePosition]}, - HighExclude: ran.HighExclude, - Collators: []collate.Collator{ran.Collators[rangePosition]}, - } - var count float64 - var err error - colUniqueIDs := coll.Idx2ColUniqueIDs[idxID] - var colUniqueID int64 - if rangePosition >= len(colUniqueIDs) { - colUniqueID = -1 - } else { - colUniqueID = colUniqueIDs[rangePosition] - } - // prefer index stats over column stats - if idxIDs, ok := coll.ColUniqueID2IdxIDs[colUniqueID]; ok && len(idxIDs) > 0 { - idxID := idxIDs[0] - count, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang}) - } else { - count, err = GetRowCountByColumnRanges(sctx, coll, colUniqueID, []*ranger.Range{&rang}) - } - if err != nil { - return 0, errors.Trace(err) - } - selectivity = selectivity * count / idx.TotalRowCount() - } - count := selectivity * idx.TotalRowCount() - if debugTrace { - debugTraceEndEstimateRange(sctx, count, debugTraceRange) - } - totalCount += count - } - if totalCount > idx.TotalRowCount() { - totalCount = idx.TotalRowCount() - } - return totalCount, nil -} - -// isSingleColIdxNullRange checks if a range is [NULL, NULL] on a single-column index. 
-func isSingleColIdxNullRange(idx *statistics.Index, ran *ranger.Range) bool { - if len(idx.Info.Columns) > 1 { - return false - } - l, h := ran.LowVal[0], ran.HighVal[0] - if l.IsNull() && h.IsNull() { - return true - } - return false -} - -// It uses the modifyCount to adjust the influence of modifications on the table. -func getIndexRowCountForStatsV2(sctx context.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, error) { - sc := sctx.GetSessionVars().StmtCtx - debugTrace := sc.EnableOptimizerDebugTrace - if debugTrace { - debugtrace.EnterContextCommon(sctx) - defer debugtrace.LeaveContextCommon(sctx) - } - totalCount := float64(0) - isSingleColIdx := len(idx.Info.Columns) == 1 - for _, indexRange := range indexRanges { - var count float64 - lb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...) - err = sc.HandleError(err) - if err != nil { - return 0, err - } - rb, err := codec.EncodeKey(sc.TimeZone(), nil, indexRange.HighVal...) - err = sc.HandleError(err) - if err != nil { - return 0, err - } - if debugTrace { - debugTraceStartEstimateRange(sctx, indexRange, lb, rb, totalCount) - } - fullLen := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == len(idx.Info.Columns) - if bytes.Equal(lb, rb) { - // case 1: it's a point - if indexRange.LowExclude || indexRange.HighExclude { - if debugTrace { - debugTraceEndEstimateRange(sctx, 0, debugTraceImpossible) - } - continue - } - if fullLen { - // At most 1 in this case. 
- if idx.Info.Unique { - if !indexRange.IsOnlyNull() { - totalCount++ - if debugTrace { - debugTraceEndEstimateRange(sctx, 1, debugTraceUniquePoint) - } - continue - } - totalCount = float64(idx.NullCount) - if debugTrace { - debugTraceEndEstimateRange(sctx, float64(idx.NullCount), debugTraceUniquePoint) - } - continue - } - count = equalRowCountOnIndex(sctx, idx, lb, realtimeRowCount, modifyCount) - // If the current table row count has changed, we should scale the row count accordingly. - count *= idx.GetIncreaseFactor(realtimeRowCount) - if debugTrace { - debugTraceEndEstimateRange(sctx, count, debugTracePoint) - } - totalCount += count - continue - } - } - - // case 2: it's an interval - // The final interval is [low, high) - if indexRange.LowExclude { - lb = kv.Key(lb).PrefixNext() - } - if !indexRange.HighExclude { - rb = kv.Key(rb).PrefixNext() - } - l := types.NewBytesDatum(lb) - r := types.NewBytesDatum(rb) - lowIsNull := bytes.Equal(lb, nullKeyBytes) - if isSingleColIdx && lowIsNull { - count += float64(idx.Histogram.NullCount) - } - expBackoffSuccess := false - // Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything. - // If the first column's range is point. 
- if rangePosition := getOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer >= statistics.Version2 && coll != nil { - var expBackoffSel float64 - expBackoffSel, expBackoffSuccess, err = expBackoffEstimation(sctx, idx, coll, indexRange) - if err != nil { - return 0, err - } - if expBackoffSuccess { - expBackoffCnt := expBackoffSel * idx.TotalRowCount() - - upperLimit := expBackoffCnt - // Use the multi-column stats to calculate the max possible row count of [l, r) - if idx.Histogram.Len() > 0 { - _, lowerBkt, _, _ := idx.Histogram.LocateBucket(sctx, l) - _, upperBkt, _, _ := idx.Histogram.LocateBucket(sctx, r) - if debugTrace { - statistics.DebugTraceBuckets(sctx, &idx.Histogram, []int{lowerBkt - 1, upperBkt}) - } - // Use Count of the Bucket before l as the lower bound. - preCount := float64(0) - if lowerBkt > 0 { - preCount = float64(idx.Histogram.Buckets[lowerBkt-1].Count) - } - // Use Count of the Bucket where r exists as the upper bound. - upperCnt := float64(idx.Histogram.Buckets[upperBkt].Count) - upperLimit = upperCnt - preCount - upperLimit += float64(idx.TopN.BetweenCount(sctx, lb, rb)) - } - - // If the result of exponential backoff strategy is larger than the result from multi-column stats, - // use the upper limit from multi-column histogram instead. - if expBackoffCnt > upperLimit { - expBackoffCnt = upperLimit - } - count += expBackoffCnt - } - } - if !expBackoffSuccess { - count += betweenRowCountOnIndex(sctx, idx, l, r) - } - - // If the current table row count has changed, we should scale the row count accordingly. 
- increaseFactor := idx.GetIncreaseFactor(realtimeRowCount) - count *= increaseFactor - - // handling the out-of-range part - if (outOfRangeOnIndex(idx, l) && !(isSingleColIdx && lowIsNull)) || outOfRangeOnIndex(idx, r) { - histNDV := idx.NDV - // Exclude the TopN in Stats Version 2 - if idx.StatsVer == statistics.Version2 { - c := coll.GetCol(idx.Histogram.ID) - // If this is single column of a multi-column index - use the column's NDV rather than index NDV - isSingleColRange := len(indexRange.LowVal) == len(indexRange.HighVal) && len(indexRange.LowVal) == 1 - if isSingleColRange && !isSingleColIdx && c != nil && c.Histogram.NDV > 0 { - histNDV = c.Histogram.NDV - int64(c.TopN.Num()) - } else { - histNDV -= int64(idx.TopN.Num()) - } - } - count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount, histNDV, increaseFactor) - } - - if debugTrace { - debugTraceEndEstimateRange(sctx, count, debugTraceRange) - } - totalCount += count - } - allowZeroEst := fixcontrol.GetBoolWithDefault( - sctx.GetSessionVars().GetOptimizerFixControlMap(), - fixcontrol.Fix47400, - false, - ) - if allowZeroEst { - totalCount = mathutil.Clamp(totalCount, 0, float64(realtimeRowCount)) - } else { - // Don't allow the final result to go below 1 row - totalCount = mathutil.Clamp(totalCount, 1, float64(realtimeRowCount)) - } - return totalCount, nil -} - -var nullKeyBytes, _ = codec.EncodeKey(time.UTC, nil, types.NewDatum(nil)) - -func equalRowCountOnIndex(sctx context.PlanContext, idx *statistics.Index, b []byte, realtimeRowCount, modifyCount int64) (result float64) { - if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { - debugtrace.EnterContextCommon(sctx) - debugtrace.RecordAnyValuesWithNames(sctx, "Encoded Value", b) - defer func() { - debugtrace.RecordAnyValuesWithNames(sctx, "Result", result) - debugtrace.LeaveContextCommon(sctx) - }() - } - if len(idx.Info.Columns) == 1 { - if bytes.Equal(b, nullKeyBytes) { - return float64(idx.Histogram.NullCount) - } - } - val := 
types.NewBytesDatum(b) - if idx.StatsVer < statistics.Version2 { - if idx.Histogram.NDV > 0 && outOfRangeOnIndex(idx, val) { - return outOfRangeEQSelectivity(sctx, idx.Histogram.NDV, realtimeRowCount, int64(idx.TotalRowCount())) * idx.TotalRowCount() - } - if idx.CMSketch != nil { - return float64(idx.QueryBytes(sctx, b)) - } - histRowCount, _ := idx.Histogram.EqualRowCount(sctx, val, false) - return histRowCount - } - // stats version == 2 - // 1. try to find this value in TopN - if idx.TopN != nil { - count, found := idx.TopN.QueryTopN(sctx, b) - if found { - return float64(count) - } - } - // 2. try to find this value in bucket.Repeat(the last value in every bucket) - histCnt, matched := idx.Histogram.EqualRowCount(sctx, val, true) - if matched { - return histCnt - } - // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) - histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num())) - if histNDV <= 0 { - // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. - if modifyCount == 0 { - return 0 - } - return 1 - } - return idx.Histogram.NotNullCount() / histNDV -} - -// expBackoffEstimation estimate the multi-col cases following the Exponential Backoff. See comment below for details. 
-func expBackoffEstimation(sctx context.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRange *ranger.Range) (sel float64, success bool, err error) { - if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace { - debugtrace.EnterContextCommon(sctx) - defer func() { - debugtrace.RecordAnyValuesWithNames(sctx, - "Result", sel, - "Success", success, - "error", err, - ) - debugtrace.LeaveContextCommon(sctx) - }() - } - tmpRan := []*ranger.Range{ - { - LowVal: make([]types.Datum, 1), - HighVal: make([]types.Datum, 1), - Collators: make([]collate.Collator, 1), - }, - } - colsIDs := coll.Idx2ColUniqueIDs[idx.Histogram.ID] - singleColumnEstResults := make([]float64, 0, len(indexRange.LowVal)) - // The following codes uses Exponential Backoff to reduce the impact of independent assumption. It works like: - // 1. Calc the selectivity of each column. - // 2. Sort them and choose the first 4 most selective filter and the corresponding selectivity is sel_1, sel_2, sel_3, sel_4 where i < j => sel_i < sel_j. - // 3. The final selectivity would be sel_1 * sel_2^{1/2} * sel_3^{1/4} * sel_4^{1/8}. - // This calculation reduced the independence assumption and can work well better than it. 
- for i := 0; i < len(indexRange.LowVal); i++ { - tmpRan[0].LowVal[0] = indexRange.LowVal[i] - tmpRan[0].HighVal[0] = indexRange.HighVal[i] - tmpRan[0].Collators[0] = indexRange.Collators[0] - if i == len(indexRange.LowVal)-1 { - tmpRan[0].LowExclude = indexRange.LowExclude - tmpRan[0].HighExclude = indexRange.HighExclude - } - colID := colsIDs[i] - var ( - count float64 - selectivity float64 - err error - foundStats bool - ) - if !statistics.ColumnStatsIsInvalid(coll.GetCol(colID), sctx, coll, colID) { - foundStats = true - count, err = GetRowCountByColumnRanges(sctx, coll, colID, tmpRan) - selectivity = count / float64(coll.RealtimeCount) - } - if idxIDs, ok := coll.ColUniqueID2IdxIDs[colID]; ok && !foundStats && len(indexRange.LowVal) > 1 { - // Note the `len(indexRange.LowVal) > 1` condition here, it means we only recursively call - // `GetRowCountByIndexRanges()` when the input `indexRange` is a multi-column range. This - // check avoids infinite recursion. - for _, idxID := range idxIDs { - if idxID == idx.Histogram.ID { - continue - } - idxStats := coll.GetIdx(idxID) - if idxStats == nil || statistics.IndexStatsIsInvalid(sctx, idxStats, coll, idxID) { - continue - } - foundStats = true - count, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan) - if err == nil { - break - } - realtimeCnt, _ := coll.GetScaledRealtimeAndModifyCnt(idxStats) - selectivity = count / float64(realtimeCnt) - } - } - if !foundStats { - continue - } - if err != nil { - return 0, false, err - } - singleColumnEstResults = append(singleColumnEstResults, selectivity) - } - // Sort them. - slices.Sort(singleColumnEstResults) - l := len(singleColumnEstResults) - failpoint.Inject("cleanEstResults", func() { - singleColumnEstResults = singleColumnEstResults[:0] - l = 0 - }) - if l == 1 { - return singleColumnEstResults[0], true, nil - } else if l == 0 { - return 0, false, nil - } - // Do not allow the exponential backoff to go below the available index bound. 
If the number of predicates - // is less than the number of index columns - use 90% of the bound to differentiate a subset from full index match. - // If there is an individual column selectivity that goes below this bound, use that selectivity only. - histNDV := coll.RealtimeCount - if idx.NDV > 0 { - histNDV = idx.NDV - } - idxLowBound := 1 / float64(min(histNDV, coll.RealtimeCount)) - if l < len(idx.Info.Columns) { - idxLowBound /= 0.9 - } - minTwoCol := min(singleColumnEstResults[0], singleColumnEstResults[1], idxLowBound) - multTwoCol := singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1]) - if l == 2 { - return max(minTwoCol, multTwoCol), true, nil - } - minThreeCol := min(minTwoCol, singleColumnEstResults[2]) - multThreeCol := multTwoCol * math.Sqrt(math.Sqrt(singleColumnEstResults[2])) - if l == 3 { - return max(minThreeCol, multThreeCol), true, nil - } - minFourCol := min(minThreeCol, singleColumnEstResults[3]) - multFourCol := multThreeCol * math.Sqrt(math.Sqrt(math.Sqrt(singleColumnEstResults[3]))) - return max(minFourCol, multFourCol), true, nil -} - -// outOfRangeOnIndex checks if the datum is out of the range. -func outOfRangeOnIndex(idx *statistics.Index, val types.Datum) bool { - if !idx.Histogram.OutOfRange(val) { - return false - } - if idx.Histogram.Len() > 0 && matchPrefix(idx.Histogram.Bounds.GetRow(0), 0, &val) { - return false - } - return true -} - -// matchPrefix checks whether ad is the prefix of value -func matchPrefix(row chunk.Row, colIdx int, ad *types.Datum) bool { - switch ad.Kind() { - case types.KindString, types.KindBytes, types.KindBinaryLiteral, types.KindMysqlBit: - return strings.HasPrefix(row.GetString(colIdx), ad.GetString()) - } - return false -} - -// betweenRowCountOnIndex estimates the row count for interval [l, r). -// The input sctx is just for debug trace, you can pass nil safely if that's not needed. 
-func betweenRowCountOnIndex(sctx context.PlanContext, idx *statistics.Index, l, r types.Datum) float64 { - histBetweenCnt := idx.Histogram.BetweenRowCount(sctx, l, r) - if idx.StatsVer == statistics.Version1 { - return histBetweenCnt - } - return float64(idx.TopN.BetweenCount(sctx, l.GetBytes(), r.GetBytes())) + histBetweenCnt -} - -// getOrdinalOfRangeCond gets the ordinal of the position range condition, -// if not exist, it returns the end position. -func getOrdinalOfRangeCond(sc *stmtctx.StatementContext, ran *ranger.Range) int { - for i := range ran.LowVal { - a, b := ran.LowVal[i], ran.HighVal[i] - cmp, err := a.Compare(sc.TypeCtx(), &b, ran.Collators[0]) - if err != nil { - return 0 - } - if cmp != 0 { - return i - } - } - return len(ran.LowVal) -} diff --git a/pkg/planner/core/casetest/index/index_test.go b/pkg/planner/core/casetest/index/index_test.go deleted file mode 100644 index ac4c77b31b930..0000000000000 --- a/pkg/planner/core/casetest/index/index_test.go +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright 2023 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package index - -import ( - "fmt" - "testing" - - "github.com/pingcap/tidb/pkg/testkit" - "github.com/pingcap/tidb/pkg/testkit/testdata" - "github.com/pingcap/tidb/pkg/util" -) - -func TestNullConditionForPrefixIndex(t *testing.T) { - store := testkit.CreateMockStore(t) - tk := testkit.NewTestKit(t, store) - tk.MustExec("use test") - tk.MustExec(`CREATE TABLE t1 ( - id char(1) DEFAULT NULL, - c1 varchar(255) DEFAULT NULL, - c2 text DEFAULT NULL, - KEY idx1 (c1), - KEY idx2 (c1,c2(5)) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin`) - tk.MustExec("set tidb_cost_model_version=2") - tk.MustExec("create table t2(a int, b varchar(10), index idx(b(5)))") - tk.MustExec("create table t3(a int, b varchar(10), c int, primary key (a, b(5)) clustered)") - tk.MustExec("set tidb_opt_prefix_index_single_scan = 1") - tk.MustExec("insert into t1 values ('a', '0xfff', '111111'), ('b', '0xfff', '22 '), ('c', '0xfff', ''), ('d', '0xfff', null)") - tk.MustExec("insert into t2 values (1, 'aaaaaa'), (2, 'bb '), (3, ''), (4, null)") - tk.MustExec("insert into t3 values (1, 'aaaaaa', 2), (1, 'bb ', 3), (1, '', 4)") - - var input []string - var output []struct { - SQL string - Plan []string - Result []string - } - integrationSuiteData := GetIntegrationSuiteData() - integrationSuiteData.LoadTestCases(t, &input, &output) - for i, tt := range input { - testdata.OnRecord(func() { - output[i].SQL = tt - output[i].Plan = testdata.ConvertRowsToStrings(tk.MustQuery("explain format='brief' " + tt).Rows()) - output[i].Result = testdata.ConvertRowsToStrings(tk.MustQuery(tt).Sort().Rows()) - }) - tk.MustQuery("explain format='brief' " + tt).Check(testkit.Rows(output[i].Plan...)) - tk.MustQuery(tt).Sort().Check(testkit.Rows(output[i].Result...)) - } - - // test plan cache - tk.MustExec(`set tidb_enable_prepared_plan_cache=1`) - tk.MustExec("set @@tidb_enable_collect_execution_info=0") - tk.MustExec("prepare stmt from 'select count(1) from t1 where c1 = ? 
and c2 is not null'") - tk.MustExec("set @a = '0xfff'") - tk.MustQuery("execute stmt using @a").Check(testkit.Rows("3")) - tk.MustQuery("execute stmt using @a").Check(testkit.Rows("3")) - tk.MustQuery(`select @@last_plan_from_cache`).Check(testkit.Rows("1")) - tk.MustQuery("execute stmt using @a").Check(testkit.Rows("3")) - tkProcess := tk.Session().ShowProcess() - ps := []*util.ProcessInfo{tkProcess} - tk.Session().SetSessionManager(&testkit.MockSessionManager{PS: ps}) - tk.MustQuery(fmt.Sprintf("explain for connection %d", tkProcess.ID)).Check(testkit.Rows( - "StreamAgg_17 1.00 root funcs:count(Column#7)->Column#5", - "└─IndexReader_18 1.00 root index:StreamAgg_9", - " └─StreamAgg_9 1.00 cop[tikv] funcs:count(1)->Column#7", - " └─IndexRangeScan_16 99.90 cop[tikv] table:t1, index:idx2(c1, c2) range:[\"0xfff\" -inf,\"0xfff\" +inf], keep order:false, stats:pseudo")) -} - -func TestInvisibleIndex(t *testing.T) { - store := testkit.CreateMockStore(t) - tk := testkit.NewTestKit(t, store) - tk.MustExec("use test") - tk.MustExec("CREATE TABLE t1 ( a INT, KEY( a ) INVISIBLE );") - tk.MustExec("INSERT INTO t1 VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9), (10);") - tk.MustQuery(`EXPLAIN SELECT a FROM t1;`).Check( - testkit.Rows( - `TableReader_5 10000.00 root data:TableFullScan_4`, - `└─TableFullScan_4 10000.00 cop[tikv] table:t1 keep order:false, stats:pseudo`)) - tk.MustExec("set session tidb_opt_use_invisible_indexes=on;") - tk.MustQuery(`EXPLAIN SELECT a FROM t1;`).Check( - testkit.Rows( - `IndexReader_7 10000.00 root index:IndexFullScan_6`, - `└─IndexFullScan_6 10000.00 cop[tikv] table:t1, index:a(a) keep order:false, stats:pseudo`)) -} - -func TestRangeDerivation(t *testing.T) { - store := testkit.CreateMockStore(t) - tk := testkit.NewTestKit(t, store) - tk.MustExec("use test") - tk.MustExec(`set @@tidb_opt_fix_control = "54337:ON"`) - tk.MustExec("create table t1 (a1 int, b1 int, c1 int, primary key pkx (a1,b1));") - tk.MustExec("create table t1char (a1 
char(5), b1 char(5), c1 int, primary key pkx (a1,b1));") - tk.MustExec("create table t(a int, b int, c int, primary key(a,b));") - tk.MustExec("create table tuk (a int, b int, c int, unique key (a, b, c));") - tk.MustExec("set @@session.tidb_regard_null_as_point=false;") - - var input []string - var output []struct { - SQL string - Plan []string - } - indexRangeSuiteData := GetIndexRangeSuiteData() - indexRangeSuiteData.LoadTestCases(t, &input, &output) - indexRangeSuiteData.LoadTestCases(t, &input, &output) - for i, sql := range input { - plan := tk.MustQuery("explain format = 'brief' " + sql) - testdata.OnRecord(func() { - output[i].SQL = sql - output[i].Plan = testdata.ConvertRowsToStrings(plan.Rows()) - }) - plan.Check(testkit.Rows(output[i].Plan...)) - } -} - -func TestRowFunctionMatchTheIndexRangeScan(t *testing.T) { - store := testkit.CreateMockStore(t) - tk := testkit.NewTestKit(t, store) - tk.MustExec("use test") - tk.MustExec(`set @@tidb_opt_fix_control = "54337:ON"`) - tk.MustExec(`CREATE TABLE t1 (k1 int , k2 int, k3 int, index pk1(k1, k2))`) - tk.MustExec(`create table t2 (k1 int, k2 int)`) - var input []string - var output []struct { - SQL string - Plan []string - Result []string - } - integrationSuiteData := GetIntegrationSuiteData() - integrationSuiteData.LoadTestCases(t, &input, &output) - for i, tt := range input { - testdata.OnRecord(func() { - output[i].SQL = tt - output[i].Plan = testdata.ConvertRowsToStrings(tk.MustQuery("explain format='brief' " + tt).Rows()) - output[i].Result = testdata.ConvertRowsToStrings(tk.MustQuery(tt).Sort().Rows()) - }) - tk.MustQuery("explain format='brief' " + tt).Check(testkit.Rows(output[i].Plan...)) - tk.MustQuery(tt).Sort().Check(testkit.Rows(output[i].Result...)) - } -} - -func TestRangeIntersection(t *testing.T) { - store := testkit.CreateMockStore(t) - tk := testkit.NewTestKit(t, store) - tk.MustExec("use test") - tk.MustExec(`set @@tidb_opt_fix_control = "54337:ON"`) - tk.MustExec("create table t1 (a1 int, 
b1 int, c1 int, key pkx (a1,b1));") - tk.MustExec("insert into t1 values (1,1,1);") - tk.MustExec("insert into t1 values (null,1,1);") - tk.MustExec("insert into t1 values (1,null,1);") - tk.MustExec("insert into t1 values (1,1,null);") - tk.MustExec("insert into t1 values (1,10,1);") - tk.MustExec("insert into t1 values (10,20,1);") - tk.MustExec("insert into t1 select a1+1,b1,c1+1 from t1;") - tk.MustExec("insert into t1 select a1,b1+1,c1+1 from t1;") - tk.MustExec("insert into t1 select a1-1,b1+1,c1+1 from t1;") - tk.MustExec("insert into t1 select a1+2,b1+2,c1+2 from t1;") - tk.MustExec("insert into t1 select a1+2,b1-2,c1+2 from t1;") - tk.MustExec("insert into t1 select a1+2,b1-1,c1+2 from t1;") - tk.MustExec("insert into t1 select null,b1,c1+1 from t1;") - tk.MustExec("insert into t1 select a1,null,c1+1 from t1;") - - tk.MustExec("create table t11 (a1 int, b1 int, c1 int);") - tk.MustExec("insert into t11 select * from t1;") - - tk.MustExec("CREATE TABLE `tablename` (`primary_key` varbinary(1024) NOT NULL,`secondary_key` varbinary(1024) NOT NULL,`timestamp` bigint(20) NOT NULL,`value` mediumblob DEFAULT NULL,PRIMARY KEY PKK (`primary_key`,`secondary_key`,`timestamp`));") - - tk.MustExec("create table t(a int, b int, c int, key PKK(a,b,c));") - tk.MustExec("create table tt(a int, b int, c int, primary key PKK(a,b,c));") - tk.MustExec("insert into t select * from t1;") - tk.MustExec("insert into tt select * from t1 where a1 is not null and b1 is not null and c1 is not null;") - tk.MustExec("CREATE TABLE tnull (a INT, KEY PK(a));") - tk.MustExec("create table tkey_string(id1 CHAR(16) not null, id2 VARCHAR(16) not null, id3 BINARY(16) not null, id4 VARBINARY(16) not null, id5 BLOB not null, id6 TEXT not null, id7 ENUM('x-small', 'small', 'medium', 'large', 'x-large') not null, id8 SET ('a', 'b', 'c', 'd') not null, name varchar(16), primary key(id1, id2, id3, id4, id7, id8)) PARTITION BY KEY(id7) partitions 4;") - tk.MustExec("INSERT INTO tkey_string 
VALUES('huaian','huaian','huaian','huaian','huaian','huaian','x-small','a','linpin');") - tk.MustExec("INSERT INTO tkey_string VALUES('nanjing','nanjing','nanjing','nanjing','nanjing','nanjing','small','b','linpin');") - tk.MustExec("INSERT INTO tkey_string VALUES('zhenjiang','zhenjiang','zhenjiang','zhenjiang','zhenjiang','zhenjiang','medium','c','linpin');") - tk.MustExec("INSERT INTO tkey_string VALUES('suzhou','suzhou','suzhou','suzhou','suzhou','suzhou','large','d','linpin');") - tk.MustExec("INSERT INTO tkey_string VALUES('wuxi','wuxi','wuxi','wuxi','wuxi','wuxi','x-large','a','linpin');") - - var input []string - var output []struct { - SQL string - Plan []string - Result []string - } - indexRangeSuiteData := GetIndexRangeSuiteData() - indexRangeSuiteData.LoadTestCases(t, &input, &output) - indexRangeSuiteData.LoadTestCases(t, &input, &output) - for i, sql := range input { - plan := tk.MustQuery("explain format = 'brief' " + sql) - testdata.OnRecord(func() { - output[i].SQL = sql - output[i].Plan = testdata.ConvertRowsToStrings(plan.Rows()) - output[i].Result = testdata.ConvertRowsToStrings(tk.MustQuery(sql).Sort().Rows()) - }) - plan.Check(testkit.Rows(output[i].Plan...)) - tk.MustQuery(sql).Sort().Check(testkit.Rows(output[i].Result...)) - } -} - -func TestOrderedIndexWithIsNull(t *testing.T) { - store := testkit.CreateMockStore(t) - tk := testkit.NewTestKit(t, store) - tk.MustExec("use test") - tk.MustExec("CREATE TABLE t1 (a int key, b int, c int, index (b, c));") - tk.MustQuery("explain select a from t1 where b is null order by c").Check(testkit.Rows( - "Projection_6 10.00 root test.t1.a", - "└─IndexReader_12 10.00 root index:IndexRangeScan_11", - " └─IndexRangeScan_11 10.00 cop[tikv] table:t1, index:b(b, c) range:[NULL,NULL], keep order:true, stats:pseudo", - )) - // https://github.com/pingcap/tidb/issues/56116 - tk.MustExec("create table t2(id bigint(20) DEFAULT NULL, UNIQUE KEY index_on_id (id))") - tk.MustExec("insert into t2 values (), (), ()") - 
tk.MustExec("analyze table t2") - tk.MustQuery("explain select count(*) from t2 where id is null;").Check(testkit.Rows( - "StreamAgg_17 1.00 root funcs:count(Column#5)->Column#3", - "└─IndexReader_18 1.00 root index:StreamAgg_9", - " └─StreamAgg_9 1.00 cop[tikv] funcs:count(1)->Column#5", - " └─IndexRangeScan_16 3.00 cop[tikv] table:t2, index:index_on_id(id) range:[NULL,NULL], keep order:false")) -} diff --git a/pkg/util/ranger/types.go b/pkg/util/ranger/types.go deleted file mode 100644 index 8d0920f0be5fc..0000000000000 --- a/pkg/util/ranger/types.go +++ /dev/null @@ -1,567 +0,0 @@ -// Copyright 2017 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ranger - -import ( - "fmt" - "math" - "strings" - "time" - "unsafe" - - "github.com/pingcap/errors" - "github.com/pingcap/tidb/pkg/errctx" - "github.com/pingcap/tidb/pkg/kv" - "github.com/pingcap/tidb/pkg/planner/context" - "github.com/pingcap/tidb/pkg/types" - "github.com/pingcap/tidb/pkg/util/codec" - "github.com/pingcap/tidb/pkg/util/collate" - rangerctx "github.com/pingcap/tidb/pkg/util/ranger/context" -) - -// MutableRanges represents a range may change after it is created. -// It's mainly designed for plan-cache, since some ranges in a cached plan have to be rebuild when reusing. -type MutableRanges interface { - // Range returns the underlying range values. - Range() Ranges - // Rebuild rebuilds the underlying ranges again. 
- Rebuild(sctx context.PlanContext) error - // CloneForPlanCache clones the MutableRanges for plan cache. - CloneForPlanCache() MutableRanges -} - -// Ranges implements the MutableRanges interface for range array. -type Ranges []*Range - -// Range returns the range array. -func (rs Ranges) Range() Ranges { - return rs -} - -// Rebuild rebuilds this range. -func (Ranges) Rebuild(context.PlanContext) error { - return nil -} - -// CloneForPlanCache clones the MutableRanges for plan cache. -func (rs Ranges) CloneForPlanCache() MutableRanges { - if rs == nil { - return nil - } - cloned := make([]*Range, 0, len(rs)) - for _, r := range rs { - cloned = append(cloned, r.Clone()) - } - return Ranges(cloned) -} - -// MemUsage gets the memory usage of ranges. -func (rs Ranges) MemUsage() (sum int64) { - for _, ran := range rs { - sum += ran.MemUsage() - } - return -} - -// Range represents a range generated in physical plan building phase. -type Range struct { - LowVal []types.Datum // Low value is exclusive. - HighVal []types.Datum // High value is exclusive. - Collators []collate.Collator - LowExclude bool - HighExclude bool -} - -// Width returns the width of this range. -func (ran *Range) Width() int { - return len(ran.LowVal) -} - -// Clone clones a Range. -func (ran *Range) Clone() *Range { - if ran == nil { - return nil - } - newRange := &Range{ - LowVal: make([]types.Datum, 0, len(ran.LowVal)), - HighVal: make([]types.Datum, 0, len(ran.HighVal)), - LowExclude: ran.LowExclude, - HighExclude: ran.HighExclude, - } - for i, length := 0, len(ran.LowVal); i < length; i++ { - newRange.LowVal = append(newRange.LowVal, ran.LowVal[i]) - } - for i, length := 0, len(ran.HighVal); i < length; i++ { - newRange.HighVal = append(newRange.HighVal, ran.HighVal[i]) - } - newRange.Collators = append(newRange.Collators, ran.Collators...) - return newRange -} - -// IsPoint returns if the range is a point. 
-func (ran *Range) IsPoint(sctx *rangerctx.RangerContext) bool { - return ran.isPoint(sctx.TypeCtx, sctx.RegardNULLAsPoint) -} - -func (ran *Range) isPoint(tc types.Context, regardNullAsPoint bool) bool { - if len(ran.LowVal) != len(ran.HighVal) { - return false - } - for i := range ran.LowVal { - a := ran.LowVal[i] - b := ran.HighVal[i] - if a.Kind() == types.KindMinNotNull || b.Kind() == types.KindMaxValue { - return false - } - cmp, err := a.Compare(tc, &b, ran.Collators[i]) - if err != nil { - return false - } - if cmp != 0 { - return false - } - - if a.IsNull() && b.IsNull() { // [NULL, NULL] - if !regardNullAsPoint { - return false - } - } - } - return !ran.LowExclude && !ran.HighExclude -} - -// IsOnlyNull checks if the range has [NULL, NULL] or [NULL NULL, NULL NULL] range. -func (ran *Range) IsOnlyNull() bool { - for i := range ran.LowVal { - a := ran.LowVal[i] - b := ran.HighVal[i] - if !(a.IsNull() && b.IsNull()) { - return false - } - } - return true -} - -// IsPointNonNullable returns if the range is a point without NULL. -func (ran *Range) IsPointNonNullable(tc types.Context) bool { - return ran.isPoint(tc, false) -} - -// IsPointNullable returns if the range is a point. 
-// TODO: unify the parameter type with IsPointNullable and IsPoint -func (ran *Range) IsPointNullable(tc types.Context) bool { - return ran.isPoint(tc, true) -} - -// IsFullRange check if the range is full scan range -func (ran *Range) IsFullRange(unsignedIntHandle bool) bool { - if unsignedIntHandle { - if len(ran.LowVal) != 1 || len(ran.HighVal) != 1 { - return false - } - lowValRawString := formatDatum(ran.LowVal[0], true) - highValRawString := formatDatum(ran.HighVal[0], false) - return lowValRawString == "0" && highValRawString == "+inf" - } - if len(ran.LowVal) != len(ran.HighVal) { - return false - } - for i := range ran.LowVal { - lowValRawString := formatDatum(ran.LowVal[i], true) - highValRawString := formatDatum(ran.HighVal[i], false) - if ("-inf" != lowValRawString && "NULL" != lowValRawString) || - ("+inf" != highValRawString && "NULL" != highValRawString) || - ("NULL" == lowValRawString && "NULL" == highValRawString) { - return false - } - } - return true -} - -// HasFullRange checks if any range in the slice is a full range. -func HasFullRange(ranges []*Range, unsignedIntHandle bool) bool { - for _, ran := range ranges { - if ran.IsFullRange(unsignedIntHandle) { - return true - } - } - return false -} - -func dealWithRedact(input string, redact string) string { - if input == "-inf" || input == "+inf" { - return input - } - if redact == errors.RedactLogDisable { - return input - } else if redact == errors.RedactLogEnable { - return "?" - } - return fmt.Sprintf("‹%s›", input) -} - -// String implements the Stringer interface. -// don't use it in the product. -func (ran *Range) String() string { - return ran.string(errors.RedactLogDisable) -} - -// Redact is to print the range with redacting sensitive data. -func (ran *Range) Redact(redact string) string { - return ran.string(redact) -} - -// String implements the Stringer interface. 
-func (ran *Range) string(redact string) string { - lowStrs := make([]string, 0, len(ran.LowVal)) - for _, d := range ran.LowVal { - lowStrs = append(lowStrs, dealWithRedact(formatDatum(d, true), redact)) - } - highStrs := make([]string, 0, len(ran.LowVal)) - for _, d := range ran.HighVal { - highStrs = append(highStrs, dealWithRedact(formatDatum(d, false), redact)) - } - l, r := "[", "]" - if ran.LowExclude { - l = "(" - } - if ran.HighExclude { - r = ")" - } - return l + strings.Join(lowStrs, " ") + "," + strings.Join(highStrs, " ") + r -} - -// Encode encodes the range to its encoded value. -func (ran *Range) Encode(ec errctx.Context, loc *time.Location, lowBuffer, highBuffer []byte) ([]byte, []byte, error) { - var err error - lowBuffer, err = codec.EncodeKey(loc, lowBuffer[:0], ran.LowVal...) - err = ec.HandleError(err) - if err != nil { - return nil, nil, err - } - if ran.LowExclude { - lowBuffer = kv.Key(lowBuffer).PrefixNext() - } - highBuffer, err = codec.EncodeKey(loc, highBuffer[:0], ran.HighVal...) - err = ec.HandleError(err) - if err != nil { - return nil, nil, err - } - if !ran.HighExclude { - highBuffer = kv.Key(highBuffer).PrefixNext() - } - return lowBuffer, highBuffer, nil -} - -// PrefixEqualLen tells you how long the prefix of the range is a point. -// e.g. If this range is (1 2 3, 1 2 +inf), then the return value is 2. -func (ran *Range) PrefixEqualLen(tc types.Context) (int, error) { - // Here, len(ran.LowVal) always equal to len(ran.HighVal) - for i := 0; i < len(ran.LowVal); i++ { - cmp, err := ran.LowVal[i].Compare(tc, &ran.HighVal[i], ran.Collators[i]) - if err != nil { - return 0, errors.Trace(err) - } - if cmp != 0 { - return i, nil - } - } - return len(ran.LowVal), nil -} - -// EmptyRangeSize is the size of empty range. -const EmptyRangeSize = int64(unsafe.Sizeof(Range{})) - -// MemUsage gets the memory usage of range. -func (ran *Range) MemUsage() (sum int64) { - // 16 is the size of Collator interface. 
- sum = EmptyRangeSize + int64(len(ran.Collators))*16 - for _, val := range ran.LowVal { - sum += val.MemUsage() - } - for _, val := range ran.HighVal { - sum += val.MemUsage() - } - // We ignore size of collator currently. - return sum -} - -func formatDatum(d types.Datum, isLeftSide bool) string { - switch d.Kind() { - case types.KindNull: - return "NULL" - case types.KindMinNotNull: - return "-inf" - case types.KindMaxValue: - return "+inf" - case types.KindInt64: - switch d.GetInt64() { - case math.MinInt64: - if isLeftSide { - return "-inf" - } - case math.MaxInt64: - if !isLeftSide { - return "+inf" - } - } - case types.KindUint64: - if d.GetUint64() == math.MaxUint64 && !isLeftSide { - return "+inf" - } - case types.KindBytes: - return fmt.Sprintf("%q", d.GetValue()) - case types.KindString: - return fmt.Sprintf("%q", d.GetValue()) - case types.KindMysqlEnum, types.KindMysqlSet, - types.KindMysqlJSON, types.KindBinaryLiteral, types.KindMysqlBit: - return fmt.Sprintf("\"%v\"", d.GetValue()) - } - return fmt.Sprintf("%v", d.GetValue()) -} - -// compareLexicographically compares two bounds from two ranges and returns 0, 1, -1 -// for equal, greater than or less than respectively. It gets the two bounds, -// collations and if each bound is open (open1, open2) or closed. In addition, -// it also gets if each bound is lower or upper (low1, low2). -// Lower bounds logically can be extended with -infinity and upper bounds can be extended with +infinity. 
-func compareLexicographically(tc types.Context, bound1, bound2 []types.Datum, collators []collate.Collator, - open1, open2, low1, low2 bool) (int, error) { - n1 := len(bound1) - n2 := len(bound2) - n := min(n1, n2) - - for i := 0; i < n; i++ { - cmp, err := bound1[i].Compare(tc, &bound2[i], collators[i]) - if err != nil { - return 0, err - } - if cmp != 0 { - return cmp, nil - } - } - - // Handle interval types - if n1 == n2 { - switch { - case !open1 && !open2: - return 0, nil - case open1 == open2: - if low1 == low2 { - return 0, nil - } else if low1 { - return 1, nil - } else { - return -1, nil - } - case open1: - if low1 { - return 1, nil - } - return -1, nil - case open2: - if low2 { - return -1, nil - } - return 1, nil - } - } - - // Unequal length ranges. We use -infinity for lower bounds and +infinity for upper bounds. - if n1 < n2 { - if low1 { - // -infinity is less than anything - return -1, nil - } - // +infinity is higher than anything - return 1, nil - } - // n1 > n2 - if low2 { - // anything is larger than -infinity. - return 1, nil - } - // anything is less than +infinity - return -1, nil -} - -// Check if a list of Datum is a prefix of another list of Datum. This is useful for checking if -// lower/upper bound of a range is a subset of another. -func prefix(tc types.Context, superValue []types.Datum, supValue []types.Datum, length int, collators []collate.Collator) bool { - for i := 0; i < length; i++ { - cmp, err := superValue[i].Compare(tc, &supValue[i], collators[i]) - if (err != nil) || (cmp != 0) { - return false - } - } - return true -} - -// Subset checks if a list of ranges(rs) is a subset of another list of ranges(superRanges). -// This is true if every range in the first list is a subset of any -// range in the second list. Also, we check if all elements of superRanges are covered. 
-func (rs Ranges) Subset(tc types.Context, superRanges Ranges) bool { - var subset bool - superRangesCovered := make([]bool, len(superRanges)) - if len(rs) == 0 { - return len(superRanges) == 0 - } else if len(superRanges) == 0 { - // unrestricted superRanges and restricted rs - return true - } - - for _, subRange := range rs { - subset = false - for i, superRange := range superRanges { - if subRange.Subset(tc, superRange) { - subset = true - superRangesCovered[i] = true - break - } - } - if !subset { - return false - } - } - for i := 0; i < len(superRangesCovered); i++ { - if !superRangesCovered[i] { - return false - } - } - - return true -} - -func checkCollators(ran1 *Range, ran2 *Range, length int) bool { - // Make sure both ran and superRange have the same collations. - // The current code path for this function always will have same collation - // for ran and superRange. It is added here for future - // use of the function. - for i := 0; i < length; i++ { - if ran1.Collators[i] != ran2.Collators[i] { - return false - } - } - return true -} - -// Subset for Range type, check if range(ran) is a subset of another range(otherRange). -// This is done by: -// - Both ran and otherRange have the same collators. This is not needed for the current code path. -// But, it is used here for future use of the function. -// - Checking if the lower/upper bound of otherRange covers the corresponding lower/upper bound of ran. -// Thus include checking open/closed inetrvals. -func (ran *Range) Subset(tc types.Context, otherRange *Range) bool { - if len(ran.LowVal) < len(otherRange.LowVal) { - return false - } - - if !checkCollators(ran, otherRange, len(otherRange.LowVal)) { - return false - } - - // Either otherRange is closed or both ranges have the same open/close setting. 
- lowExcludeOK := !otherRange.LowExclude || ran.LowExclude == otherRange.LowExclude - highExcludeOK := !otherRange.HighExclude || ran.HighExclude == otherRange.HighExclude - if !lowExcludeOK || !highExcludeOK { - return false - } - - return prefix(tc, otherRange.LowVal, ran.LowVal, len(otherRange.LowVal), ran.Collators) && - prefix(tc, otherRange.HighVal, ran.HighVal, len(otherRange.LowVal), ran.Collators) -} - -// IntersectRange computes intersection between two ranges. err is set of something went wrong -// during comparison. -func (ran *Range) IntersectRange(tc types.Context, otherRange *Range) (*Range, error) { - intersectLength := max(len(ran.LowVal), len(otherRange.LowVal)) - result := &Range{ - LowVal: make([]types.Datum, 0, intersectLength), - HighVal: make([]types.Datum, 0, intersectLength), - Collators: make([]collate.Collator, 0, intersectLength), - } - - if len(ran.LowVal) > len(otherRange.LowVal) { - result.Collators = ran.Collators - } else { - result.Collators = otherRange.Collators - } - - lowVsHigh, err := compareLexicographically(tc, ran.LowVal, otherRange.HighVal, ran.Collators, - ran.LowExclude, otherRange.HighExclude, true, false) - if err != nil { - return &Range{}, err - } - if lowVsHigh == 1 { - return nil, nil - } - - lowVsHigh, err = compareLexicographically(tc, otherRange.LowVal, ran.HighVal, ran.Collators, - otherRange.LowExclude, ran.HighExclude, true, false) - if err != nil { - return &Range{}, err - } - if lowVsHigh == 1 { - return nil, nil - } - - lowVsLow, err := compareLexicographically(tc, ran.LowVal, otherRange.LowVal, - ran.Collators, ran.LowExclude, otherRange.LowExclude, true, true) - if err != nil { - return &Range{}, err - } - if lowVsLow == -1 { - result.LowVal = otherRange.LowVal - result.LowExclude = otherRange.LowExclude - } else { - result.LowVal = ran.LowVal - result.LowExclude = ran.LowExclude - } - - highVsHigh, err := compareLexicographically(tc, ran.HighVal, otherRange.HighVal, - ran.Collators, ran.HighExclude, 
otherRange.HighExclude, false, false) - if err != nil { - return &Range{}, err - } - if highVsHigh == 1 { - result.HighVal = otherRange.HighVal - result.HighExclude = otherRange.HighExclude - } else { - result.HighVal = ran.HighVal - result.HighExclude = ran.HighExclude - } - return result, nil -} - -// IntersectRanges computes pairwise intersection between each element in rs and otherRangeList. -func (rs Ranges) IntersectRanges(tc types.Context, otherRanges Ranges) Ranges { - result := Ranges{} - for _, rsRange := range rs { - for _, otherRange := range otherRanges { - subsetLength := min(len(rsRange.LowVal), len(otherRange.LowVal)) - if !checkCollators(rsRange, otherRange, subsetLength) { - return nil - } - oneIntersection, err := rsRange.IntersectRange(tc, otherRange) - if err != nil { - return nil - } - if oneIntersection != nil { - result = append(result, oneIntersection) - } - } - } - return result -} diff --git a/statistics/index.go b/statistics/index.go index a8c39da22b637..257e6ff488963 100644 --- a/statistics/index.go +++ b/statistics/index.go @@ -240,8 +240,10 @@ func (idx *Index) GetRowCount(sctx sessionctx.Context, coll *HistColl, indexRang if fullLen { // At most 1 in this case. if idx.Info.Unique { - totalCount++ - continue + if !indexRange.IsOnlyNull() { + totalCount++ + continue + } } count = idx.equalRowCount(lb, realtimeRowCount) // If the current table row count has changed, we should scale the row count accordingly. 
diff --git a/statistics/integration_test.go b/statistics/integration_test.go
index 0493d870aec89..6adb1008b490d 100644
--- a/statistics/integration_test.go
+++ b/statistics/integration_test.go
@@ -808,3 +808,18 @@ func TestIssue49986(t *testing.T) {
 		" └─Selection 10.00 cop[tikv] eq(\"astp2019121731703151\", test.acc.m)",
 		" └─TableFullScan 10000.00 cop[tikv] table:b keep order:false, stats:pseudo"))
 }
+
+func TestIssue56116(t *testing.T) {
+	store := testkit.CreateMockStore(t)
+	tk := testkit.NewTestKit(t, store)
+	tk.MustExec("use test")
+
+	tk.MustExec("create table t2(id bigint(20) DEFAULT NULL, UNIQUE KEY index_on_id (id))")
+	tk.MustExec("insert into t2 values (), (), ()")
+	tk.MustExec("analyze table t2")
+	tk.MustQuery("explain select count(*) from t2 where id is null;").Check(testkit.Rows(
+		"StreamAgg_17 1.00 root funcs:count(Column#5)->Column#3",
+		"└─IndexReader_18 1.00 root index:StreamAgg_9",
+		" └─StreamAgg_9 1.00 cop[tikv] funcs:count(1)->Column#5",
+		" └─IndexRangeScan_16 3.00 cop[tikv] table:t2, index:index_on_id(id) range:[NULL,NULL], keep order:false"))
+}
diff --git a/util/ranger/types.go b/util/ranger/types.go
index 6c6ab98c52157..6589d42e7e884 100644
--- a/util/ranger/types.go
+++ b/util/ranger/types.go
@@ -126,6 +126,23 @@ func (ran *Range) isPoint(stmtCtx *stmtctx.StatementContext, regardNullAsPoint b
 	return !ran.LowExclude && !ran.HighExclude
 }
 
+// IsOnlyNull reports whether every column of the range has NULL as both its
+// low and its high value, i.e. the range is [NULL, NULL] or, for a
+// multi-column range, [NULL NULL, NULL NULL]. A range with no columns is
+// reported as only-NULL. If HighVal is shorter than LowVal the range is
+// reported as not only-NULL instead of panicking on an out-of-range index.
+func (ran *Range) IsOnlyNull() bool {
+	if len(ran.HighVal) < len(ran.LowVal) {
+		return false
+	}
+	for i := range ran.LowVal {
+		if !ran.LowVal[i].IsNull() || !ran.HighVal[i].IsNull() {
+			return false
+		}
+	}
+	return true
+}
+
 // IsPointNonNullable returns if the range is a point without NULL.
 func (ran *Range) IsPointNonNullable(sctx sessionctx.Context) bool {
 	return ran.isPoint(sctx.GetSessionVars().StmtCtx, false)