You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Below shows the reason why we should do this. The main person page load would go from ~15 seconds to < 1 second. Because we're adding it to the end, it doesn't impact joins (see last query). We still do full table scans, but because we don't need to reorder in memory it's much faster.
There seems to be no difference in table size between (team_id, id, toDate(created_at)) and (team_id, id, created_at) and for more speed improvements I'd suggest the latter, but I don't understand clickhouse well enough to know.
-- original/* user_id:1 celery:posthog.tasks.tasks.process_query_task */SELECTpersons.id,
persons.idAS id,
persons.created_atAS created_at,
1FROM
(SELECT
argMax(toTimeZone(person.created_at, 'US/Pacific'), person.version) AS created_at,
person.idAS id
FROM
person
WHERE
equals(person.team_id, 1589)
GROUP BYperson.idHAVINGand(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'US/Pacific'), person.version), plus(now64(6, 'US/Pacific'), toIntervalDay(1))), 0))
SETTINGS optimize_aggregation_in_order=1) AS persons
ORDER BYpersons.created_atDESCLIMIT101
OFFSET 0 SETTINGS optimize_aggregation_in_order=1, join_algorithm='auto', readonly=2, max_execution_time=600, allow_experimental_object_type=1, format_csv_allow_double_quotes=0, max_ast_elements=4000000, max_expanded_ast_elements=4000000, max_bytes_before_external_group_by=0--101 rows in set. Elapsed: 15.563 sec. Processed 1.18 billion rows, 48.45 GB (75.86 million rows/s., 3.11 GB/s.)-- Peak memory usage: 2.06 GiB.-- Optimized query/* user_id:1 celery:posthog.tasks.tasks.process_query_task */SELECTpersons.id,
persons.idAS id,
persons.created_atAS created_at,
properties
FROM
(SELECT
argMax(toTimeZone(person.created_at, 'UTC'), person.version) AS created_at,
person.idAS id,
argMax(person.properties, person.version) AS properties
FROM
person
WHEREand(equals(person.team_id, 1589), in(id, (SELECTwhere_optimization.idAS id
FROM
person AS where_optimization
WHEREand(equals(where_optimization.team_id, 1589), notIn(where_optimization.id, (SELECTlimit_delete_optimization.idAS id
FROM
person AS limit_delete_optimization
WHEREand(equals(limit_delete_optimization.team_id, 1589), equals(limit_delete_optimization.is_deleted, 1)))))
ORDER BY
toTimeZone(where_optimization.created_at, 'UTC') DESCLIMIT1 BY where_optimization.idLIMIT101
OFFSET 0)))
GROUP BYperson.idHAVINGand(ifNull(equals(argMax(person.is_deleted, person.version), 0), 0), ifNull(less(argMax(toTimeZone(person.created_at, 'UTC'), person.version), plus(now64(6, 'UTC'), toIntervalDay(1))), 0))) AS persons
ORDER BYpersons.created_atDESCLIMIT101 SETTINGS optimize_aggregation_in_order=1, join_algorithm='auto', readonly=2, max_execution_time=600, allow_experimental_object_type=1, format_csv_allow_double_quotes=0, max_ast_elements=4000000, max_expanded_ast_elements=4000000, max_bytes_before_external_group_by=0-- 101 rows in set. Elapsed: 7.587 sec. Processed 2.37 billion rows, 50.30 GB (312.29 million rows/s., 6.63 GB/s.)-- Peak memory usage: 45.15 GiB.-- create new table with new order by keyCREATETABLEposthog.tim_test_created_at_person
(
`id` UUID,
`created_at` DateTime64(3),
`team_id` Int64,
`properties` String,
`is_identified` Int8,
`is_deleted` Int8 DEFAULT 0,
`version` UInt64,
`_timestamp` DateTime,
`_offset` UInt64,
)
ENGINE = ReplacingMergeTree(version)
-- ORDER BY (team_id, id, toDate(created_at)) -- 29.66GBORDER BY (team_id, id, created_at) -- 29.94GB
SETTINGS index_granularity =8192insert intoposthog.tim_test_created_at_personselect id, created_at, team_id, '', is_identified, is_deleted, version, _timestamp, _offset from person where team_id =1589 SETTINGS max_execution_time =600-- Optimized query with new order by key (note much less mmeory)SELECTwhere_optimization.idAS id
FROM
tim_test_created_at_person AS where_optimization
--person AS where_optimizationWHEREand(
equals(where_optimization.team_id, 1589),
notIn(
where_optimization.id,
(
SELECTlimit_delete_optimization.idAS id
FROM
person AS limit_delete_optimization
WHEREand(
equals(limit_delete_optimization.team_id, 1589),
equals(limit_delete_optimization.is_deleted, 1)
)
)
),
equals(toStartOfMonth(created_at), toStartOfMonth(now64(6, 'UTC')))
)
ORDER BY
toDate(where_optimization.created_at) DESC, where_optimization.created_atDESCLIMIT1 BY where_optimization.idLIMIT101 OFFSET 0-- with toDate(created_at)-- 101 rows in set. Elapsed: 0.901 sec. Processed 2.17 billion rows, 42.63 GB (2.40 billion rows/s., 47.33 GB/s.)-- Peak memory usage: 3.29 GiB.-- with created_at-- 101 rows in set. Elapsed: 0.852 sec. Processed 2.16 billion rows, 42.59 GB (2.54 billion rows/s., 50.00 GB/s.)-- Peak memory usage: 3.32 GiB.-- join comparisonselectcount(distinct e__person.id)
from
(select person_id from events where event ='$pageview'andtimestamp>'2025-02-27 00:00:00'andtimestamp<'2025-02-27 00:00:10'and team_id =1589) as e
LEFT JOIN (
SELECTperson.idAS id
FROM-- person
tim_test_created_at_person as person
WHEREand(
equals(person.team_id, 21661),
ifNull(
in(
tuple(person.id, person.version),
(
SELECTperson.idAS id,
max(person.version) AS version
FROM-- person
tim_test_created_at_person as person
WHERE
equals(person.team_id, 1589)
GROUP BYperson.idHAVINGand(
ifNull(
equals(argMax(person.is_deleted, person.version), 0),
0
),
ifNull(
less(
argMax(
toTimeZone(person.created_at, 'Asia/Dubai'),
person.version
),
plus(now64(6, 'Asia/Dubai'), toIntervalDay(1))
),
0
)
)
limit10000000
)
),
0
)
) SETTINGS optimize_aggregation_in_order =1
) AS e__person ON equals(
e.person_id,
e__person.id
)
-- 1 row in set. Elapsed: 1.662 sec. Processed 341.93 million rows, 13.65 GB (205.79 million rows/s., 8.21 GB/s.)-- Peak memory usage: 2.56 GiB.
The text was updated successfully, but these errors were encountered:
timgl
added
the
performance
Has to do with performance. For PRs, runs the clickhouse query performance suite
label
Feb 27, 2025
We should add created_at to the person table ORDER BY, so it becomes
Below shows the reason why we should do this. The main person page load would go from ~15 seconds to < 1 second. Because we're adding it to the end, it doesn't impact joins (see last query). We still do full table scans, but because we don't need to reorder in memory it's much faster.
There seems to be no difference in table size between
(team_id, id, toDate(created_at))
and(team_id, id, created_at)
and for more speed improvements I'd suggest the latter, but I don't understand clickhouse well enough to know.The text was updated successfully, but these errors were encountered: