From e58463253540bcbe2342898373a8173e9db7962a Mon Sep 17 00:00:00 2001 From: Mats Kindahl Date: Fri, 17 Jan 2025 10:58:37 +0100 Subject: [PATCH] Fix lock order when dropping index If an index is dropped, it is necessary to lock the heap table (of the index) before the index since all normal operations do it in this order. When dropping an index, we did not take all the necessary locks in the right order before calling `performMultipleDeletions`, which can cause deadlocks when dropping an index on a hypertable at the same time as running a utility statement that takes heavy locks, e.g., VACUUM or ANALYZE. Adding an isolation test as well that will generate a deadlock if the index and table locks are not taken in the correct order. --- .unreleased/pr_7600 | 1 + src/chunk_index.c | 106 +++++++++++--- .../expected/deadlock_drop_index_vacuum.out | 24 ++++ tsl/test/isolation/specs/CMakeLists.txt | 1 + .../specs/deadlock_drop_index_vacuum.spec | 130 ++++++++++++++++++ 5 files changed, 240 insertions(+), 22 deletions(-) create mode 100644 .unreleased/pr_7600 create mode 100644 tsl/test/isolation/expected/deadlock_drop_index_vacuum.out create mode 100644 tsl/test/isolation/specs/deadlock_drop_index_vacuum.spec diff --git a/.unreleased/pr_7600 b/.unreleased/pr_7600 new file mode 100644 index 00000000000..bb3a4008f81 --- /dev/null +++ b/.unreleased/pr_7600 @@ -0,0 +1 @@ +Fixes: #7600 Fix lock order when dropping index diff --git a/src/chunk_index.c b/src/chunk_index.c index da11d4a5658..5e85282f2fc 100644 --- a/src/chunk_index.c +++ b/src/chunk_index.c @@ -573,17 +573,70 @@ typedef struct ChunkIndexDeleteData bool drop_index; } ChunkIndexDeleteData; -/* Find all internal dependencies to be able to delete all the objects in one +/* + * Lock object. + * + * In particular, we need to ensure that we lock the table of an index before + * locking the index, or run the risk of ending up in a deadlock since the + * normal locking order is table first, index second. 
Since we're not a + * concurrent delete, we take a strong lock for this. + * + * It is also necessary that the parent table is locked first, but we have + * already done that at this stage, so it does not need to be done explicitly. + */ +static bool +chunk_lock_object_for_deletion(const ObjectAddress *obj) +{ + /* + * If we're locking an index, we need to lock the table first. See + * RangeVarCallbackForDropRelation() in tablecmds.c. We can ignore + * partition indexes since we're not using that. + */ + char relkind = get_rel_relkind(obj->objectId); + + /* + * If we cannot find the object, it might have been concurrently deleted + * (we do not have locks on objects yet). + */ + if (relkind == '\0') + return false; + if (relkind == RELKIND_INDEX) + { + Oid heapOid = IndexGetRelation(obj->objectId, true); + if (OidIsValid(heapOid)) + LockRelationOid(heapOid, AccessExclusiveLock); + } + + LockRelationOid(obj->objectId, AccessExclusiveLock); + return true; +} + +/* + * Find all internal dependencies to be able to delete all the objects in one * go. We do this by scanning the dependency table and keeping all the tables - * in our internal schema. */ -static void -chunk_collect_objects_for_deletion(const ObjectAddress *relobj, ObjectAddresses *objects) + * in our internal schema. + * + * We also lock the objects in the correct order (meaning table first, index + * second) here to make sure that we do not end up with deadlocks. + * + * We return 'true' if we added any objects, and 'false' otherwise. + */ +static bool +chunk_collect_and_lock_objects_for_deletion(const ObjectAddress *relobj, ObjectAddresses *objects) { Relation deprel = table_open(DependRelationId, RowExclusiveLock); ScanKeyData scankey[2]; SysScanDesc scan; HeapTuple tup; + /* + * If the object disappeared before we managed to get a lock on it, there + * is nothing more to do so just return early and indicate that there are + * no objects to delete. 
+ */ + if (!chunk_lock_object_for_deletion(relobj)) + return false; + add_exact_object_address(relobj, objects); ScanKeyInit(&scankey[0], @@ -608,18 +661,13 @@ chunk_collect_objects_for_deletion(const ObjectAddress *relobj, ObjectAddresses { Form_pg_depend record = (Form_pg_depend) GETSTRUCT(tup); ObjectAddress refobj = { .classId = record->refclassid, .objectId = record->refobjid }; - - switch (record->deptype) - { - case DEPENDENCY_INTERNAL: - add_exact_object_address(&refobj, objects); - break; - default: - continue; /* Do nothing */ - } + if (record->deptype == DEPENDENCY_INTERNAL && chunk_lock_object_for_deletion(&refobj)) + add_exact_object_address(&refobj, objects); } + systable_endscan(scan); table_close(deprel, RowExclusiveLock); + return true; } static ScanTupleResult @@ -642,7 +690,8 @@ chunk_index_tuple_delete(TupleInfo *ti, void *data) if (OidIsValid(idxobj.objectId)) { - /* If we use performDeletion here it will fail if there are + /* + * If we use performDeletion() here it will fail if there are * internal dependencies on the object since we are restricting * the cascade. * @@ -651,15 +700,28 @@ chunk_index_tuple_delete(TupleInfo *ti, void *data) * internal dependencies and use the function * performMultipleDeletions. * - * The function performMultipleDeletions accept a list of objects - * and if there are dependencies between any of the objects given - * to the function, it will not generate an error for that but - * rather proceed with the deletion. If there are any dependencies - * (internal or not) outside this set of objects, it will still - * abort the deletion and print an error. */ + * We lock the objects to delete first to make sure that the lock + * order is correct. This is done inside RemoveRelations and + * performMultipleDeletions() expect these locks to be taken + * first. If not, it will take very rudimentary locks, which will + * cause deadlocks in some cases because the lock order is not + * correct. 
+ * + * Since we do not have any locks on any objects at this point, + * the relations might have disappeared before we had a chance to + * lock them. In this case it is not necessary to do an explicit + * call to performMultipleDeletions(). + * + * The function performMultipleDeletions() accept a list of + * objects and if there are dependencies between any of the + * objects given to the function, it will not generate an error + * for that but rather proceed with the deletion. If there are any + * dependencies (internal or not) outside this set of objects, it + * will still abort the deletion and print an error. + */ ObjectAddresses *objects = new_object_addresses(); - chunk_collect_objects_for_deletion(&idxobj, objects); - performMultipleDeletions(objects, DROP_RESTRICT, 0); + if (chunk_collect_and_lock_objects_for_deletion(&idxobj, objects)) + performMultipleDeletions(objects, DROP_RESTRICT, 0); free_object_addresses(objects); } } diff --git a/tsl/test/isolation/expected/deadlock_drop_index_vacuum.out b/tsl/test/isolation/expected/deadlock_drop_index_vacuum.out new file mode 100644 index 00000000000..03fa5f15125 --- /dev/null +++ b/tsl/test/isolation/expected/deadlock_drop_index_vacuum.out @@ -0,0 +1,24 @@ +Parsed test spec with 4 sessions + +starting permutation: S1_lock S3_vacuum S2_lock S1_commit S4_drop_index S2_commit +step S1_lock: + LOCK TABLE _timescaledb_internal.metrics_chunk_2 IN ACCESS EXCLUSIVE MODE; + +step S3_vacuum: + VACUUM ANALYZE _timescaledb_internal.metrics_chunk_2; + +step S2_lock: + LOCK TABLE _timescaledb_internal.metrics_chunk_2 IN ACCESS EXCLUSIVE MODE; + +step S1_commit: + COMMIT; + +step S2_lock: <... completed> +step S4_drop_index: + DROP INDEX metrics_device_time_idx; + +step S2_commit: + COMMIT; + +step S3_vacuum: <... completed> +step S4_drop_index: <... 
completed> diff --git a/tsl/test/isolation/specs/CMakeLists.txt b/tsl/test/isolation/specs/CMakeLists.txt index 629f53094fb..960abb4b0d4 100644 --- a/tsl/test/isolation/specs/CMakeLists.txt +++ b/tsl/test/isolation/specs/CMakeLists.txt @@ -20,6 +20,7 @@ list( cagg_multi_iso.spec cagg_concurrent_refresh.spec deadlock_drop_chunks_compress.spec + deadlock_drop_index_vacuum.spec parallel_compression.spec osm_range_updates_iso.spec) diff --git a/tsl/test/isolation/specs/deadlock_drop_index_vacuum.spec b/tsl/test/isolation/specs/deadlock_drop_index_vacuum.spec new file mode 100644 index 00000000000..be11e373984 --- /dev/null +++ b/tsl/test/isolation/specs/deadlock_drop_index_vacuum.spec @@ -0,0 +1,130 @@ +# This file and its contents are licensed under the Timescale License. +# Please see the included NOTICE for copyright information and +# LICENSE-TIMESCALE for a copy of the license. + +# Order of locks in drop index and vacuum analyze was wrong, and DROP +# INDEX took the locks in order index-table, while VACUUM ANALYZE took +# them in order table-index. +# +# Create deadlock if the locking order is wrong. Problem here is that +# vacuum takes two locks. First one to read a table and then one to do +# the actual vacuum. For this reason we need two processes that end up +# in the deadlock (VACUUM ANALYZE and DROP INDEX respectively) and +# then two extra sessions working in lock-step to create the deadlock. +# +# It can be illustrated with this sequence, where we have a chunk +# table and a chunk index. The sessions between the brackets ([]) is +# the lock queue and session holding the lock is in angle brackets +# (<>) is the session that holds the lock on the object in question: +# +# S1: Lock chunk from hypertable +# index: [] +# table: [] +# S3: Start VACUUM ANALYZE, will attempt to lock chunk table, but get queued. 
+# index: [] +# table: [S3] +# S2: Lock chunk table from hypertable, which will be queued +# index: [] +# table: [S3 S2] +# S1: Unlock chunk table +# index: [] +# table: [S3 S2] +# S3: VACUUM ANALYZE continues and takes the lock on the chunk table. +# index: [] +# table: [S2] +# S3: VACUUM ANALYZE will release the lock on the chunk table. +# index: [] +# table: [S2] +# S3: VACUUM ANALYZE will attempt to lock the chunk table again +# index: [] +# table: [S2 S3] +# S2: The LOCK statement gets the lock and VACUUM will wait +# index: [] +# table: [S3] +# S4: DROP INDEX starts and takes lock in index first and then is +# queued for the chunk table +# index: [] +# table: [S3 S4] +# S2: Release lock on chunk table +# index: [] +# table: [S3 S4] +# S3: VACUUM continues and takes table lock and then tries index lock +# index: [S3] +# table: [S4] +# Deadlock + +setup { + CREATE TABLE metrics (time timestamptz, device_id integer, temp float); + SELECT create_hypertable('metrics', 'time', chunk_time_interval => interval '1 day'); + INSERT INTO metrics + SELECT generate_series('2018-12-01 00:00'::timestamp, + '2018-12-03 00:00', + '1 hour'), + (random()*30)::int, + random()*80 - 40; + + CREATE INDEX metrics_device_time_idx ON metrics(device_id, time NULLS FIRST); + + -- Rename chunks so that we have known names. We cannot execute + -- VACUUM in a function block. 
+ DO $$ + DECLARE + chunk regclass; + count int = 1; + BEGIN + FOR chunk IN SELECT ch FROM show_chunks('metrics') ch + LOOP + EXECUTE format('ALTER TABLE %s RENAME TO metrics_chunk_%s', chunk, count); + count = count + 1; + END LOOP; + END + $$; +} + +teardown { + DROP TABLE metrics; +} + +session "S1" +setup { + START TRANSACTION; + SET TRANSACTION ISOLATION LEVEL READ COMMITTED; + SET LOCAL lock_timeout = '500ms'; + SET LOCAL deadlock_timeout = '300ms'; +} + +step "S1_lock" { + LOCK TABLE _timescaledb_internal.metrics_chunk_2 IN ACCESS EXCLUSIVE MODE; +} + +step "S1_commit" { + COMMIT; +} + +session "S2" +setup { + START TRANSACTION; + SET TRANSACTION ISOLATION LEVEL READ COMMITTED; + SET LOCAL lock_timeout = '500ms'; + SET LOCAL deadlock_timeout = '300ms'; +} + +step "S2_lock" { + LOCK TABLE _timescaledb_internal.metrics_chunk_2 IN ACCESS EXCLUSIVE MODE; +} + +step "S2_commit" { + COMMIT; +} + +session "S3" +step "S3_vacuum" { + VACUUM ANALYZE _timescaledb_internal.metrics_chunk_2; +} + +session "S4" +step "S4_drop_index" { + DROP INDEX metrics_device_time_idx; +} + +permutation S1_lock S3_vacuum S2_lock S1_commit S4_drop_index S2_commit