feat: changes made to evaluation runner of width-generic tactics (#1001)

We change our message printing to be more legible, and we also start adding scripting to analyze the DB that we generate from the evaluation.
opencompl · Feb 7, 2025 · d5d4f38 · d5d4f38
1 parent 4e76df5
commit d5d4f38
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 5 deletions.
diff --git a/SSA/Experimental/Bits/Fast/Reflect.lean b/SSA/Experimental/Bits/Fast/Reflect.lean
@@ -1098,13 +1098,13 @@ structure Config where
   This is useful to prevent the tactic from taking oodles of time cruncing on goals that
   build large state spaces, which can happen in the presence of tactics.
   -/
-  circuitSizeThreshold : Nat := 200
+  circuitSizeThreshold : Nat := 0
 
   /--
   The upper bound on the state space of the FSM, beyond which the tactic will bail out on an error.
   See also `Config.circuitSizeThreshold`.
   -/
-  stateSpaceSizeThreshold : Nat := 20
+  stateSpaceSizeThreshold : Nat := 0
   /--
   Whether the tactic should used a specialized solver for fixed-width constraints.
   -/

diff --git a/SSA/Projects/InstCombine/TacticAuto.lean b/SSA/Projects/InstCombine/TacticAuto.lean
@@ -308,6 +308,7 @@ macro "bv_bench": tactic =>
             "bv_ring" : (bv_ring; done),
             "bv_of_bool" : (bv_of_bool; done),
             "bv_omega" : (bv_omega; done),
+            -- Automata, Classic.
             "bv_automata_classic_prop" : (bool_to_prop; bv_automata_classic; done),
             "bv_automata_classic" : (bv_automata_classic; done),
             "bv_normalize_automata_classic" : ((try (solve | bv_normalize)); (try bv_automata_classic); done),
@@ -346,7 +347,7 @@ macro "bv_bench_automata": tactic =>
         all_goals (
           tac_bench (config := { outputType := .csv }) [
             "presburger" : (bv_automata_classic; done),
-            "circuit" : (bv_automata_circuit (config := { backend := .cadical /- maxIter -/ 4 }); done),
+            "circuit" : (bv_automata_circuit (config := { backend := .cadical 0 }); done),
             "no_uninterpreted" : (bv_automata_fragment_no_uninterpreted),
             "width_ok" : (bv_automata_fragment_width_legal),
             "reflect_ok" : (bv_automata_fragment_reflect),

diff --git a/bv-evaluation/.gitignore b/bv-evaluation/.gitignore
@@ -1,4 +1,5 @@
 *.sqlite3
+.venv/
 *.sqlite3.log
 results/
 
diff --git a/bv-evaluation/compare-automata-automata-circuit.py b/bv-evaluation/compare-automata-automata-circuit.py
@@ -11,6 +11,8 @@
 import re
 import sqlite3
 import logging
+import random
+from tabulate import tabulate
 
 ROOT_DIR = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode('utf-8').strip()
 BENCHMARK_DIR = ROOT_DIR + '/SSA/Projects/InstCombine/tests/proofs/'
@@ -138,7 +140,15 @@ def process(db : str, jobs: int, prod_run : bool):
     con.close()
 
     logging.info(f"Clearing git state of '{BENCHMARK_DIR}' with 'git checkout --'")
-    subprocess.Popen(f'git checkout -- {BENCHMARK_DIR}', cwd=ROOT_DIR, shell=True).wait()
+    gco = subprocess.Popen(f'git checkout -- {BENCHMARK_DIR}', cwd=ROOT_DIR, shell=True)
+    gco.wait()
+    assert gco.returncode == 0, f"git checkout -- {BENCHMARK_DIR} should succeed."
+
+
+    logging.info(f"Running a 'lake exe cache get && lake build'.")
+    lake = subprocess.Popen(f'lake exe cache get && lake build', cwd=ROOT_DIR, shell=True)
+    lake.wait()
+    assert lake.returncode == 0, f"lake build should succeed before running evaluation."
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as executor:
         futures = {}
@@ -167,21 +177,87 @@ def setup_logging(db_name : str):
         format='%(asctime)s - %(levelname)s - %(message)s',
         handlers=[logging.FileHandler(f'{db_name}.log', mode='a'), logging.StreamHandler()])
 
+def analyze_uninterpreted_functions(cur : sqlite3.Cursor):
+    """Tabulate all uninterpreted functions"""
+    logging.info("Analyzing uninterpreted functions")
+    rows = cur.execute("""
+        SELECT fileTitle, thmName, goalStr, errMsg FROM tests WHERE tactic = "circuit" AND status = "err"
+    """)
+
+    KEY_UNTRUE = "failed to prove goal  since decideIfZerosM established that theorem is not true"
+    KEY_MULTIPLE_WIDTHS = "found multiple bitvector widths in the target"
+    KEY_UNKNOWN_GOAL_STATE_FORMAT = "expected predicate over bitvectors (no quantification)  found"
+    KEY_SYMBOLIC_SHIFT_LEFT = "expected shiftLeft by natural number  found symbolic shift amount"
+    KEY_UNKNOWN_BOOLEAN_EQUALITY = "only boolean conditionals allowed are 'bv.slt bv = true'  'bv.sle bv = true'. Found"
+    KEY_UNKNOWN_BOOLEAN_DISEQUALITY = "Expected typeclass to be 'BitVec w' / 'Nat'  found '   Bool' in"
+
+    str2explanation = {
+      KEY_UNTRUE : "theorems that are established as untrue",
+      KEY_MULTIPLE_WIDTHS : "theorems that have multiple widths",
+      KEY_UNKNOWN_GOAL_STATE_FORMAT : "theorems that have unknown goal state format",
+      KEY_SYMBOLIC_SHIFT_LEFT : "theorems that have symbolic left shift amount",
+      KEY_UNKNOWN_BOOLEAN_EQUALITY : "unknown boolean equality",
+      KEY_UNKNOWN_BOOLEAN_DISEQUALITY : "unknown boolean disequality",
+    }
+    str2matches = {
+      KEY_UNTRUE : [],
+      KEY_MULTIPLE_WIDTHS : [],
+      KEY_UNKNOWN_GOAL_STATE_FORMAT : [],
+      KEY_SYMBOLIC_SHIFT_LEFT : [],
+      KEY_UNKNOWN_BOOLEAN_EQUALITY : [],
+      KEY_UNKNOWN_BOOLEAN_DISEQUALITY : [],
+    }
+
+
+    HEADERS = ["errMsg", "goalStr", "thmName", "fileTitle"]
+    HEADER_COL_WIDTHS = [40, 40, 50, 50]
+    for row in rows:
+        (fileTitle, thmName, goalStr, errMsg) = row
+        matched = False
+        for s in str2matches:
+            if s in errMsg:
+                str2matches[s].append((errMsg, goalStr, thmName, fileTitle))
+                matched = True
+                break
+        if not matched:
+            print(errMsg)
+
+    for s in str2matches:
+        NEXAMPLES = 5
+        print(f"{str2explanation[s]} (#{len(str2matches[KEY_UNTRUE])}):")
+        print(tabulate(str2matches[s][:NEXAMPLES], headers=HEADERS, tablefmt="grid", maxcolwidths=HEADER_COL_WIDTHS))
+
+
+# analyze the sqlite db.
+def analyze(db : str):
+    con = sqlite3.connect(db)
+    cur = con.cursor()
+    analyze_uninterpreted_functions(cur)
+    pass
+
+
 if __name__ == "__main__":
   current_time = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
   nproc = os.cpu_count()
   parser = argparse.ArgumentParser(prog='compare-automata-automata-circuit')
-  parser.add_argument('--db', default=f'automata-circuit-{current_time}.sqlite3', help='path to sqlite3 database')
+  default_db = f'automata-circuit-{current_time}.sqlite3'
+  parser.add_argument('--db', default=default_db, help='path to sqlite3 database')
   parser.add_argument('-j', '--jobs', type=int, default=nproc // 3)
   parser.add_argument('--run', action='store_true', help="run evaluation")
   parser.add_argument('--prodrun', action='store_true', help="run production run of evaluation")
+  parser.add_argument('--analyze', action='store_true', help="analyze the data of the db")
   args = parser.parse_args()
   setup_logging(args.db)
   logging.info(args)
   if args.run:
     process(args.db, args.jobs, prod_run=False)
   elif args.prodrun:
     process(args.db, args.jobs, prod_run=True)
+  elif args.analyze:
+    if args.db == default_db:
+      logging.error("expected additional argument '--db <path/to/db/to/analyze'")
+      exit(1)
+    analyze(args.db)
   else:
     logging.error("expected --run or --prodrun.")
 
diff --git a/bv-evaluation/requirements.txt b/bv-evaluation/requirements.txt
@@ -0,0 +1,10 @@
+importlib_metadata==8.6.1
+numpy==2.2.2
+pandas==2.2.3
+python-dateutil==2.9.0.post0
+pytz==2025.1
+six==1.17.0
+tabulate==0.9.0
+tzdata==2025.1
+visidata==3.1.1
+zipp==3.21.0