[NeurIPS 2023 Filter Track] rubignn #213

Open · wants to merge 17 commits into base: main
3 changes: 3 additions & 0 deletions .github/workflows/neurips23.yml
@@ -78,6 +78,9 @@ jobs:
- algorithm: pyanns
dataset: random-xs
track: ood
- algorithm: rubignn
dataset: random-filter-s
track: filter
fail-fast: false

steps:
22 changes: 22 additions & 0 deletions neurips23/filter/rubignn/Dockerfile
@@ -0,0 +1,22 @@
FROM neurips23

RUN apt-get update && apt-get install -y python3-numpy python3-scipy python3-pip build-essential git axel wget
RUN wget https://aka.ms/downloadazcopy-v10-linux && mv downloadazcopy-v10-linux azcopy.tgz && tar xzf azcopy.tgz --transform 's!^[^/]\+\($\|/\)!azcopy_folder\1!'
RUN cp azcopy_folder/azcopy /usr/bin

RUN pip3 install -U pip

RUN apt update
RUN apt install -y software-properties-common
RUN add-apt-repository -y ppa:git-core/ppa
RUN apt update
RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10

WORKDIR /home/app
RUN git clone https://github.com/rutgers-db/ru-bignn-23.git --branch main
WORKDIR /home/app/ru-bignn-23
RUN python3 -m pip install -r requirements_py3.10.txt
RUN mkdir build
RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
RUN cmake --build build -- -j
WORKDIR /home/app
60 changes: 60 additions & 0 deletions neurips23/filter/rubignn/README.md
@@ -0,0 +1,60 @@
# Submission for NeurIPS'23 Big-ANN Filter Track of team rubignn

Our method extends Filtered-DiskANN to support thousands of attributes and multi-filter search.

Here is the repo for our complete code: [https://github.com/rutgers-db/ru-bignn-23](https://github.com/rutgers-db/ru-bignn-23)

## Download prebuilt index file

sas_string: `sp=rl&st=2023-10-31T01:24:17Z&se=2023-12-01T10:24:17Z&spr=https&sv=2022-11-02&sr=c&sig=1Gk9nCu3%2FdvHZ4IyldHo161Swb5eCaRs%2FXXCnz5JEaU%3D`

sas_url: `https://rubignn.blob.core.windows.net/biganncontest-96?sp=rl&st=2023-10-31T01:24:17Z&se=2023-12-01T10:24:17Z&spr=https&sv=2022-11-02&sr=c&sig=1Gk9nCu3%2FdvHZ4IyldHo161Swb5eCaRs%2FXXCnz5JEaU%3D`

blob_prefix: `https://rubignn.blob.core.windows.net/biganncontest-96//index_file_96_R10L70`

Command to download the index files:

```
INDEX_FILE_PATH=/home/ubuntu/built_index
azcopy copy 'https://rubignn.blob.core.windows.net/biganncontest-96//index_file_96_R10L70?sp=rl&st=2023-10-31T01:24:17Z&se=2023-12-01T10:24:17Z&spr=https&sv=2022-11-02&sr=c&sig=1Gk9nCu3%2FdvHZ4IyldHo161Swb5eCaRs%2FXXCnz5JEaU%3D' $INDEX_FILE_PATH --recursive
```

## Run Searching on Docker

1. Download the index file

2. Build the Docker image with `python install.py --neurips23track filter --algorithm rubignn`

3. Execute the search in Docker:

Run `docker_run_container_search.sh`. **Note: you may need to modify the directory paths (`CONTEST_REPO_PATH` and `INDEX_FILE_PATH`).**

## docker_run_container_search script

This is the main script: it mounts the directories, runs the container, performs the search, and generates the results.

After building the container, it executes these commands inside the container:

1. `mkdir -p /home/app/results/neurips23/filter/yfcc-10M/10/rubignn`: create the output directory

2. `cd /home/app/ru-bignn-23/build && ./apps/search_contest --index_path_prefix /home/app/index_file/yfcc_R10_L70_SR96_stitched_index_label --query_file /home/app/data/yfcc100M/query.public.100K.u8bin --L 50 80 90 100 110 120 130 --query_filters_file /home/app/data/yfcc100M/query.metadata.public.100K.spmat --result_path_prefix /home/app/results/neurips23/filter/yfcc-10M/10/rubignn/rubignn --runs 5`: execute the search; it takes these parameters:

`--index_path_prefix`: directory and prefix of the index files;
`--query_file`: path to the query file;
`--query_filters_file`: path to the query filters file;
`--result_path_prefix`: path where the results are stored;
`--runs`: run each search multiple times and keep the best result, as `run.py` does;
`--search_list` (or `--L`): the search parameters.


3. `python3 ../contest-scripts/output_bin_to_hdf5.py /home/app/results/neurips23/filter/yfcc-10M/10/rubignn/rubignn_search_metadata.txt /home/app`: convert the raw binary results to HDF5 results.
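
For reference, a minimal sketch of how one of these binary result files can be read back, following the layout assumed by `load_bin_result` in `output_bin_to_hdf5.py` (two `uint32` header values, then k = 10 neighbor ids per query); the path is illustrative:

```
import numpy as np

# Path follows the result_path_prefix + "_L<L>_idx_uint32.bin" naming used by the wrapper.
path = "/home/app/results/neurips23/filter/yfcc-10M/10/rubignn/rubignn_L80_idx_uint32.bin"
raw = np.fromfile(path, dtype=np.uint32)
neighbors = raw[2:].reshape((-1, 10))  # skip the 2-value header; k = 10 ids per query
print(neighbors.shape)                 # (num_queries, 10)
```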

## Build Index on Docker

Execute the build script: `docker_run_container_build.sh`.

## Build and search on random-filter-s

Execute the build script: `docker_run_small_test.sh`.

We have integrated the bash script into the contest framework, so you can now run `python3 run.py --algorithm rubignn --max-n-algorithms 2 --neurips23track filter --dataset random-filter-s`. However, this adds extra search overhead, because the index is reloaded on every search.
36 changes: 36 additions & 0 deletions neurips23/filter/rubignn/config.yaml
@@ -0,0 +1,36 @@
yfcc-10M:
rubignn:
docker-tag: neurips23-filter-rubignn
module: neurips23.filter.rubignn.rubignn
constructor: rubignn
base-args: ["@metric"]
run-groups:
base:
args: |
[{"R": 10, "L": 70, "stitched_R":96}]
query-args: |
[{"L": 80},
{"L": 90},
{"L": 95},
{"L": 100},
{"L": 105},
{"L": 110},
{"L": 120},
{"L": 130}
]
random-filter-s:
rubignn:
docker-tag: neurips23-filter-rubignn
module: neurips23.filter.rubignn.rubignn
constructor: rubignn
base-args: ["@metric"]
run-groups:
base:
args: |
[{"R": 8, "L": 20, "stitched_R":16}]
query-args: |
[{"L": 10},
{"L": 20},
{"L": 30},
{"L": 40}
]
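
Each `query-args` entry supplies the `L` value that `set_query_arguments` in `rubignn.py` (shown below) stores as `L_search` and forwards to the `search_contest` binary; a rough sketch of that mapping, using the yfcc-10M values above:

```
# Sketch of the query-args -> search parameter mapping (an assumption based
# on set_query_arguments/filtered_query in rubignn.py below).
query_args_groups = [{"L": 80}, {"L": 90}, {"L": 95}, {"L": 100},
                     {"L": 105}, {"L": 110}, {"L": 120}, {"L": 130}]
for qa in query_args_groups:
    L_search = qa.get("L", 100)  # 100 mirrors the fallback in set_query_arguments
    print("search_contest ... -L", L_search)
```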
6 changes: 6 additions & 0 deletions neurips23/filter/rubignn/docker_run_container_build.sh
@@ -0,0 +1,6 @@
CONTEST_REPO_PATH=/home/ubuntu/big-ann-benchmarks #path to big-ann-benchmarks directory
INDEX_FILE_PATH=/home/ubuntu/built_index #path to index_file directory

docker container run -it --memory='15g' --mount type=bind,src=$CONTEST_REPO_PATH/results,dst=/home/app/results --mount type=bind,src=$INDEX_FILE_PATH/index_file_docker_build,dst=/home/app/index_file --read-only --mount type=bind,src=$CONTEST_REPO_PATH/data,dst=/home/app/data --entrypoint '/bin/bash' neurips23-filter-rubignn -c 'cd /home/app/ru-bignn-23/build &&
./apps/base_label_to_label_file /home/app/data/yfcc100M/base.metadata.10M.spmat /home/app/index_file/label_file_base_yfcc10m_filter.txt &&
./apps/build_stitched_index --data_type uint8 --data_path /home/app/data/yfcc100M/base.10M.u8bin.crop_nb_10000000 --index_path_prefix /home/app/index_file/yfcc_R10_L70_SR96_stitched_index_label -R 10 -L 70 --stitched_R 96 --alpha 1.2 --label_file /home/app/index_file/label_file_base_yfcc10m_filter.txt --universal_label 0'
7 changes: 7 additions & 0 deletions neurips23/filter/rubignn/docker_run_container_search.sh
@@ -0,0 +1,7 @@
CONTEST_REPO_PATH=/home/ubuntu/big-ann-benchmarks #path to big-ann-benchmarks directory
INDEX_FILE_PATH=/home/ubuntu/built_index #path to index_file directory

docker container run -it --mount type=bind,src=$CONTEST_REPO_PATH/results,dst=/home/app/results --mount type=bind,src=$INDEX_FILE_PATH/index_file_docker_build,dst=/home/app/index_file --read-only --mount type=bind,src=$CONTEST_REPO_PATH/data,dst=/home/app/data --entrypoint '/bin/bash' neurips23-filter-rubignn -c 'mkdir -p /home/app/results/neurips23/filter/yfcc-10M/10/rubignn &&
cd /home/app/ru-bignn-23/build &&
./apps/search_contest --index_path_prefix /home/app/index_file/yfcc_R10_L70_SR96_stitched_index_label --query_file /home/app/data/yfcc100M/query.public.100K.u8bin --search_list 80 90 95 100 105 110 120 130 --query_filters_file /home/app/data/yfcc100M/query.metadata.public.100K.spmat --result_path_prefix /home/app/results/neurips23/filter/yfcc-10M/10/rubignn/rubignn --runs 5 &&
python3 ../contest-scripts/output_bin_to_hdf5.py /home/app/results/neurips23/filter/yfcc-10M/10/rubignn/rubignn_search_metadata.txt /home/app'
15 changes: 15 additions & 0 deletions neurips23/filter/rubignn/docker_run_small_test.sh
@@ -0,0 +1,15 @@
CONTEST_REPO_PATH=/home/ubuntu/big-ann-benchmarks #path to big-ann-benchmarks directory
INDEX_FILE_PATH=/home/ubuntu/built_index #path to index_file directory

mkdir -p $INDEX_FILE_PATH/index_file_docker_build # make sure the index file directory exists

docker container run -it --mount type=bind,src=$CONTEST_REPO_PATH/results,dst=/home/app/results --mount type=bind,src=$INDEX_FILE_PATH/index_file_docker_build,dst=/home/app/index_file --read-only --mount type=bind,src=$CONTEST_REPO_PATH/data,dst=/home/app/data neurips23-filter-rubignn /bin/bash -c 'cd /home/app/ru-bignn-23/build &&
./apps/base_label_to_label_file /home/app/data/random-filter100000/data_metadata_100000_50 /home/app/index_file/label_file_base_random-filter-s_filter.txt &&
./apps/build_stitched_index --data_type float --data_path /home/app/data/random-filter100000/data_100000_50 --index_path_prefix /home/app/index_file/random-filter-s_R16_L80_SR96_stitched_index_label -R 16 -L 80 --stitched_R 96 --alpha 1.2 --label_file /home/app/index_file/label_file_base_random-filter-s_filter.txt --universal_label 0'



docker container run -it --mount type=bind,src=$CONTEST_REPO_PATH/results,dst=/home/app/results --mount type=bind,src=$INDEX_FILE_PATH/index_file_docker_build,dst=/home/app/index_file --read-only --mount type=bind,src=$CONTEST_REPO_PATH/data,dst=/home/app/data neurips23-filter-rubignn /bin/bash -c 'mkdir -p /home/app/results/neurips23/filter/random-filter-s/10/rubignn &&
cd /home/app/ru-bignn-23/build &&
./apps/search_contest --index_path_prefix /home/app/index_file/random-filter-s_R16_L80_SR96_stitched_index_label --query_file /home/app/data/random-filter100000/queries_1000_50 --search_list 50 80 100 --query_filters_file /home/app/data/random-filter100000/queries_metadata_100000_50 --result_path_prefix /home/app/results/neurips23/filter/random-filter-s/10/rubignn/rubignn --runs 2 --dataset random-filter-s --data_type float &&
python3 ../contest-scripts/output_bin_to_hdf5.py /home/app/results/neurips23/filter/random-filter-s/10/rubignn/rubignn_search_metadata.txt /home/app'
6 changes: 6 additions & 0 deletions neurips23/filter/rubignn/download_index.sh
@@ -0,0 +1,6 @@
INDEX_FILE_PATH=/home/ubuntu/built_index
azcopy copy 'https://rubignn.blob.core.windows.net/biganncontest-96//index_file_96_R10L70?sp=rl&st=2023-10-31T01:24:17Z&se=2023-12-01T10:24:17Z&spr=https&sv=2022-11-02&sr=c&sig=1Gk9nCu3%2FdvHZ4IyldHo161Swb5eCaRs%2FXXCnz5JEaU%3D' $INDEX_FILE_PATH --recursive

#another index
INDEX_FILE_PATH=/home/ubuntu/built_index
azcopy copy 'https://rubignn.blob.core.windows.net/biganncontest-96/index_file_96_R16L80?sp=rl&st=2023-10-31T01:24:17Z&se=2023-12-01T10:24:17Z&spr=https&sv=2022-11-02&sr=c&sig=1Gk9nCu3%2FdvHZ4IyldHo161Swb5eCaRs%2FXXCnz5JEaU%3D' $INDEX_FILE_PATH --recursive
102 changes: 102 additions & 0 deletions neurips23/filter/rubignn/output_bin_to_hdf5.py
@@ -0,0 +1,102 @@
from __future__ import absolute_import

import h5py
import json
import os
import re
import numpy as np
import sys
import pandas as pd

def load_bin_result(path):
    # The result binary appears to start with two uint32 header values
    # (presumably the number of queries and k), followed by k = 10
    # neighbor ids per query.
    results = np.fromfile(path, dtype=np.uint32)
    results = results[2:]  # skip the header
    results = results.reshape((-1, 10))
    return results

def get_result_filename(dataset=None, count=None, build_args=None, algorithm=None,
                        query_arguments=None, neurips23track=None, runbook_path=None):
d = ['results']
if neurips23track and neurips23track != 'none':
d.append('neurips23')
d.append(neurips23track)
if neurips23track == 'streaming':
            if runbook_path is None:
raise RuntimeError('Need runbook_path to store results')
else:
d.append(os.path.split(runbook_path)[1])
if dataset:
d.append(dataset)
if count:
d.append(str(count))
if algorithm:
d.append(algorithm)

if build_args:
data = build_args + str(query_arguments)
data = re.sub(r'\W+', '_', json.dumps(data, sort_keys=True)).strip('_')
if len(data) > 150:
data = data[-149:]
d.append(data)
return os.path.join(*d)

def add_results_to_h5py(f, search_type, results, count, suffix = ''):
if search_type == "knn" or search_type == "knn_filtered":
neighbors = f.create_dataset('neighbors' + suffix, (len(results), count), 'i', data = results)
else:
raise NotImplementedError()


def store_results(dataset, count, definition, query_arguments,
                  attrs, results, search_type, neurips23track='filter', runbook_path=None):
    # Note: __main__ below builds the filename and writes the HDF5 file
    # directly; arguments are mapped by keyword here because the original
    # positional call did not match the local get_result_filename signature.
    fn = get_result_filename(
        dataset, count, definition, algorithm=None,
        query_arguments=query_arguments, neurips23track=neurips23track,
        runbook_path=runbook_path) + '.hdf5'
head, tail = os.path.split(fn)
if not os.path.isdir(head):
os.makedirs(head)
f = h5py.File(name=fn, mode='w', libver='latest')
for k, v in attrs.items():
f.attrs[k] = v


add_results_to_h5py(f, search_type, results, count)
f.close()

if __name__ == "__main__":
    args = sys.argv[1:]
    # Load the search metadata CSV written by search_contest, then convert
    # each referenced binary result file to HDF5.
    search_metadata_path = args[0]
    result_prefix = args[1]
    search_metadata = pd.read_csv(
        search_metadata_path,
        names=['result_bin_path', 'build_time', 'index_size', 'algo', 'dataset',
               'best_search_time', 'name', 'query_argument', 'run_count',
               'distance', 'type', 'count', 'search_times'])
for _,row in search_metadata.iterrows():
fn = get_result_filename(row['dataset'], row['count'], "R10_L70_SR96_",row['algo'], row['query_argument'], 'filter', None) + '.hdf5'
fn = os.path.join(result_prefix,fn)
head, tail = os.path.split(fn)
print(fn)
name_with_para = "rubignn(('R10_L70_SR96_', {{'search_list': {} }}))".format(row['query_argument'])
attrs = {
"best_search_time": row['best_search_time'],
"name": name_with_para,
"run_count": row['run_count'],
"distance": row['distance'],
"type": row['type'],
"count": int(row['count']),
"search_times": [float(x_) for x_ in str(row['search_times']).split(" ")],
"build_time":row['build_time'],
"index_size":row['index_size'],
"algo":row['algo'],
"dataset":row['dataset']
}

results = load_bin_result(row['result_bin_path'])
if not os.path.isdir(head):
os.makedirs(head)
f = h5py.File(name=fn, mode='w', libver='latest')
for k, v in attrs.items():
f.attrs[k] = v
add_results_to_h5py(f, row['type'], results, int(row['count']))
f.close()
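        # Example (not part of the original script): each written file can be
        # inspected afterwards with h5py, e.g.
        #   with h5py.File(fn, "r") as g:
        #       print(dict(g.attrs))
        #       print(g["neighbors"][:3])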




116 changes: 116 additions & 0 deletions neurips23/filter/rubignn/rubignn.py
@@ -0,0 +1,116 @@
from neurips23.filter.base import BaseFilterANN
from benchmark.datasets import DATASETS
from benchmark.dataset_io import download_accelerated
import subprocess
import os
import pathlib
import numpy as np

class rubignn(BaseFilterANN):
def __init__(self, metric, index_params):
self._index_params=index_params
self._metric = metric
self.method_name="rubignn"
print(index_params)

def load_bin_result(self,path,k):
results = np.fromfile(path,dtype=np.uint32)
results = results[2:]
results = results.reshape((-1,k))
return results

def get_results(self):
return self.load_bin_result(self.result_path,self.k)

    def filtered_query(self, X, filter, k):
        """
        Integrated for the CI test; it has the overhead of loading the
        index on every search, so please use our custom setup for the
        real yfcc-10M test.
        """
self.k=k
print("We have custom setup")
print(X.shape,k)
qs_file_name=str(os.path.join(self.ds.basedir, self.ds.qs_fn))
print(qs_file_name)
qs_metadata_name = str(os.path.join(self.ds.basedir, self.ds.qs_metadata_fn))
print(qs_metadata_name)
result_path_prefix=str(os.path.join("/home/app/results/neurips23/filter",self.dataset,str(k),self.method_name))
print(result_path_prefix)
cmd = ['./ru-bignn-23/build/apps/search_contest','--data_type',self.data_dtype,'--index_path_prefix',self.index_path,'--query_file',qs_file_name,'--query_filters_file',qs_metadata_name,'-L',str(self.L_search),"--runs","1","--dataset",self.dataset,"--result_path_prefix",result_path_prefix+"/rubignn"]
subprocess.run(cmd)
self.result_path=result_path_prefix+"/rubignn_L"+str(self.L_search)+"_idx_uint32.bin"




    def get_index_prefix(self):
        # Concatenate the build parameters into a file-name prefix, e.g.
        # {"R": 10, "L": 70, "stitched_R": 96} -> "R10-L70-stitched_R96-stitched_index".
        res = ""
        for key in self._index_params:
            res = res + str(key) + str(self._index_params[key]) + "-"
        res = res + "stitched_index"
        return res

def fit(self, dataset):
ds = DATASETS[dataset]()
print("params:",self._index_params)
data_path = ds.get_dataset_fn()
metadata_path = '/home/app/'+ str(os.path.join(ds.basedir, ds.ds_metadata_fn))
index_file_prefix='/home/app/data/index_file/'
pathlib.Path(index_file_prefix).mkdir(parents=True, exist_ok=True)
label_file_path = index_file_prefix+'label_file_base_'+dataset+'_filter.txt'
subprocess.run(['./ru-bignn-23/build/apps/base_label_to_label_file',metadata_path,label_file_path])
index_prefix = index_file_prefix+dataset+"-"+self.get_index_prefix()
print(index_prefix)
        self.data_dtype = 'uint8'
        self.ds = ds
        self.dataset = dataset
        if ds.dtype == "float32":
            self.data_dtype = 'float'

cmd = ['./ru-bignn-23/build/apps/build_stitched_index','--data_type',self.data_dtype,'--data_path','/home/app/'+data_path,'--index_path_prefix',index_prefix,'-R',str(self._index_params['R']),'-L',str(self._index_params['L']),'--stitched_R',str(self._index_params['stitched_R']),'--alpha','1.2','--label_file',label_file_path,'--universal_label','0']
print(' '.join(cmd))
subprocess.run(cmd)
self.index_path=index_prefix
print("index_path:",self.index_path)
if os.path.isfile(self.index_path):
print('Build Index Success!')
else:
print("fail,",self.index_path)
assert(False)


    def set_query_arguments(self, query_args):
        print("setting query args: ", query_args)
        if "L" in query_args:
            self.L_search = query_args["L"]
        else:
            self.L_search = 100
        self.qas = query_args

def load_index(self, dataset):
"""
Load the index for dataset. Returns False if index
is not available, True otherwise.

Checking the index usually involves the dataset name
        and the index build parameters passed during construction.
"""

return False

def __str__(self):
return f'rubignn({self._index_params, self.qas})'
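
For context, here is a minimal end-to-end sketch of how this wrapper is driven (an assumption based on the standard `BaseFilterANN` flow in the contest framework; the metric string and query array are illustrative placeholders):

```
import numpy as np
from neurips23.filter.rubignn.rubignn import rubignn

# Hypothetical driver mirroring what run.py does for one run-group entry.
algo = rubignn("euclidean", {"R": 8, "L": 20, "stitched_R": 16})
algo.fit("random-filter-s")                # builds the stitched index via subprocess
algo.set_query_arguments({"L": 20})
queries = np.zeros((1000, 50), dtype=np.float32)  # placeholder; real queries are read from disk
algo.filtered_query(queries, None, 10)     # invokes search_contest on the query files
ids = algo.get_results()                   # (num_queries, 10) array of neighbor ids
```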

3 changes: 3 additions & 0 deletions neurips23/filter/rubignn/search_runner.sh
@@ -0,0 +1,3 @@
mkdir -p $CONTEST_REPO_PATH/results/neurips23/filter/yfcc-10M/10/rubignn
cd /home/app/ru-bignn-23/build
./apps/search_contest --index_path_prefix /home/app/index_file/yfcc_R16_L80_SR80_stitched_index_label --query_file /home/app/data/yfcc100M/query.public.100K.u8bin --search_list 50 80 90 100 110 120 130 --query_filters_file /home/app/data/yfcc100M/query.metadata.public.100K.spmat --result_path_prefix /home/app/results/neurips23/filter/yfcc-10M/10/rubignn/rubignn