Develop #83

Closed
wants to merge 38 commits into from

Changes from all commits
38 commits
2d42467
:label: Add response model
JohnGiorgi Mar 11, 2021
b6c41e0
:white_check_mark: Update tests
JohnGiorgi Mar 11, 2021
8ab9f76
Merge pull request #62 from PathwayCommons/add-response-model
JohnGiorgi Mar 12, 2021
286b58f
ncbi updated from pubmed-dl
Anwesh1 Mar 12, 2021
bb98563
Add missing bio python depedency
JohnGiorgi Mar 16, 2021
2f35134
Hotfix of UID only requests
JohnGiorgi Mar 18, 2021
8b62c9a
:book: Document some wacky code
JohnGiorgi Mar 18, 2021
24a8161
Merge pull request #63 from PathwayCommons/ncbi_update
JohnGiorgi Mar 18, 2021
b9e4f35
:recycle: Replace validator with Field
JohnGiorgi Mar 23, 2021
456de19
:white_check_mark: Update tests
JohnGiorgi Mar 23, 2021
759c164
:label: Fix type checking
JohnGiorgi Mar 23, 2021
4feff2d
Merge pull request #66 from PathwayCommons/replace-top-k-validator-wi…
JohnGiorgi Mar 25, 2021
67d0f46
top_k fix with query
Anwesh1 Mar 26, 2021
3ebe577
:art: Black format codebase
JohnGiorgi Mar 29, 2021
99b05ab
:books: Document new top_k behaviour
JohnGiorgi Mar 29, 2021
ef1f28f
Merge pull request #71 from PathwayCommons/top_k_issue
Anwesh1 Mar 29, 2021
1d4bbc5
Moved validator code into main
Anwesh1 Mar 31, 2021
19efce9
Merge branch 'develop' into repeat_indexing
Anwesh1 Mar 31, 2021
869a1bb
:recycle: Don't fetch text if id is indexed
JohnGiorgi Mar 31, 2021
5c6ed38
:art: Black format the codebase
JohnGiorgi Mar 31, 2021
cfe76fc
:bug: Fix typo in uid
JohnGiorgi Mar 31, 2021
ea3ac77
Merge pull request #73 from PathwayCommons/repeat_indexing
JohnGiorgi Mar 31, 2021
3d3197b
:recycle: Update schema
JohnGiorgi Mar 25, 2021
498af21
:books: Update documentation to reflect new schema
JohnGiorgi Mar 25, 2021
de0ce46
:books: Add docstring description to search endpoint
JohnGiorgi Mar 25, 2021
320aa33
:label: Ignore the types
JohnGiorgi Apr 1, 2021
5a15277
Merge pull request #68 from PathwayCommons/update-schema
JohnGiorgi Apr 1, 2021
4fd6a0e
:construction_worker: Fix code cov upload
JohnGiorgi Apr 5, 2021
9f6d468
:books: Update codecov badge
JohnGiorgi Apr 5, 2021
49778bf
:construction_worker: Add develop branch to CI
JohnGiorgi Apr 5, 2021
f624a46
:construction_worker: Try fixing coverage upload again
JohnGiorgi Apr 5, 2021
cb2e433
Merge pull request #74 from PathwayCommons/fix-code-cov
JohnGiorgi Apr 5, 2021
b25050b
:recycle: Remove unused validator
JohnGiorgi Apr 8, 2021
a52492f
Merge pull request #76 from PathwayCommons/remove-unused-validator
JohnGiorgi Apr 9, 2021
58e634b
:books: Better docstring example
JohnGiorgi Apr 14, 2021
e212d74
:books: Don't code format JSON
JohnGiorgi Apr 14, 2021
3854080
Merge pull request #82 from PathwayCommons/better-readme-example
JohnGiorgi Apr 15, 2021
e9bb213
throw error when uid invalid
Anwesh1 Apr 15, 2021
10 changes: 5 additions & 5 deletions .github/workflows/build.yml
@@ -1,13 +1,13 @@
 # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 
-name: build
+name: CI
 
 on:
   push:
-    branches: [ master ]
+    branches: [ master, develop ]
   pull_request:
-    branches: [ master ]
+    branches: [ master, develop ]
 
 jobs:
   build:
@@ -45,8 +45,8 @@ jobs:
         pytest tests --cov ./semantic_search --cov-report=xml --cov-config=./.coveragerc
     - name: Upload coverage to Codecov
       # We don't want to push coverge for every job in the matrix.
-      # Rather arbitrarily, choose to push on Ubuntu with Python 3.8.
-      if: matrix.python-version == '3.9' && matrix.os == 'ubuntu-latest' && github.event_name == 'push'
+      # Rather arbitrarily, choose to push on Ubuntu with Python 3.7.
+      if: matrix.python-version == '3.7' && matrix.os == 'ubuntu-latest' && (github.event_name == 'push' || github.event_name == 'pull_request')
      uses: codecov/[email protected]
      with:
        file: ./coverage.xml
102 changes: 48 additions & 54 deletions README.md
@@ -1,5 +1,5 @@
 ![build](https://github.com/PathwayCommons/semantic-search/workflows/build/badge.svg)
-[![codecov](https://codecov.io/gh/PathwayCommons/semantic-search/branch/master/graph/badge.svg)](https://codecov.io/gh/PathwayCommons/semantic-search)
+[![codecov](https://codecov.io/gh/PathwayCommons/semantic-search/branch/master/graph/badge.svg?token=K7444IQC9I)](https://codecov.io/gh/PathwayCommons/semantic-search)
 [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
 ![GitHub](https://img.shields.io/github/license/PathwayCommons/semantic-search?color=blue)
 
@@ -49,64 +49,58 @@ To provide arguments to the server, pass them as environment variables, e.g.:
 CUDA_DEVICE=0 MAX_LENGTH=384 uvicorn semantic_search.main:app
 ```
 
-Once the server is running, you can make a POST request with:
+Once the server is running, you can make a POST request to the `/search` endpoint with a JSON body. E.g.
 
-1. JSON body that is self-contained. Provide the text in `query` and text in `documents` to search against. Sample JSON request:
-
-    ```json
-    {
-        "query": {
-            "uid":"9887103",
-            "text": "The Drosophila activin receptor baboon signals through dSmad2 and controls cell proliferation but not patterning during larval development. The TGF-beta superfamily of growth and differentiation factors, including TGF-beta, Activins and bone morphogenetic proteins (BMPs) play critical roles in regulating the development of many organisms..."
-        },
-        "documents":[
-            {
-                "uid": "9887103",
-                "text": "The Drosophila activin receptor baboon signals through dSmad2 and controls cell proliferation but not patterning during larval development. The TGF-beta superfamily of growth and differentiation factors, including TGF-beta, Activins and bone morphogenetic proteins (BMPs) play critical roles in regulating the development of many organisms..."
-            },
-            {
-                "uid": "30049242",
-                "text": "Transcriptional up-regulation of the TGF-β intracellular signaling transducer Mad of Drosophila larvae in response to parasitic nematode infection. The common fruit fly Drosophila melanogaster is an exceptional model for dissecting innate immunity..."
-            },
-            {
-                "uid": "22936248",
-                "text": "High-fidelity promoter profiling reveals widespread alternative promoter usage and transposon-driven developmental gene expression. Many eukaryotic genes possess multiple alternative promoters with distinct expression specificities..."
-            }
-        ],
-        "top_k":2
-    }
-    ```
-
-    The return value is a JSON representation of the `top_k` most similar documents (default: return all, except the query itself):
-
-    ```json
-    [
-        {
-            "uid": 30049242,
-            "score": 0.6427373886108398
-        },
-        {
-            "uid": 22936248,
-            "score": 0.49102723598480225
-        }
-    ]
-    ```
-
-    - NB: In this case, each `uid` in `documents` should be unique, but otherwise have no meaning.
-
-2. JSON body that references PubMed article uids. Sample JSON request:
-
-    ```json
-    {
-        "query": "9887103",
-        "documents": ["9887103", "30049242", "22936248"],
-        "top_k": 2
-    }
-    ```
-
-- Notes:
-    - For each Document element, the text consists of the `ArticleTitle` appended to `Abstract` for that PubMed article. See [pubmed DTD](https://dtd.nlm.nih.gov/ncbi/pubmed/doc/out/180101/index.html)
-    - JSON body may consist of either objects (as in Case 1) or PMID strings for `query` and elements of `documents`. However, the elements of `documents` must either be all be a single type.
+```json
+{
+    "query": {
+        "uid": "9887103",
+        "text": "The Drosophila activin receptor baboon signals through dSmad2 and controls cell proliferation but not patterning during larval development."
+    },
+    "documents": [
+        {
+            "uid": "10320478",
+            "text": "Drosophila dSmad2 and Atr-I transmit activin/TGFbeta signals. "
+        },
+        {
+            "uid": "22563507",
+            "text": "R-Smad competition controls activin receptor output in Drosophila. "
+        },
+        {
+            "uid": "18820452",
+            "text": "Distinct signaling of Drosophila Activin/TGF-beta family members. "
+        },
+        {
+            "uid": "10357889"
+        },
+        {
+            "uid": "31270814"
+        }
+    ],
+    "top_k": 3
+}
+```
+
+The return value is a JSON representation of the `top_k` most similar documents (defaults to 10):
+
+```json
+[
+    {
+        "uid": "10320478",
+        "score": 0.6997108459472656
+    },
+    {
+        "uid": "22563507",
+        "score": 0.6877762675285339
+    },
+    {
+        "uid": "18820452",
+        "score": 0.6436074376106262
+    }
+]
+```
+
+If `"text"` is not provided, we assume `"uid"`s are valid PMIDs and fetch the title and abstract text before embedding, indexing and searching.
 
 ### Running via Docker
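For reference, a request like the one documented in the updated README can also be issued from Python. This is only a sketch: the `requests` package and the default local uvicorn address (`http://localhost:8000`) are assumptions, not part of this PR; the body mirrors the README example but uses bare PMIDs so the server fetches the missing text.

```python
# Sketch of a client for the /search endpoint documented above.
# Assumes `pip install requests` and a server started with `uvicorn semantic_search.main:app`
# on the default port; both are assumptions, not part of this PR.
import requests

body = {
    "query": {"uid": "9887103"},  # no "text": the server fetches the title/abstract from PubMed
    "documents": [{"uid": "10320478"}, {"uid": "22563507"}, {"uid": "18820452"}],
    "top_k": 2,
}

response = requests.post("http://localhost:8000/search", json=body)
response.raise_for_status()
for match in response.json():  # a list of {"uid": ..., "score": ...} objects
    print(match["uid"], match["score"])
```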
11 changes: 11 additions & 0 deletions semantic_search/common/util.py
@@ -6,6 +6,10 @@
 import torch
 import typer
 from transformers import AutoModel, AutoTokenizer, PreTrainedModel, PreTrainedTokenizer
+from semantic_search.schemas import Document
+from semantic_search.ncbi import uids_to_docs
+
+UID = str
 
 
 class Emoji(Enum):
@@ -113,3 +117,10 @@ def add_to_faiss_index(ids: List[int], embeddings: np.ndarray, index: faiss.Inde
     ids = np.asarray(ids).astype("int64")
     embeddings = embeddings.astype("float32")
     index.add_with_ids(embeddings, ids)
+
+
+def normalize_documents(pmids: List[str]) -> str:
+    normalized_docs = []
+    for doc in pmids:
+        normalized_docs.append(Document(**list(uids_to_docs([doc]))[0][0]))
+    return normalized_docs[0].text  # type: ignore
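The new `normalize_documents` helper is what lets `main.py` backfill missing text for bare PMIDs. A usage sketch (illustrative only: it assumes the package is installed and NCBI is reachable, and note that although it takes a list it returns the text of the first uid only):

```python
# Illustrative use of the new helper; requires the semantic_search package and
# network access to NCBI. The slicing/printing below is just for demonstration.
from semantic_search.common.util import normalize_documents

# Fetches the PubMed record for the given PMID via uids_to_docs and returns its
# title + abstract as a single string (only the first uid in the list is used).
text = normalize_documents(["9887103"])
print(text[:60])
```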
63 changes: 41 additions & 22 deletions semantic_search/main.py
@@ -1,9 +1,11 @@
+from datetime import datetime
+from http import HTTPStatus
 from operator import itemgetter
-from typing import Dict, List, Optional, Tuple, Union, cast
+from typing import List, Optional, Tuple, Union, cast
 
 import faiss
 import torch
-from fastapi import FastAPI
+from fastapi import FastAPI, Request
 from pydantic import BaseSettings
 
 from semantic_search import __version__
@@ -12,8 +14,9 @@
     encode_with_transformer,
     setup_faiss_index,
     setup_model_and_tokenizer,
+    normalize_documents,
 )
-from semantic_search.schemas import Model, Query
+from semantic_search.schemas import Model, Search, TopMatch
 
 app = FastAPI(
     title="Scientific Semantic Search",
@@ -83,42 +86,58 @@ def app_startup():
     model.index = setup_faiss_index(embedding_dim)
 
 
-@app.post("/")
-async def query(query: Query) -> List[Dict[str, float]]:
-    ids = [int(doc.uid) for doc in query.documents]
-    texts = [document.text for document in query.documents]
+@app.get("/", tags=["General"])
+def index(request: Request):
+    """Health check."""
+    response = {
+        "message": HTTPStatus.OK.phrase,
+        "method": request.method,
+        "status-code": HTTPStatus.OK,
+        "timestamp": datetime.now().isoformat(),
+        "url": request.url._url,
+    }
+    return response
 
-    # # Ensure that the query is not in the index when we search.
-    # query_id = np.asarray(int(query.query.uid)).reshape(
-    #     1,
-    # )
-    # model.index.remove_ids(query_id)
 
+@app.post("/search", tags=["Search"], response_model=List[TopMatch])
+async def search(search: Search):
+    """Returns the `top_k` most similar documents to `query` from the provided list of `documents`
+    and the index.
+    """
+    ids = [int(doc.uid) for doc in search.documents]
+    texts = [document.text for document in search.documents]
+
     # Only add items to the index if they do not already exist.
     # See: https://github.com/facebookresearch/faiss/issues/859
+    # To do this, we first determine which of the incoming ids do not exist in the index
     indexed_ids = set(faiss.vector_to_array(model.index.id_map).tolist())
-    to_embed = [(id_, text) for id_, text in zip(ids, texts) if id_ not in indexed_ids]
 
+    if search.query.text is None and search.query.uid not in indexed_ids:
+        search.query.text = normalize_documents([search.query.uid])
+
+    for i, (id_, text) in enumerate(zip(ids, texts)):
+        if text is None and id_ not in indexed_ids:
+            texts[i] = normalize_documents([str(id_)])
+
+    # We then embed the corresponding text and update the index
+    to_embed = [(id_, text) for id_, text in zip(ids, texts) if id_ not in indexed_ids]
     if to_embed:
         ids, texts = zip(*to_embed)  # type: ignore
-        embeddings = encode(texts).cpu().numpy()
+        embeddings = encode(texts).cpu().numpy()  # type: ignore
        add_to_faiss_index(ids, embeddings, model.index)
 
     # Can't search for more items than exist in the index
-    top_k = min(model.index.ntotal, query.top_k + 1)
-
+    top_k = min(model.index.ntotal, search.top_k)
     # Embed the query and perform the search
-    query_embedding = encode(query.query.text).cpu().numpy()
+    query_embedding = encode(search.query.text).cpu().numpy()  # type: ignore
     top_k_scores, top_k_indicies = model.index.search(query_embedding, top_k)
 
     top_k_indicies = top_k_indicies.reshape(-1).tolist()
     top_k_scores = top_k_scores.reshape(-1).tolist()
 
-    if int(query.query.uid) in top_k_indicies:
-        index = top_k_indicies.index(int(query.query.uid))
+    if int(search.query.uid) in top_k_indicies:
+        index = top_k_indicies.index(int(search.query.uid))
         del top_k_indicies[index], top_k_scores[index]
-    else:
-        del top_k_indicies[-1], top_k_scores[-1]
 
-    return [{"uid": uid, "score": score} for uid, score in zip(top_k_indicies, top_k_scores)]
+    response = [TopMatch(uid=uid, score=score) for uid, score in zip(top_k_indicies, top_k_scores)]
+    return response
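The duplicate-id guard in the `/search` endpoint relies on reading back the ids already stored in the FAISS index. Below is a minimal, self-contained sketch of that pattern, assuming the index built by `setup_faiss_index` is an `IndexIDMap` over a flat inner-product index (consistent with the `id_map` and `add_with_ids` calls in this diff, though `setup_faiss_index` itself is not shown); the dimension and random vectors stand in for the transformer embeddings.

```python
# Minimal sketch of the "only index new ids" pattern used in /search above.
# The flat inner-product index and 8-dim random vectors are stand-ins, not the
# repository's actual setup_faiss_index configuration.
import faiss
import numpy as np

dim = 8
flat = faiss.IndexFlatIP(dim)
index = faiss.IndexIDMap(flat)

# First request: index two documents keyed by their PMIDs.
ids = np.asarray([9887103, 10320478]).astype("int64")
index.add_with_ids(np.random.rand(len(ids), dim).astype("float32"), ids)

# Second request re-sends 10320478: read back the stored ids and skip it,
# since add_with_ids would otherwise store the same id twice.
indexed_ids = set(faiss.vector_to_array(index.id_map).tolist())
new_ids = [i for i in [10320478, 22563507] if i not in indexed_ids]
if new_ids:
    new_ids = np.asarray(new_ids).astype("int64")
    index.add_with_ids(np.random.rand(len(new_ids), dim).astype("float32"), new_ids)

print(index.ntotal)  # 3 -- each uid is indexed exactly once
```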