Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sprint 14 : Prioritization of quinolones #28

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
365 changes: 365 additions & 0 deletions synthetic-enumeration/sprint-14-quinolones/00-prep-receptor.py

Large diffs are not rendered by default.

105 changes: 105 additions & 0 deletions synthetic-enumeration/sprint-14-quinolones/01-aggregate-compounds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from openeye import oechem
from rich.progress import track

"""
Prepare list of annotated compounds for docking

* Aggregate all compound designs
* Annotate by intermediate
* Sort compounds by size
* Interleave compounds by intermediate

"""

# Every molecule read by this script is collected in this list.
mols = list()

# TODO: Auto-download and timestamp
submissions_csv_filename = 'submissions/submissions-2022-02-24.csv.bz2'

# Read all submitted designs: Compounds with the key substructure will be retained
print('Reading submitted designs...')
import os
import tempfile

import pandas as pd

# Drop columns that cause trouble for OpenEye.
# errors='ignore' keeps the script usable if an export lacks one of these columns.
drop_columns = ['Submission Rationale', 'Submission Notes']
df = pd.read_csv(submissions_csv_filename, dtype=str)
df.drop(columns=drop_columns, inplace=True, errors='ignore')
print(f'{len(df)} records read')

# Write the trimmed table to a plain CSV and let OpenEye parse that file.
# A temporary directory is used instead of an open NamedTemporaryFile because
# reopening a still-open named temporary file by name does not work on every
# platform (notably Windows).
with tempfile.TemporaryDirectory() as tmp_dir:
    csv_path = os.path.join(tmp_dir, 'submissions.csv')
    df.to_csv(csv_path, header=True, index=False)
    # Read file
    with oechem.oemolistream(csv_path) as ifs:
        # OEReadMolecule refills this one object on each call, so a copy is stored.
        mol = oechem.OEGraphMol()
        while oechem.OEReadMolecule(ifs, mol):
            # Clear SD tags
            oechem.OEClearSDData(mol)
            # Store the molecule
            mols.append(mol.CreateCopy())
print(f'{len(mols)} molecules read')

# Aggregate all compound designs
print('reading compound designs...')


def _append_molecules(path, sink):
    """Read each molecule from *path*, clear its SD data, and append a copy to *sink*."""
    with oechem.oemolistream(path) as stream:
        # One scratch molecule is refilled per read; only copies are kept.
        scratch = oechem.OEGraphMol()
        while oechem.OEReadMolecule(stream, scratch):
            oechem.OEClearSDData(scratch)
            sink.append(scratch.CreateCopy())


# Filtered synthetic designs, one CSV per reaction type
source_filenames = [
    'virtual-libraries/amidation-renamed.csv',
    'virtual-libraries/mitsunobu-renamed.csv',
    'virtual-libraries/red_ami-renamed.csv',
    'virtual-libraries/red_amination_1-renamed.csv',
    'virtual-libraries/williamson_ether-renamed.csv',
]
for source_filename in source_filenames:
    print(source_filename)
    _append_molecules(source_filename, mols)
# Running total across everything appended to `mols` so far
print(f'{len(mols)} molecules read')

# Annotate molecules with SMARTS labels
print('Annotating SMARTS labels...')
import csv
labels_filename = 'annotations/annotations.csv' # list of labels for various SMARTS patterns
# Maps SMARTS pattern -> label, in file order
smarts_labels = dict()
with open(labels_filename, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    for row in csvreader:
        # Skip blank lines and comment lines BEFORE reading the second column:
        # a comment such as "# note" has no comma and therefore no row[1].
        if not row or not row[0] or row[0].startswith('#'):
            continue
        if len(row) < 2:
            raise ValueError(
                f'{labels_filename} line {csvreader.line_num}: expected "SMARTS,label", got {row!r}'
            )
        smarts, label = row[0], row[1]
        smarts_labels[smarts] = label
# Label the molecules
for smarts, label in smarts_labels.items():
    ss = oechem.OESubSearch(smarts)
    for mol in track(mols, description=label):
        oechem.OEPrepareSearch(mol, ss)
        if ss.SingleMatch(mol):
            # NOTE(review): presumably a later matching pattern overwrites an
            # earlier 'scaffold' value here — confirm against OESetSDData docs.
            oechem.OESetSDData(mol, 'scaffold', label)
# Discard molecules without labels
mols = [ mol for mol in mols if oechem.OEHasSDData(mol, 'scaffold') ]
print(f'{len(mols)} molecules remain after discarding molecules that do not match scaffold')

# Sort by atom count as reported by NumAtoms(), smallest first
# (this is an atom count, not a molecular weight)
print('Sorting molecules by number of atoms...')
mols.sort(key=lambda mol : mol.NumAtoms())

# Keep only molecules with at most n_heavy_max heavy atoms.
# A stricter cutoff of 35 was tried earlier.
n_heavy_max = 50
mols = [ mol for mol in mols if oechem.OECount(mol, oechem.OEIsHeavy()) <= n_heavy_max ]
print(f'{len(mols)} molecules remain after discarding molecules with more than {n_heavy_max} heavy atoms')

# Write molecules
import os
output_filename = 'sorted/sprint-14.csv'
# Create the destination directory first: an output stream cannot be opened
# inside a directory that does not exist yet (e.g. on a fresh checkout).
output_dir = os.path.dirname(output_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with oechem.oemolostream(output_filename) as ofs:
    for mol in track(mols, description='Writing molecules...'):
        oechem.OEWriteMolecule(ofs, mol)

# Generate PDF
import os
import subprocess

# Swap only the file extension; str.replace would rewrite every '.csv' in the
# path and would return the input path unchanged if there were no '.csv' at all.
pdf_filename = os.path.splitext(output_filename)[0] + '.pdf'
print(f'Generating PDF as {pdf_filename}')
# Argument list instead of a shell string: no shell quoting issues, and the
# exit status is available for reporting.
cmd = ['mols2pdf.py', '-in', output_filename, '-out', pdf_filename]
try:
    completed = subprocess.run(cmd)
except OSError as err:
    # PDF generation stays best-effort: report the problem, do not abort.
    print(f'Could not run {cmd[0]}: {err}')
else:
    if completed.returncode != 0:
        print(f'{cmd[0]} exited with status {completed.returncode}')
Loading