Skip to content

Commit

Permalink
WIP #25 (probably not working)
Browse files Browse the repository at this point in the history
  • Loading branch information
Adafede committed Mar 13, 2024
1 parent dd422ce commit 8f33fb5
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 57 deletions.
21 changes: 11 additions & 10 deletions storage/models/structures_descriptors.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
from sqlalchemy import Index
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import Column, Integer, String, Index, Float, ForeignKey
from sqlalchemy.orm import relationship

from storage.models.base import Base


class StructuresDescriptors(Base):
    """ORM model holding one named molecular descriptor value per row.

    A structure's full descriptor dict is therefore spread across several
    rows, one per (descriptor_name, descriptor_value) pair, each linked
    back to its parent structure via ``structure_id``.
    """

    __tablename__ = "structures_descriptors"

    # TODO this has not been tested, probably not working
    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Link to the owning structure; the backref exposes a ``descriptors``
    # collection on the Structures model.
    structure_id = Column(Integer, ForeignKey("structures.id"))
    structure = relationship("Structures", backref="descriptors")
    # Descriptor label and its numeric value.
    descriptor_name = Column(String)
    descriptor_value = Column(Float)

    # Index to speed lookups by descriptor name.
    # NOTE(review): an index on structure_id would likely help the common
    # "all descriptors of one structure" query as well — confirm and add.
    __table_args__ = (Index("descriptor_id", "descriptor_name"),)

    def __repr__(self):
        return (
            f"StructuresDescriptors(id={self.id}, "
            f"structure_id={self.structure_id}, "
            f"descriptor_name={self.descriptor_name}, "
            f"descriptor_value={self.descriptor_value})"
        )
18 changes: 9 additions & 9 deletions storage/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,15 @@ def upsert_structures(self, structures: list[dict[str, object]]) -> None:
)
session.commit()

# TODO
# def upsert_structures_descriptors(self, descriptors: list[dict[str, object]]) -> None:
# with self.session(autoflush=False) as session:
# for i in range(0, len(descriptors), self.list_limit // 2):
# session.execute(
# insert(StructuresDescriptors),
# descriptors[i : i + self.list_limit // 2],
# )
# session.commit()
# TODO this has not been tested, probably not working
def upsert_structures_descriptors(self, descriptors: list[dict[str, object]]) -> None:
    """Bulk-insert structure descriptor rows in batches.

    Args:
        descriptors: Parameter dicts matching the StructuresDescriptors
            columns, inserted ``self.list_limit // 2`` rows at a time.
    """
    if not descriptors:
        # Nothing to insert; avoid opening a session for an empty commit.
        return
    # Halved batch size, presumably to stay under the backend's
    # bound-parameter limit — TODO confirm against the other upserts.
    batch_size = self.list_limit // 2
    with self.session(autoflush=False) as session:
        for start in range(0, len(descriptors), batch_size):
            session.execute(
                insert(StructuresDescriptors),
                descriptors[start : start + batch_size],
            )
        session.commit()

def upsert_taxo_names(self, taxo_names: list[dict[str, object]]) -> None:
with self.session(autoflush=False) as session:
Expand Down
82 changes: 44 additions & 38 deletions update/generate_database_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,23 @@ def run(path: Path) -> None:
"journal": row[journal_index],
}
journals_dict[int(row[journal_index])] = row[journal_title_index]

references = []
for ref, values in references_dict.items():
references.append(
{
"id": ref,
"doi": values["doi"],
"title": values["title"],
"date": values["date"],
"journal": values["journal"],
}
)
journals = []
for journal, title in journals_dict.items():
journals.append({"id": journal, "title": title})
logging.info("Processed references and journals")

structures_dict = {}
with open(path / "structures_table.csv", "r") as f:
reader = csv.reader(f)
headers = next(reader)
Expand All @@ -77,8 +91,6 @@ def run(path: Path) -> None:
inchikey_index = headers.index("structure_inchikey")
inchikey_no_stereo_index = headers.index("structure_inchikey_no_stereo")
formula_index = headers.index("structure_formula")

structures_dict = {}
for row in reader:
struct_id = int(row[id_index])
structures_dict[struct_id] = {
Expand All @@ -91,22 +103,43 @@ def run(path: Path) -> None:
"inchikey_no_stereo": row[inchikey_no_stereo_index],
"formula": row[formula_index],
}

structures = list(structures_dict.values())
logging.info(" Processed structures")

# TODO this has not been tested, probably not working
descriptors_dict = {}
with open(path / "descriptors_rdkit.csv", "r") as f:
reader = csv.reader(f)
headers = next(reader)
smiles_index = headers.index("smiles")
# Excluding the SMILES column
descriptor_indices = range(1, len(headers))
for row in reader:
smiles = row[smiles_index]
# Assuming SMILES strings are unique identifiers
struct_id = smiles
descriptors_dict[struct_id] = {
headers[i]: float(row[i]) for i in descriptor_indices
}
# In case
descriptors_dict[struct_id]["smiles"] = smiles
descriptors = list(descriptors_dict.values())
logging.info("Processed descriptors")

with open(path / "taxa_names.csv", "r") as f:
reader = csv.reader(f)
headers = next(reader)
taxon_index = headers.index("taxon")
name_index = headers.index("taxon_name")

taxo_names_dict = {int(row[taxon_index]): row[name_index] for row in reader}

taxo_names = []
for taxon, name in taxo_names_dict.items():
taxo_names.append({"id": taxon, "name": name})
logging.info(" Processed taxa names")

# Eventually TODO add taxa_names_com

taxon_ranks_dict = {}

with open(path / "ranks_names.csv", "r") as f:
reader = csv.reader(f)
headers = next(reader)
Expand All @@ -116,58 +149,31 @@ def run(path: Path) -> None:
ranks_names = [
{"id": int(row[rank_index]), "name": row[label_index]} for row in reader
]

logging.info(" Processed rank names")

with open(path / "taxa_ranks.csv", "r") as f:
reader = csv.reader(f)
headers = next(reader)
taxon_index = headers.index("taxon")
rank_index = headers.index("taxon_rank")

for row in reader:
rank_value = convert_to_int_safe(row[rank_index])
if rank_value is not None:
taxon_ranks_dict[int(row[taxon_index])] = {rank_value}

logging.info(" Processed taxa ranks")
taxo_ranks = []
for taxon, ranks in taxon_ranks_dict.items():
for rank in ranks:
taxo_ranks.append({"id": taxon, "rank_id": rank})
taxo_names = []
for taxon, name in taxo_names_dict.items():
taxo_names.append({"id": taxon, "name": name})

structures = list(structures_dict.values())

references = []
for ref, values in references_dict.items():
references.append(
{
"id": ref,
"doi": values["doi"],
"title": values["title"],
"date": values["date"],
"journal": values["journal"],
}
)

journals = []
for journal, title in journals_dict.items():
journals.append({"id": journal, "title": title})

logging.info(" Processed dicts")
logging.info(" Processed taxa ranks")

storage.upsert_taxo_parenting(generate_taxon_parents_with_distance(path))
logging.info(" Taxo parenting inserted")

storage.upsert_triplets(triplets)
logging.info(" Triplets inserted")
storage.upsert_structures(structures)
logging.info(" Structures inserted")
# TODO
# storage.upsert_structures_descriptors(descriptors)
# logging.info(" Structures descriptors inserted")
storage.upsert_structures_descriptors(descriptors)
logging.info(" Structures descriptors inserted")
storage.upsert_references(references)
logging.info(" References inserted")
storage.upsert_journals(journals)
Expand Down

0 comments on commit 8f33fb5

Please sign in to comment.