Skip to content
This repository has been archived by the owner on Aug 13, 2021. It is now read-only.

[361] CORD19 data pipeline #362

Draft
wants to merge 13 commits into
base: dev
Choose a base branch
from
102 changes: 50 additions & 52 deletions nesta/core/orms/arxiv_orm.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""
Arxiv
=====
"""
from sqlalchemy import Table, Column, ForeignKey
from sqlalchemy.dialects.mysql import VARCHAR, TEXT
from sqlalchemy.dialects.mysql import VARCHAR, TEXT, MEDIUMTEXT
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy.types import JSON, DATE, INTEGER, BIGINT, FLOAT, BOOLEAN
Expand All @@ -20,34 +20,35 @@


"""Association table for Arxiv articles and their categories."""
# Many-to-many link table between `arxiv_articles` and `arxiv_categories`.
# Both columns are foreign keys and together they form the composite primary
# key, so a given (article, category) pair can be stored at most once.
article_categories = Table(
    "arxiv_article_categories",
    Base.metadata,
    Column(
        "article_id", VARCHAR(40), ForeignKey("arxiv_articles.id"), primary_key=True
    ),
    Column(
        "category_id", VARCHAR(40), ForeignKey("arxiv_categories.id"), primary_key=True
    ),
)


"""Association table to Microsoft Academic Graph fields of study."""
# Many-to-many link table between `arxiv_articles` and fields of study.
# `fos_id` points at `FieldOfStudy.id`; the `FieldOfStudy` model is defined
# outside this block. The two columns together form the composite primary key.
article_fields_of_study = Table(
    "arxiv_article_fields_of_study",
    Base.metadata,
    Column(
        "article_id", VARCHAR(40), ForeignKey("arxiv_articles.id"), primary_key=True
    ),
    Column("fos_id", BIGINT, ForeignKey(FieldOfStudy.id), primary_key=True),
)


class ArticleInstitute(Base):
"""Association table to GRID institutes."""
__tablename__ = 'arxiv_article_institutes'

article_id = Column(VARCHAR(40), ForeignKey('arxiv_articles.id'), primary_key=True)
__tablename__ = "arxiv_article_institutes"

article_id = Column(VARCHAR(40), ForeignKey("arxiv_articles.id"), primary_key=True)
institute_id = Column(VARCHAR(20), ForeignKey(Institute.id), primary_key=True)
is_multinational = Column(BOOLEAN)
matching_score = Column(FLOAT)
Expand All @@ -56,16 +57,17 @@ class ArticleInstitute(Base):

class Article(Base):
"""Arxiv articles and metadata."""
__tablename__ = 'arxiv_articles'

__tablename__ = "arxiv_articles"

id = Column(VARCHAR(40), primary_key=True, autoincrement=False)
datestamp = Column(DATE)
created = Column(DATE)
updated = Column(DATE)
title = Column(TEXT)
journal_ref = Column(TEXT)
title = Column(TEXT(collation="utf8mb4_unicode_ci"))
journal_ref = Column(TEXT(collation="utf8mb4_unicode_ci"))
doi = Column(VARCHAR(200))
abstract = Column(TEXT)
abstract = Column(MEDIUMTEXT(collation="utf8mb4_unicode_ci"))
authors = Column(JSON)
mag_authors = Column(JSON)
mag_id = Column(BIGINT)
Expand All @@ -74,19 +76,17 @@ class Article(Base):
citation_count_updated = Column(DATE)
msc_class = Column(VARCHAR(200))
institute_match_attempted = Column(BOOLEAN, default=False)
categories = relationship('Category',
secondary=article_categories)
fields_of_study = relationship(FieldOfStudy,
secondary=article_fields_of_study)
institutes = relationship('ArticleInstitute')
corex_topics = relationship('CorExTopic',
secondary='arxiv_article_corex_topics')
categories = relationship("Category", secondary=article_categories)
fields_of_study = relationship(FieldOfStudy, secondary=article_fields_of_study)
institutes = relationship("ArticleInstitute")
corex_topics = relationship("CorExTopic", secondary="arxiv_article_corex_topics")
article_source = Column(VARCHAR(7), index=True, default=None)


class Category(Base):
"""Lookup table for Arxiv category descriptions."""
__tablename__ = 'arxiv_categories'

__tablename__ = "arxiv_categories"

id = Column(VARCHAR(40), primary_key=True)
description = Column(VARCHAR(100))
Expand All @@ -107,39 +107,37 @@ class Category(Base):
# id = Column(VARCHAR(40), ForeignKey('arxiv_article_msc.msc_id'), primary_key=True)
# description = Column(VARCHAR(100))


class CorExTopic(Base):
    """CorEx topics derived from arXiv data.

    Mapped to the ``arxiv_corex_topics`` table: one row per topic.
    """

    __tablename__ = "arxiv_corex_topics"
    # autoincrement=False: the database is not asked to generate ids, so the
    # caller is expected to supply the topic id on insert.
    id = Column(INTEGER, primary_key=True, autoincrement=False)
    # JSON payload describing the topic's terms; its exact structure is not
    # visible in this module (TODO confirm against the code that writes it).
    terms = Column(JSON)


class ArticleTopic(Base):
    """Association table to CorEx topics.

    Mapped to ``arxiv_article_corex_topics``. Each row links one article to
    one CorEx topic and carries a weight for that pairing; the composite
    primary key is (``article_id``, ``topic_id``).
    """

    __tablename__ = "arxiv_article_corex_topics"
    article_id = Column(VARCHAR(40), ForeignKey(Article.id), primary_key=True)
    # autoincrement=False: topic ids come from CorExTopic rows and must never
    # be generated by the database for this link table.
    topic_id = Column(
        INTEGER, ForeignKey(CorExTopic.id), primary_key=True, autoincrement=False
    )
    # Weight of the topic for the article; scale/range is not defined here.
    topic_weight = Column(FLOAT)


class ArticleVector(Base):
    """Document vectors for articles.

    Mapped to ``arxiv_vector``. ``article_id`` is both the primary key and a
    foreign key to ``Article.id``, so there is at most one vector row per
    article.
    """

    __tablename__ = "arxiv_vector"
    article_id = Column(VARCHAR(40), ForeignKey(Article.id), primary_key=True)
    # Vector stored as JSON; dimensionality and element type are not fixed by
    # this schema (TODO confirm against the producer of this table).
    vector = Column(JSON)


class ArticleCluster(Base):
    """Document clusters for articles.

    Mapped to ``arxiv_cluster``. ``article_id`` is both the primary key and a
    foreign key to ``Article.id``, so there is at most one cluster row per
    article.
    """

    __tablename__ = "arxiv_cluster"
    article_id = Column(VARCHAR(40), ForeignKey(Article.id), primary_key=True)
    # Cluster assignment(s) stored as JSON; the payload's structure is not
    # visible in this module (TODO confirm against the code that writes it).
    clusters = Column(JSON)
Loading