pubtator3 plugin #192

Merged
merged 16 commits on May 23, 2024

1 change: 1 addition & 0 deletions config_web/__init__.py
@@ -57,6 +57,7 @@
from . import pfocr
from . import phewas
from . import pseudocap_go
from . import pubtator3
from . import rare_source
from . import repodb
from . import rhea
6 changes: 6 additions & 0 deletions config_web/pubtator3.py
@@ -0,0 +1,6 @@
ES_HOST = "http://localhost:9200"
ES_INDEX = "pending-pubtator3"
ES_DOC_TYPE = "association"

API_PREFIX = "pubtator3"
API_VERSION = ""
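With API_VERSION left empty, the BioThings web layer serves this source directly under the prefix, without a version segment in the path. A rough sketch of a request against the resulting endpoint (the web host and port are assumptions; only the ES settings above come from this config):

# hypothetical request against the resulting endpoint
# GET http://localhost:8000/pubtator3/query?q=predicate:treat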
3 changes: 3 additions & 0 deletions plugins/pubtator3/__init__.py
@@ -0,0 +1,3 @@
# flake8: noqa F401
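# these re-exports let the BioThings hub discover the plugin's dumper and uploader
# classes (which is why the unused-import warning is suppressed above)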
from .dumper import Pubtator3Dumper
from .uploader import Pubtator3Uploader
95 changes: 95 additions & 0 deletions plugins/pubtator3/dumper.py
@@ -0,0 +1,95 @@
import pathlib
import re
import urllib.request

import bs4

from biothings import config
from biothings.hub.dataload.dumper import FTPDumper


logger = config.logger


class Pubtator3Dumper(FTPDumper):
SRC_NAME = "pubtator3"
SRC_ROOT_FOLDER = pathlib.Path(config.DATA_ARCHIVE_ROOT).joinpath(SRC_NAME)
FTP_HOST = "ftp.ncbi.nlm.nih.gov"
CWD_DIR = "/pub/lu/PubTator3/"
ARCHIVE = False
SCHEDULE = "0 12 * * *"
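# cron spec: trigger the dump daily at 12:00 (server time)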
FTP_TIMEOUT = 5 * 60.0
FTP_USER = ""
FTP_PASSWD = ""
MAX_PARALLEL_DUMP = 2

def __init__(self):
self.SRC_ROOT_FOLDER = pathlib.Path(self.SRC_ROOT_FOLDER).resolve().absolute()
super().__init__(
src_name=self.SRC_NAME,
src_root_folder=self.SRC_ROOT_FOLDER,
log_folder=config.LOG_FOLDER,
archive=self.ARCHIVE,
)
self.prepare_client()
self.set_release()

def set_release(self) -> None:
"""
Extracts the version date from the FTP site for pubtator3.

Since the FTP listing is raw HTML, the structure is fairly simple:
<a href="relation2pubtator3.gz">relation2pubtator3.gz</a> 2024-01-23 06:49 245M

We look for the anchor tag whose href matches the file pattern we care about and then
take its next sibling to extract the version date.

We then strip out the time and data size so that only the date string remains.
"""

pubtator_ftp_site = "https://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator3/"
request_timeout_sec = 15

# [Bandit security warning note]
# any usage of urllib triggers the warning: {B310: Audit url open for permitted schemes}
# urllib.request.urlopen supports file system access via ftp:// and file://
#
# if our usage accepted external input, then this would be a potential security concern,
# but our use case leverages a hard-coded URL pointing directly at the NCBI FTP pages
try:
page_request = urllib.request.Request(url=pubtator_ftp_site, data=None, headers={}, method="GET")
with urllib.request.urlopen(url=page_request, timeout=request_timeout_sec) as http_response: # nosec
raw_html_structure = b"".join(http_response.readlines())
except Exception as gen_exc:
logger.exception(gen_exc)
self.release = "Unknown"
return

html_parser = bs4.BeautifulSoup(raw_html_structure, features="html.parser")

attribute_tag = html_parser.find("a", href=re.compile("relation2pubtator3"))
metadata_string = attribute_tag.next_sibling
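# e.g. metadata_string here is the sibling text node "    2024-01-23 06:49  245M";
# strip().split()[0] below keeps only the leading date, "2024-01-23"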
release_version = metadata_string.strip().split()[0]
self.release = release_version

def create_todump_list(self, force: bool = False) -> None:
"""
Generates the dump list

We access the ftp server using the parent class FTP client instance
to acquire the list of files. Then after validating the relation file exists,
we add the entry to our `to_dump` collection

We override the parent class implementation of `create_todump_list`, but we have
no use for the `force` argument
"""
relation_pubtator_filename = "relation2pubtator3.gz"
remote_pubtator_files = set(self.client.nlst())
if relation_pubtator_filename in remote_pubtator_files:
local_data_path = pathlib.Path(self.current_data_folder).resolve().absolute()
local_relation_pubtator_filepath = local_data_path.joinpath(relation_pubtator_filename)
dump_entry = {"remote": relation_pubtator_filename, "local": local_relation_pubtator_filepath}
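# e.g. {"remote": "relation2pubtator3.gz", "local": <data folder>/relation2pubtator3.gz};
# the FTPDumper base class downloads every entry queued in to_dump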
logger.debug("dump entry: %s", dump_entry)
self.to_dump.append(dump_entry)
163 changes: 163 additions & 0 deletions plugins/pubtator3/uploader.py
@@ -0,0 +1,163 @@
"""
Uploader for the pubtator3 data plugin

Handles the parsing of the documents provided by [relation2pubtator3.gz]
"""

import gzip
import itertools
from pathlib import Path
from typing import Union

from biothings import config
import biothings.hub.dataload.uploader


logger = config.logger


class Pubtator3Uploader(biothings.hub.dataload.uploader.IgnoreDuplicatedSourceUploader):
name = "pubtator3"
__metadata__ = {
"src_meta": {
"url": "https://www.ncbi.nlm.nih.gov/research/pubtator3/",
"license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/",
"description": "Search entities & relations in 35+ million biomedical publications.",
}
}

def load_data(self, data_folder: Union[str, Path]):
"""
The relation file contains several different kinds of entries

instance 1: [MESH ID]
>>> 33846804 treat Chemical|MESH:D009828 Disease|MESH:D006528

instance 2: [GENE ID]
>>> 33846805 associate Gene|5289 Gene|5562

instance 3: [MULTIPLE]
>>> 26018198 associate ProteinMutation|RS#:121908192;HGVS:c.581G>A;CorrespondingGene:2671 ProteinMutation|RS#:771809901;HGVS:c.373C>T;CorrespondingGene:2671
>>> 20817350 cause Disease|MESH:D000544 ProteinMutation|RS#:1800562;CorrespondingGene:3077
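
For instance 1 above, the yielded document would look roughly like this (a sketch
derived from the parsing below; both counts are hard-coded to 1):
>>> {
... "_id": "33846804-Chemical|MESH:D009828-TREAT-Disease|MESH:D006528",
... "object": {"semantic_type_name": "Chemical", "identifier": {"key": "MESH", "value": "D009828"}},
... "predicate": "TREAT",
... "pmid": 33846804, "pmid_count": 1, "predication_count": 1,
... "subject": {"semantic_type_name": "Disease", "identifier": {"key": "MESH", "value": "D006528"}},
... }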
"""
download_file_name = "relation2pubtator3.gz"
unstructured_relational_datafile = Path(data_folder).joinpath(download_file_name)
unstructured_relational_datafile = unstructured_relational_datafile.resolve().absolute()
with gzip.open(unstructured_relational_datafile, "rb") as file_handle:
column_delimiter = "\t"
entity_delimiter = "|"
identifier_delimiter = ";"
value_delimiter = ":"

while True:
entry = file_handle.readline()
if entry == b"":
break

entry_collection = entry.decode("utf-8").strip().split(column_delimiter)
pmid = int(entry_collection[0])
predicate = str(entry_collection[1]).upper()
object_concept = str(entry_collection[2])
subject_concept = str(entry_collection[3])

object_entity_type, __, object_id = object_concept.partition(entity_delimiter)
subject_entity_type, __, subject_id = subject_concept.partition(entity_delimiter)

object_id_collection = object_id.split(identifier_delimiter)
subject_id_collection = subject_id.split(identifier_delimiter)

objects = []
object_values = []
subjects = []
subject_values = []
for object_id, subject_id in itertools.zip_longest(
object_id_collection, subject_id_collection, fillvalue=None
):
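# the object and subject sides may carry different numbers of identifiers;
# zip_longest pads the shorter side with None so no identifier is dropped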
if object_id is not None:
sub_object_id = object_id.split(value_delimiter, maxsplit=1)
match len(sub_object_id):
case 1:
object_identifier_key = None
object_identifier_value = sub_object_id[0]
case 2:
object_identifier_key = sub_object_id[0]
object_identifier_value = sub_object_id[1]

objects.append(
{
"semantic_type_name": object_entity_type,
"identifier": {"key": object_identifier_key, "value": object_identifier_value},
}
)
object_values.append(object_identifier_value)

if subject_id is not None:
sub_subject_id = subject_id.split(value_delimiter, maxsplit=1)
match len(sub_subject_id):
case 1:
subject_identifier_key = None
subject_identifier_value = sub_subject_id[0]
case 2:
subject_identifier_key = sub_subject_id[0]
subject_identifier_value = sub_subject_id[1]

subjects.append(
{
"semantic_type_name": subject_entity_type,
"identifier": {"key": subject_identifier_key, "value": subject_identifier_value},
}
)
subject_values.append(subject_identifier_value)

unique_id = f"{pmid}-{object_concept}-{predicate}-{subject_concept}"

if len(objects) == 1:
objects = objects[0]
if len(subjects) == 1:
subjects = subjects[0]

document = {
"_id": unique_id,
"object": objects,
"pmid_count": 1,
"predicate": predicate,
"predication_count": 1,
"pmid": pmid,
"subject": subjects,
}
yield document

@classmethod
def get_mapping(cls) -> dict:
mapping = {
"object": {
"properties": {
"semantic_type_name": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"identifier": {
"properties": {
"key": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"value": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
}
},
}
},
"pmid_count": {"type": "integer"},
"predicate": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"predication_count": {"type": "integer"},
"pmid": {"type": "integer"},
"subject": {
"properties": {
"semantic_type_name": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"identifier": {
"properties": {
"key": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"value": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
}
},
}
},
}

return mapping
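
Every keyword field above uses keyword_lowercase_normalizer, so lookups are exact-match but case-insensitive. A minimal query sketch against the resulting index (index name and ES host come from config_web/pubtator3.py; the elasticsearch-py 8.x client call is an assumption, not part of this PR):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
# "treat" still matches documents indexed with predicate "TREAT",
# because the normalizer lowercases both indexed and query terms
hits = es.search(index="pending-pubtator3", query={"term": {"predicate": "treat"}})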