pubtator3 plugin #192

Merged
merged 16 commits on May 23, 2024

1 change: 1 addition & 0 deletions config_web/__init__.py
@@ -57,6 +57,7 @@
from . import pfocr
from . import phewas
from . import pseudocap_go
from . import pubtator3
from . import rare_source
from . import repodb
from . import rhea
6 changes: 6 additions & 0 deletions config_web/pubtator3.py
@@ -0,0 +1,6 @@
ES_HOST = "http://localhost:9200"
ES_INDEX = "pending-pubtator3"
ES_DOC_TYPE = "association"

API_PREFIX = "pubtator3"
API_VERSION = ""
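With API_VERSION left empty, the BioThings web layer serves this source directly under the prefix, without a version segment in the path. A rough sketch of a request against the resulting endpoint (the web host and port are assumptions; only the ES settings above come from this config):

# hypothetical request against the resulting endpoint
# GET http://localhost:8000/pubtator3/query?q=predicate:treat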
3 changes: 3 additions & 0 deletions plugins/pubtator3/__init__.py
@@ -0,0 +1,3 @@
# flake8: noqa F401
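# these re-exports let the BioThings hub discover the plugin's dumper and uploader
# classes (which is why the unused-import warning is suppressed above)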
from .dumper import Pubtator3Dumper
from .uploader import Pubtator3Uploader
95 changes: 95 additions & 0 deletions plugins/pubtator3/dumper.py
@@ -0,0 +1,95 @@
import pathlib
import re
import urllib.request

import bs4

from biothings import config
from biothings.hub.dataload.dumper import FTPDumper


logger = config.logger


class Pubtator3Dumper(FTPDumper):
SRC_NAME = "pubtator3"
SRC_ROOT_FOLDER = pathlib.Path(config.DATA_ARCHIVE_ROOT).joinpath(SRC_NAME)
FTP_HOST = "ftp.ncbi.nlm.nih.gov"
CWD_DIR = "/pub/lu/PubTator3/"
ARCHIVE = False
SCHEDULE = "0 12 * * *"
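# cron spec: trigger the dump daily at 12:00 (server time)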
FTP_TIMEOUT = 5 * 60.0
FTP_USER = ""
FTP_PASSWD = ""
MAX_PARALLEL_DUMP = 2

def __init__(self):
self.SRC_ROOT_FOLDER = pathlib.Path(self.SRC_ROOT_FOLDER).resolve().absolute()
super().__init__(
src_name=self.SRC_NAME,
src_root_folder=self.SRC_ROOT_FOLDER,
log_folder=config.LOG_FOLDER,
archive=self.ARCHIVE,
)
self.prepare_client()
self.set_release()

def set_release(self) -> None:
"""
Extracts the version date from the FTP site for pubtator3.

Since the FTP listing is raw HTML, the structure is fairly simple:
<a href="relation2pubtator3.gz">relation2pubtator3.gz</a> 2024-01-23 06:49 245M

We look for the anchor tag whose href matches the file pattern we care about and then
take its next sibling to extract the version date.

We then strip out the time and data size so that only the date string remains.
"""

pubtator_ftp_site = "https://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator3/"
request_timeout_sec = 15

# [Bandit security warning note]
# any usage of urllib triggers the warning: {B310: Audit url open for permitted schemes}
# urllib.request.urlopen supports file system access via ftp:// and file://
#
# if our usage accepted external input, then this would be a potential security concern,
# but our use case leverages a hard-coded URL pointing directly at the NCBI FTP pages
try:
page_request = urllib.request.Request(url=pubtator_ftp_site, data=None, headers={}, method="GET")
with urllib.request.urlopen(url=page_request, timeout=request_timeout_sec) as http_response: # nosec
raw_html_structure = b"".join(http_response.readlines())
except Exception as gen_exc:
logger.exception(gen_exc)
self.release = "Unknown"
return

html_parser = bs4.BeautifulSoup(raw_html_structure, features="html.parser")

attribute_tag = html_parser.find("a", href=re.compile("relation2pubtator3"))
metadata_string = attribute_tag.next_sibling
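# e.g. metadata_string here is the sibling text node "    2024-01-23 06:49  245M";
# strip().split()[0] below keeps only the leading date, "2024-01-23"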
release_version = metadata_string.strip().split()[0]
self.release = release_version

def create_todump_list(self, force: bool = False) -> None:
"""
Generates the dump list

We access the ftp server using the parent class FTP client instance
to acquire the list of files. Then after validating the relation file exists,
we add the entry to our `to_dump` collection

We override the parent class implementation of `create_todump_list`, but we have
no use for the `force` argument
"""
relation_pubtator_filename = "relation2pubtator3.gz"
remote_pubtator_files = set(self.client.nlst())
if relation_pubtator_filename in remote_pubtator_files:
local_data_path = pathlib.Path(self.current_data_folder).resolve().absolute()
local_relation_pubtator_filepath = local_data_path.joinpath(relation_pubtator_filename)
dump_entry = {"remote": relation_pubtator_filename, "local": local_relation_pubtator_filepath}
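# e.g. {"remote": "relation2pubtator3.gz", "local": <data folder>/relation2pubtator3.gz};
# the FTPDumper base class downloads every entry queued in to_dump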
logger.debug("dump entry: %s", dump_entry)
self.to_dump.append(dump_entry)
163 changes: 163 additions & 0 deletions plugins/pubtator3/uploader.py
@@ -0,0 +1,163 @@
"""
Uploader for the pubtator3 data plugin

Handles the parsing of the documents provided by [relation2pubtator3.gz]
"""

import gzip
import itertools
from pathlib import Path
from typing import Union

from biothings import config
import biothings.hub.dataload.uploader


logger = config.logger


class Pubtator3Uploader(biothings.hub.dataload.uploader.IgnoreDuplicatedSourceUploader):
name = "pubtator3"
__metadata__ = {
"src_meta": {
"url": "https://www.ncbi.nlm.nih.gov/research/pubtator3/",
"license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/",
"description": "Search entities & relations in 35+ million biomedical publications.",
}
}

def load_data(self, data_folder: Union[str, Path]):
"""
The relation file contains several different kinds of entries

instance 1: [MESH ID]
>>> 33846804 treat Chemical|MESH:D009828 Disease|MESH:D006528

instance 2: [GENE ID]
>>> 33846805 associate Gene|5289 Gene|5562

instance 3: [MULTIPLE]
>>> 26018198 associate ProteinMutation|RS#:121908192;HGVS:c.581G>A;CorrespondingGene:2671 ProteinMutation|RS#:771809901;HGVS:c.373C>T;CorrespondingGene:2671
>>> 20817350 cause Disease|MESH:D000544 ProteinMutation|RS#:1800562;CorrespondingGene:3077
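
For instance 1 above, the yielded document would look roughly like this (a sketch
derived from the parsing below; both counts are hard-coded to 1):
>>> {
... "_id": "33846804-Chemical|MESH:D009828-TREAT-Disease|MESH:D006528",
... "object": {"semantic_type_name": "Chemical", "identifier": {"key": "MESH", "value": "D009828"}},
... "predicate": "TREAT",
... "pmid": 33846804, "pmid_count": 1, "predication_count": 1,
... "subject": {"semantic_type_name": "Disease", "identifier": {"key": "MESH", "value": "D006528"}},
... }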
"""
download_file_name = "relation2pubtator3.gz"
unstructured_relational_datafile = Path(data_folder).joinpath(download_file_name)
unstructured_relational_datafile = unstructured_relational_datafile.resolve().absolute()
with gzip.open(unstructured_relational_datafile, "rb") as file_handle:
column_delimiter = "\t"
entity_delimiter = "|"
identifier_delimiter = ";"
value_delimiter = ":"

while True:
entry = file_handle.readline()
if entry == b"":
break

entry_collection = entry.decode("utf-8").strip().split(column_delimiter)
pmid = int(entry_collection[0])
predicate = str(entry_collection[1]).upper()
object_concept = str(entry_collection[2])
subject_concept = str(entry_collection[3])

object_entity_type, __, object_id = object_concept.partition(entity_delimiter)
subject_entity_type, __, subject_id = subject_concept.partition(entity_delimiter)

object_id_collection = object_id.split(identifier_delimiter)
subject_id_collection = subject_id.split(identifier_delimiter)

objects = []
object_values = []
subjects = []
subject_values = []
for object_id, subject_id in itertools.zip_longest(
object_id_collection, subject_id_collection, fillvalue=None
):
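# the object and subject sides may carry different numbers of identifiers;
# zip_longest pads the shorter side with None so no identifier is dropped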
if object_id is not None:
sub_object_id = object_id.split(value_delimiter, maxsplit=1)
match len(sub_object_id):
case 1:
object_identifier_key = None
object_identifier_value = sub_object_id[0]
case 2:
object_identifier_key = sub_object_id[0]
object_identifier_value = sub_object_id[1]

objects.append(
{
"semantic_type_name": object_entity_type,
"identifier": {"key": object_identifier_key, "value": object_identifier_value},
}
)
object_values.append(object_identifier_value)

if subject_id is not None:
sub_subject_id = subject_id.split(value_delimiter, maxsplit=1)
match len(sub_subject_id):
case 1:
subject_identifier_key = None
subject_identifier_value = sub_subject_id[0]
case 2:
subject_identifier_key = sub_subject_id[0]
subject_identifier_value = sub_subject_id[1]

subjects.append(
{
"semantic_type_name": subject_entity_type,
"identifier": {"key": subject_identifier_key, "value": subject_identifier_value},
}
)
subject_values.append(subject_identifier_value)

unique_id = f"{pmid}-{object_concept}-{predicate}-{subject_concept}"

if len(objects) == 1:
objects = objects[0]
if len(subjects) == 1:
subjects = subjects[0]

document = {
"_id": unique_id,
"object": objects,
"pmid_count": 1,
"predicate": predicate,
"predication_count": 1,
"pmid": pmid,
"subject": subjects,
}
yield document

@classmethod
def get_mapping(cls) -> dict:
mapping = {
"object": {
"properties": {
"semantic_type_name": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"identifier": {
"properties": {
"key": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"value": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
}
},
}
},
"pmid_count": {"type": "integer"},
"predicate": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"predication_count": {"type": "integer"},
"pmid": {"type": "integer"},
"subject": {
"properties": {
"semantic_type_name": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"identifier": {
"properties": {
"key": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
"value": {"normalizer": "keyword_lowercase_normalizer", "type": "keyword"},
}
},
}
},
}

return mapping
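
Every keyword field above uses keyword_lowercase_normalizer, so lookups are exact-match but case-insensitive. A minimal query sketch against the resulting index (index name and ES host come from config_web/pubtator3.py; the elasticsearch-py 8.x client call is an assumption, not part of this PR):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
# "treat" still matches documents indexed with predicate "TREAT",
# because the normalizer lowercases both indexed and query terms
hits = es.search(index="pending-pubtator3", query={"term": {"predicate": "treat"}})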