Skip to content

Commit

Permalink
UPD importer test
Browse files Browse the repository at this point in the history
  • Loading branch information
eboileau committed Feb 14, 2024
1 parent 1b0df05 commit 539bf6e
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 535 deletions.
99 changes: 0 additions & 99 deletions server/src/scimodom/services/__init__.py
Original file line number Diff line number Diff line change
@@ -1,99 +0,0 @@
from scimodom.database.database import get_session
from scimodom.services.importer.data import EUFDataImporter
from scimodom.services.importer.generic import BEDImporter
from scimodom.services.importer.header import EUFHeaderImporter


class Importer:
    """Bundle the header and data importers for one EUF (bedRMod) file.

    :param header: EUF header importer
    :type header: EUFHeaderImporter
    :param data: EUF data importer; may be omitted and created later
        with ``init_data_importer``
    :type data: EUFDataImporter | None
    """

    def __init__(
        self,
        header: EUFHeaderImporter,
        data: EUFDataImporter | None = None,
    ) -> None:
        """Store the header importer and the (optional) data importer."""
        self._data = data
        self._header = header

    def init_data_importer(
        self, association: dict[str, int], seqids: list[str]
    ) -> None:
        """Create the EUFDataImporter from the header importer's state.

        The file name and specification version are read from the header
        importer. If a data importer is already set, it is left untouched
        (the header's handle is still closed when open).

        :param association: A dictionary of association IDs of the form
            {name: association_id}, where name is the modification
            short_name. The association ID provides information about the
            dataset (EUFID), the modification, the organism, and the
            technology used.
        :type association: dict of {str: int}
        :param seqids: List of chromosomes or scaffolds. The seqid must be
            one used with Ensembl, e.g. standard Ensembl chromosome name
            w/o the "chr" prefix. Only records with seqid in seqids will
            be imported.
        :type seqids: list of str
        """
        header_importer = self._header
        specs_version = header_importer._specs_ver
        file_name = header_importer._filen
        db_session = get_session()

        # Close the header's handle if it is still open; a fresh handle on
        # the same file is opened below for the data importer.
        if header_importer._handle.closed is False:
            header_importer.close()

        if self._data is not None:
            return

        self._data = EUFDataImporter(
            session=db_session,
            filen=file_name,
            handle=open(file_name, "r"),
            association=association,
            seqids=seqids,
            specs_ver=specs_version,
        )


def get_importer(filen: str, smid: str, eufid: str, title: str):
    """Build an Importer holding a header importer for the given file.

    The file is opened for reading and the handle is handed to the
    header importer. No data importer is attached (``data=None``).

    :param filen: File path
    :type filen: str
    :param smid: Sci-ModoM project ID or SMID
    :type smid: str
    :param eufid: EUF ID (dataset) or EUFID
    :type eufid: str
    :param title: Title associated with EUF/bedRMod dataset
    :type title: str
    :returns: Importer wrapping the new header importer
    """
    db_session = get_session()
    file_handle = open(filen, "r")
    header_importer = EUFHeaderImporter(
        session=db_session,
        filen=filen,
        handle=file_handle,
        smid=smid,
        eufid=eufid,
        title=title,
    )
    return Importer(header=header_importer, data=None)


def get_bed_importer(filen: str):
    """Build a BEDImporter for the given file.

    The file is opened for reading and the handle is handed to the
    importer together with a database session.

    :param filen: File path
    :type filen: str
    :returns: BEDImporter bound to the opened file
    """
    db_session = get_session()
    file_handle = open(filen, "r")
    return BEDImporter(session=db_session, filen=filen, handle=file_handle)
99 changes: 99 additions & 0 deletions server/src/scimodom/services/importer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from scimodom.database.database import get_session
from scimodom.services.importer.data import EUFDataImporter
from scimodom.services.importer.generic import BEDImporter
from scimodom.services.importer.header import EUFHeaderImporter


class Importer:
    """Pair a header importer with a data importer for an EUF (bedRMod) file.

    :param header: EUF header importer
    :type header: EUFHeaderImporter
    :param data: EUF data importer; may be omitted and created later
        with ``init_data_importer``
    :type data: EUFDataImporter | None
    """

    def __init__(
        self,
        header: EUFHeaderImporter,
        data: EUFDataImporter | None = None,
    ) -> None:
        """Keep references to both importers as public attributes."""
        self.data = data
        self.header = header

    def init_data_importer(
        self, association: dict[str, int], seqids: list[str]
    ) -> None:
        """Create the EUFDataImporter from the header importer's state.

        The file name and specification version are taken from the header
        importer. The object returned by ``get_session()`` is called to
        obtain the session given to the data importer. Nothing is created
        when a data importer is already set, but the header's handle is
        closed in either case if it was open.

        :param association: A dictionary of association IDs of the form
            {name: association_id}, where name is the modification
            short_name. The association ID provides information about the
            dataset (EUFID), the modification, the organism, and the
            technology used.
        :type association: dict of {str: int}
        :param seqids: List of chromosomes or scaffolds. The seqid must be
            one used with Ensembl, e.g. standard Ensembl chromosome name
            w/o the "chr" prefix. Only records with seqid in seqids will
            be imported.
        :type seqids: list of str
        """
        hdr = self.header
        specs_version, path = hdr._specs_ver, hdr._filen
        make_session = get_session()

        # The data importer gets its own handle on the same file, so the
        # header's handle is released first when it is still open.
        if hdr._handle.closed is False:
            hdr.close()

        if self.data is not None:
            return

        self.data = EUFDataImporter(
            session=make_session(),
            filen=path,
            handle=open(path, "r"),
            association=association,
            seqids=seqids,
            specs_ver=specs_version,
        )


def get_importer(filen: str, smid: str, eufid: str, title: str):
    """Build an Importer holding a header importer for the given file.

    The object returned by ``get_session()`` is called to obtain the
    session, then the file is opened for reading; both are handed to the
    header importer. No data importer is attached (``data=None``).

    :param filen: File path
    :type filen: str
    :param smid: Sci-ModoM project ID or SMID
    :type smid: str
    :param eufid: EUF ID (dataset) or EUFID
    :type eufid: str
    :param title: Title associated with EUF/bedRMod dataset
    :type title: str
    :returns: Importer wrapping the new header importer
    """
    make_session = get_session()
    # Session first, file second: same evaluation order as passing both
    # inline as keyword arguments.
    db_session = make_session()
    file_handle = open(filen, "r")
    header_importer = EUFHeaderImporter(
        session=db_session,
        filen=filen,
        handle=file_handle,
        smid=smid,
        eufid=eufid,
        title=title,
    )
    return Importer(header=header_importer, data=None)


def get_bed_importer(filen: str):
    """Build a BEDImporter for the given file.

    The object returned by ``get_session()`` is called to obtain the
    session, then the file is opened for reading; both are handed to the
    importer.

    :param filen: File path
    :type filen: str
    :returns: BEDImporter bound to the opened file
    """
    make_session = get_session()
    db_session = make_session()
    file_handle = open(filen, "r")
    return BEDImporter(session=db_session, filen=filen, handle=file_handle)
10 changes: 5 additions & 5 deletions server/src/scimodom/services/importer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,16 +118,16 @@ def __init__(
self._buffer: BaseImporter._Buffer
self._dtypes: dict[str, dict[str, Any]] = dict()
self._lino: int = skiprows
if header is None:
self._header = self._get_header()
else:
self._header = header
self._num_cols: int = len(self._header)
if comment is not None and len(comment) > 1:
raise ValueError(
f"Maximum length of 1 expected, got {len(comment)} for comment."
)
self._comment = comment
if header is None:
self._header = self._get_header()
else:
self._header = header
self._num_cols: int = len(self._header)

@abstractmethod
def parse_record(self, record: dict[str, str]) -> dict[str, Any]:
Expand Down
24 changes: 22 additions & 2 deletions server/tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from scimodom.utils.specifications import SPECS_EUF

# data path
DataPath = namedtuple("DataPath", "ASSEMBLY_PATH ANNOTATION_PATH META_PATH")
DataPath = namedtuple("DataPath", "LOC ASSEMBLY_PATH ANNOTATION_PATH META_PATH")


@pytest.fixture()
Expand Down Expand Up @@ -251,6 +251,9 @@ def project_template():

@pytest.fixture(scope="session")
def data_path(tmp_path_factory):
format = SPECS_EUF["format"]
version = SPECS_EUF["versions"][-1]

loc = tmp_path_factory.mktemp("data")
ASSEMBLY_PATH = loc / "assembly"
ASSEMBLY_PATH.mkdir()
Expand All @@ -273,4 +276,21 @@ def data_path(tmp_path_factory):
with open(Path(path, chrom_file), "w") as f:
f.write("1\t1000000")

yield DataPath(ASSEMBLY_PATH, ANNOTATION_PATH, META_PATH)
with open(Path(loc, "test.bed"), "w") as f:
f.write(f"#fileformat={format}v{version}\n")
f.write("#organism=9606\n")
f.write("#modification_type=RNA\n")
f.write("#assembly=GRCh38\n")
f.write("#annotation_source=Annotation\n")
f.write("#annotation_version=Version\n")
f.write("#sequencing_platform=Sequencing platform\n")
f.write("#basecalling=\n")
f.write("#bioinformatics_workflow=Workflow\n")
f.write("#experiment=Description of experiment.\n")
f.write("#external_source=\n")
f.write(
"#chrom\tchromstart\tchromEnd\tname\tscore\tstrand\tthickstart\tthickEnd\titermRgb\tcoverage\tfrequency\n"
)
f.write("1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1\n")

yield DataPath(loc, ASSEMBLY_PATH, ANNOTATION_PATH, META_PATH)
47 changes: 44 additions & 3 deletions server/tests/unit/services/test_data_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def _get_data(EUF_specs):
A\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1
1\t0\t10\tm6A\t1000\t\t0\t10\t0,0,0\t10\t1
1\t0\t10\tm5C\t1000\t+\t0\t10\t0,0,0\t10\t1
1\t0\t10\tm5C\t1000\t+
1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t200"""
return StringIO(string)

Expand All @@ -76,6 +77,9 @@ def _get_data_with_header(fmt):
chrom\tstart\tend\tname\tscore\tstrand\tthick_start\tthick_end\titem_rgb\tcoverage\tfrequency
1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1"""
comment = "@"
elif fmt == "wrong":
string = """chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tcoverage\tfrequency
1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1"""
return skiprows, comment, StringIO(string)


Expand Down Expand Up @@ -176,12 +180,11 @@ def test_importer_parse_records(Session, EUF_specs):
"fmt",
[("first"), ("second"), ("third"), ("comment")],
)
def test_base_importer(fmt, Session):
def test_base_importer_header(fmt, Session):
skiprows, comment, handle = _get_data_with_header(fmt)

class TestBaseImporter(BaseImporter):
def __init__(self):
# self._comment = comment
super().__init__(
session=Session(),
filen="filen",
Expand All @@ -197,4 +200,42 @@ def parse_record(record):
return record

importer = TestBaseImporter()
print(importer._header)
importer._validate_columns()
expected_header = [
"chrom",
"start",
"end",
"name",
"score",
"strand",
"thick_start",
"thick_end",
"item_rgb",
"coverage",
"frequency",
]
assert importer._header == expected_header


def test_base_importer_columns_fail(Session):
    """Check that column validation rejects the "wrong" header fixture.

    A minimal concrete subclass of BaseImporter is built on the handle
    returned by ``_get_data_with_header("wrong")`` with ``header=None``,
    and ``_validate_columns`` is expected to raise.

    :param Session: Session fixture; called to obtain the session handed
        to the importer
    """
    skiprows, comment, handle = _get_data_with_header("wrong")

    class TestBaseImporter(BaseImporter):
        def __init__(self):
            super().__init__(
                session=Session(),
                filen="filen",
                handle=handle,
                model=Data,
                sep="\t",
                header=None,
                skiprows=skiprows,
                comment=comment,
            )

        # Takes ``self`` so the override matches the abstract method's
        # signature and stays callable on an instance.
        def parse_record(self, record):
            return record

    importer = TestBaseImporter()
    # NOTE(review): presumably fails because the fixture's camelCase column
    # names (chromStart, thickStart, ...) do not match the model's columns;
    # confirm against _validate_columns and narrow the exception type.
    with pytest.raises(Exception):
        importer._validate_columns()
Loading

0 comments on commit 539bf6e

Please sign in to comment.