Skip to content

Commit

Permalink
Merge pull request #1368 from microbiomedata/berkeley-schema-update
Browse files Browse the repository at this point in the history
Update to berkeley schema
  • Loading branch information
naglepuff authored Aug 26, 2024
2 parents 165a33a + aeed40c commit 6811a66
Show file tree
Hide file tree
Showing 29 changed files with 469 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .docker-env
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ PGDATABASE=nmdc_a
NMDC_DATABASE_URI="postgresql://postgres:postgres@db/nmdc_a"
NMDC_CELERY_BACKEND="redis://redis:6379/0"
NMDC_CELERY_BROKER="redis://redis:6379/0"
NMDC_INGEST_DATABASE_URI="postgresql://postgres:postgres@db/nmdc_a"
NMDC_INGEST_DATABASE_URI="postgresql://postgres:postgres@db/nmdc_b"
NMDC_MONGO_HOST=tunnel
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ FROM tiangolo/uvicorn-gunicorn:python3.9-2023-06-05
LABEL org.opencontainers.image.source=https://github.com/microbiomedata/nmdc-server
RUN rm /app/main.py

# Refresh the package index FIRST, then upgrade and install non-interactively.
# -y is required so a non-interactive `docker build` cannot hang on a prompt,
# and cleaning the apt cache afterward keeps the image layer small.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y postgresql-client \
    && apt-get clean

RUN pip install -U pip setuptools wheel
COPY . /app/
Expand Down
8 changes: 7 additions & 1 deletion nmdc_server/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ def get_data_object_aggregation(
models.DataObject.workflow_type,
models.DataObject.file_type,
func.count(models.DataObject.id),
func.sum(func.coalesce(models.DataObject.file_size_bytes, 0)),
)
.filter(
models.DataObject.workflow_type != None,
Expand All @@ -195,6 +196,7 @@ def get_data_object_aggregation(
db.query(
models.DataObject.workflow_type,
func.count(models.DataObject.id),
func.sum(func.coalesce(models.DataObject.file_size_bytes, 0)),
)
.filter(
models.DataObject.workflow_type != None,
Expand All @@ -205,13 +207,15 @@ def get_data_object_aggregation(
)
for row in rows:
agg[row[0]].count = row[1]
agg[row[0]].size = row[2]

# aggregate file_types
rows = (
db.query(
models.DataObject.workflow_type,
models.DataObject.file_type,
func.count(models.DataObject.id),
func.sum(func.coalesce(models.DataObject.file_size_bytes, 0)),
)
.filter(
models.DataObject.workflow_type != None,
Expand All @@ -222,5 +226,7 @@ def get_data_object_aggregation(
.group_by(models.DataObject.workflow_type, models.DataObject.file_type)
)
for row in rows:
agg[row[0]].file_types[row[1]] = row[2]
agg[row[0]].file_types[row[1]] = schemas.DataObjectAggregationNode(
count=row[2], size=row[3]
)
return agg
83 changes: 76 additions & 7 deletions nmdc_server/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import csv
import json
import logging
from io import BytesIO
from io import BytesIO, StringIO
from typing import Any, Dict, List, Optional
from uuid import UUID

Expand All @@ -10,7 +11,7 @@
from sqlalchemy.orm import Session
from starlette.responses import StreamingResponse

from nmdc_server import __version__, crud, jobs, models, query, schemas, schemas_submission
from nmdc_server import crud, jobs, models, query, schemas, schemas_submission
from nmdc_server.auth import admin_required, get_current_user, login_required_responses
from nmdc_server.bulk_download_schema import BulkDownload, BulkDownloadCreate
from nmdc_server.config import Settings
Expand Down Expand Up @@ -38,9 +39,9 @@ async def get_settings() -> Dict[str, Any]:


# get application version number
@router.get("/version", name="Get application version identifier")
async def get_version() -> Dict[str, Any]:
return {"nmdc-server": __version__}
@router.get("/version", name="Get application and schema version identifiers")
async def get_version() -> schemas.VersionInfo:
return schemas.VersionInfo()


# get the current user information
Expand All @@ -63,6 +64,12 @@ def text_search(terms: str, limit=6, db: Session = Depends(get_db)):
"field": "name",
"op": "like",
}
study_id_filter = {
"table": "study",
"value": terms.lower(),
"field": "id",
"op": "like",
}
study_description_filter = {
"table": "study",
"value": terms.lower(),
Expand Down Expand Up @@ -103,6 +110,7 @@ def text_search(terms: str, limit=6, db: Session = Depends(get_db)):
filters = crud.text_search(db, terms, limit)
plaintext_filters = [
query.SimpleConditionSchema(**study_name_filter),
query.SimpleConditionSchema(**study_id_filter),
query.SimpleConditionSchema(**study_description_filter),
query.SimpleConditionSchema(**study_title_filter),
query.SimpleConditionSchema(**biosample_name_filter),
Expand Down Expand Up @@ -623,6 +631,69 @@ async def download_zip_file(
)


@router.get(
    "/metadata_submission/report",
    tags=["metadata_submission"],
)
async def get_metadata_submissions_report(
    db: Session = Depends(get_db),
    user: models.User = Depends(get_current_user),
):
    r"""
    Download a TSV file containing a high-level report of Submission Portal submissions,
    including their ID, author info, study info, and PI info.

    Raises a 403 for non-admin users.
    """
    if not user.is_admin:
        raise HTTPException(status_code=403, detail="Your account has insufficient privileges.")

    # Get the submissions from the database.
    q = crud.get_query_for_all_submissions(db)
    submissions = q.all()

    # Iterate through the submissions, building the data rows for the report.
    header_row = [
        "Submission ID",
        "Author ORCID",
        "Author Name",
        "Study Name",
        "PI Name",
        "PI Email",
    ]
    data_rows = []
    for s in submissions:
        metadata = s.metadata_submission  # creates a concise alias
        author_user = s.author  # note: `s.author` is a `models.User` instance
        # `metadata` is a JSON payload; missing keys fall back to empty values.
        study_form = metadata.get("studyForm", {})
        study_name = study_form.get("studyName", "")
        pi_name = study_form.get("piName", "")
        pi_email = study_form.get("piEmail", "")
        data_rows.append([s.id, s.author_orcid, author_user.name, study_name, pi_name, pi_email])

    # Build the report as an in-memory TSV "file" (buffer).
    # Reference: https://docs.python.org/3/library/csv.html#csv.writer
    buffer = StringIO()
    writer = csv.writer(buffer, delimiter="\t")
    writer.writerow(header_row)
    writer.writerows(data_rows)

    # Reset the buffer's internal file pointer to the beginning of the buffer, so that,
    # when we stream the buffer's contents later, all of its contents are included.
    buffer.seek(0)

    # Stream the buffer's contents to the HTTP client as a downloadable TSV file.
    # Reference: https://fastapi.tiangolo.com/advanced/custom-response
    # Reference: https://mimetype.io/text/tab-separated-values
    filename = "submissions-report.tsv"
    response = StreamingResponse(
        buffer,
        media_type="text/tab-separated-values",
        # Fix: interpolate `filename` — the original f-string contained a literal
        # placeholder and the `filename` variable was never used.
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )

    return response


@router.get(
"/metadata_submission",
tags=["metadata_submission"],
Expand Down Expand Up @@ -787,7 +858,6 @@ def create_github_issue(submission, user):
omicsprocessingtypes = ", ".join(multiomicsform["omicsProcessingTypes"])
sampletype = ", ".join(submission.metadata_submission["templates"])
sampledata = submission.metadata_submission["sampleData"]
fundingsource = studyform["fundingSource"]
numsamples = 0
for key in sampledata:
numsamples = max(numsamples, len(sampledata[key]))
Expand Down Expand Up @@ -817,7 +887,6 @@ def create_github_issue(submission, user):
f"Data types: {omicsprocessingtypes}",
f"Sample type:{sampletype}",
f"Number of samples:{numsamples}",
f"Funding source:{fundingsource}",
] + valid_ids
body_string = " \n ".join(body_lis)
payload_dict = {
Expand Down
37 changes: 35 additions & 2 deletions nmdc_server/crud.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, cast
from uuid import UUID
Expand Down Expand Up @@ -224,6 +225,14 @@ def list_omics_processing_data_objects(db: Session, id: str) -> Query:
)


# KEGG
def get_pathway_prefix(term) -> Optional[str]:
    """Return the KEGG pathway prefix that *term* starts with, or ``None``.

    Recognized prefixes are ``map``, ``ko``, ``ec``, ``rn``, and ``org``.
    """
    for candidate in ("map", "ko", "ec", "rn", "org"):
        if term.startswith(candidate):
            return candidate
    return None


def list_ko_terms_for_module(db: Session, module: str) -> List[str]:
q = db.query(models.KoTermToModule.term).filter(models.KoTermToModule.module.ilike(module))
return [row[0] for row in q]
Expand All @@ -235,13 +244,24 @@ def list_ko_terms_for_pathway(db: Session, pathway: str) -> List[str]:


def kegg_text_search(db: Session, query: str, limit: int) -> List[models.KoTermText]:
    """Search KO term-text records matching *query*, returning up to *limit* rows.

    Pathway records are ingested with the 'map' prefix but may be searched for
    using other recognized pathway prefixes; such queries are normalized to
    'map' before searching, and matching results are rewritten back to the
    caller's prefix.
    """
    pathway_prefix = get_pathway_prefix(query)
    # Normalize only the leading prefix. (A bare str.replace would also rewrite
    # any later occurrence of the prefix inside the query.)
    term = f"map{query[len(pathway_prefix):]}" if pathway_prefix else query
    q = (
        db.query(models.KoTermText)
        .filter(models.KoTermText.text.ilike(f"%{term}%") | models.KoTermText.term.ilike(term))
        .order_by(models.KoTermText.term)
        .limit(limit)
    )
    results = list(q)
    if pathway_prefix:
        default_pathway_prefix = "map"
        # Transform pathway results to match the given prefix. They are ingested with the
        # 'map' prefix, but can be searched for with various other prefixes.
        for term_text in results:
            if term_text.term.startswith(default_pathway_prefix):
                # Rewrite only the leading prefix of the term ID.
                term_text.term = pathway_prefix + term_text.term[len(default_pathway_prefix) :]
                # NOTE(review): this rewrites every 'map' occurrence in the text;
                # confirm the descriptive text never contains 'map' outside the ID.
                term_text.text = term_text.text.replace(default_pathway_prefix, pathway_prefix)
    return results


# biosample
Expand Down Expand Up @@ -653,6 +673,19 @@ def get_submissions_for_user(db: Session, user: models.User, column_sort: str, o
return permitted_submissions


def get_query_for_all_submissions(db: Session):
    r"""
    Returns a SQLAlchemy query that can be used to retrieve all submissions,
    most recently created first.

    Reference: https://fastapi.tiangolo.com/tutorial/sql-databases/#crud-utils
    Reference: https://docs.sqlalchemy.org/en/14/orm/session_basics.html
    """
    return db.query(models.SubmissionMetadata).order_by(
        models.SubmissionMetadata.created.desc()
    )


def get_roles_for_submission(
db: Session, submission: models.SubmissionMetadata
) -> List[models.SubmissionRole]:
Expand Down
2 changes: 1 addition & 1 deletion nmdc_server/fakes.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ class Meta:
"piEmail": "",
"piOrcid": "",
"linkOutWebpage": [],
"fundingSource": "",
"fundingSources": [],
"description": "",
"notes": "",
"contributors": [],
Expand Down
7 changes: 3 additions & 4 deletions nmdc_server/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,9 @@ def groups(self) -> Iterator[Tuple[str, Iterator[BaseConditionSchema]]]:
def transform_condition(self, db, condition: BaseConditionSchema) -> List[BaseConditionSchema]:
# Transform KEGG.(PATH|MODULE) queries into their respective ORTHOLOGY terms
if condition.key == "Table.gene_function:id" and type(condition.value) is str:
if condition.value.startswith(KeggTerms.PATHWAY[0]):
searchable_name = condition.value.replace(
KeggTerms.PATHWAY[0], KeggTerms.PATHWAY[1]
)
if any([condition.value.startswith(val) for val in KeggTerms.PATHWAY[0]]):
prefix = [val for val in KeggTerms.PATHWAY[0] if condition.value.startswith(val)][0]
searchable_name = condition.value.replace(prefix, KeggTerms.PATHWAY[1])
ko_terms = db.query(models.KoTermToPathway.term).filter(
models.KoTermToPathway.pathway.ilike(searchable_name)
)
Expand Down
28 changes: 25 additions & 3 deletions nmdc_server/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from datetime import date, datetime
from enum import Enum
from importlib.metadata import version
from typing import Any, Dict, List, Optional, Union
from urllib.parse import quote
from uuid import UUID
Expand All @@ -18,7 +19,7 @@
from sqlalchemy import BigInteger, Column, DateTime, Float, Integer, LargeBinary, String
from sqlalchemy.dialects.postgresql.json import JSONB

from nmdc_server import models
from nmdc_server import __version__, models
from nmdc_server.data_object_filters import DataObjectFilter, WorkflowActivityTypeEnum

DateType = Union[datetime, date]
Expand Down Expand Up @@ -189,9 +190,13 @@ class Config:
orm_mode = True


class DataObjectAggregationNode(BaseModel):
    # Number of data objects in this aggregation bucket.
    count: int = 0
    # Total size in bytes of the bucket's data objects (sum of file_size_bytes).
    size: int = 0


class DataObjectAggregationElement(DataObjectAggregationNode):
    # Per-file-type breakdown (count/size) within a workflow type.
    file_types: Dict[str, DataObjectAggregationNode] = {}


DataObjectAggregation = Dict[str, DataObjectAggregationElement]
Expand Down Expand Up @@ -688,3 +693,20 @@ class LockOperationResult(BaseModel):
message: str
locked_by: Optional[User]
lock_updated: Optional[datetime]


class VersionInfo(BaseModel):
    """Version information for the nmdc-server itself and the schemas.

    This model has default field values and is immutable because these values cannot
    change at runtime.
    """

    # Version of this application, taken from the package's `__version__`.
    nmdc_server: str = __version__
    # Installed versions of the NMDC schema packages, read from package metadata.
    nmdc_schema: str = version("nmdc-schema")
    nmdc_submission_schema: str = version("nmdc-submission-schema")

    class Config:
        # In Pydantic V2, use `frozen=True`
        # https://docs.pydantic.dev/2.8/concepts/models/#faux-immutability
        allow_mutation = False
2 changes: 1 addition & 1 deletion nmdc_server/schemas_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class StudyForm(BaseModel):
piName: str
piEmail: str
piOrcid: str
fundingSource: Optional[str]
fundingSources: Optional[List[str]]
linkOutWebpage: List[str]
studyDate: Optional[str]
description: str
Expand Down
10 changes: 9 additions & 1 deletion nmdc_server/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,15 @@

class KeggTerms:
    """Mappings from searchable KEGG term prefixes to their stored forms.

    Each attribute is a tuple of (searchable prefix or prefixes, stored prefix).
    PATHWAY accepts several searchable prefixes because pathway records are
    stored with the 'MAP' prefix but may be searched for using other prefixes.
    """

    ORTHOLOGY = ("KEGG.ORTHOLOGY:K", "K")
    PATHWAY = (
        [
            "KEGG.PATHWAY:MAP",
            "KEGG.PATHWAY:EC",
            "KEGG.PATHWAY:RN",
            "KEGG.PATHWAY:KO",
        ],
        "MAP",
    )
    MODULE = ("KEGG.MODULE:M", "M")


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies = [
"ipython==7.31.1",
"itsdangerous==2.0.1",
"mypy<0.920",
"nmdc-schema==10.7.0",
"nmdc-schema==11.0.0rc20",
"nmdc-submission-schema==10.7.0",
"pint==0.18",
"psycopg2==2.9.3",
Expand Down
18 changes: 18 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,21 @@ def logged_in_user(db, client) -> schemas.User:
client.headers["Authorization"] = f"Bearer {token_response.access_token.decode()}"

return user


@pytest.fixture
def logged_in_admin_user(db, client) -> schemas.User:
    r"""
    Returns a logged-in user that is an admin.

    TODO: Consider adding an `is_admin: bool = False` parameter to the `logged_in_user` fixture
    and then consolidating this fixture with that one.
    """
    admin = UserFactory(is_admin=True)
    db.commit()

    # Authenticate the test client as this user for subsequent requests.
    token = create_token_response(admin).access_token.decode()
    client.headers["Authorization"] = f"Bearer {token}"

    return admin
Loading

0 comments on commit 6811a66

Please sign in to comment.