Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Turn code for generic rollup into Dagster job #642

Open
sujaypatil96 opened this issue Aug 15, 2024 · 0 comments · May be fixed by #643
Open

Turn code for generic rollup into Dagster job #642

sujaypatil96 opened this issue Aug 15, 2024 · 0 comments · May be fixed by #643
Assignees

Comments

@sujaypatil96
Copy link
Collaborator

sujaypatil96 commented Aug 15, 2024

@lru_cache
def get_class_names_to_collection_names_map():
    """Build a mapping from schema class name to the collection holding it.

    The result is memoized with ``lru_cache``, so the schema view and the
    collection list are only consulted on the first call; later calls get
    the same dict object back (callers should treat it as read-only).

    Returns:
        dict: ``{class_name: collection_name}``, where ``class_name`` is what
        ``get_name_of_class_objects_in_collection`` reports for that
        collection. If several collections resolve to the same class name,
        the one listed last by ``get_collection_names_from_schema()`` wins.
    """
    view = ViewGetter().get_view()
    return {
        get_name_of_class_objects_in_collection(view, name): name
        for name in get_collection_names_from_schema()
    }
def get_collection_from_typecode(doc_id: str):
    """Resolve the collection name for a document ID via its typecode.

    The typecode is the text of the second ":"-separated segment of
    ``doc_id`` up to its first "-" (i.e. IDs shaped like
    ``"<prefix>:<typecode>-<rest>"``). It is looked up among the entries
    returned by ``typecodes()`` (each entry is read for its ``"name"`` and
    ``"schema_class"`` keys; the part of ``schema_class`` after the ":" is
    used as the class name), and the class name is then mapped to a
    collection with ``get_class_names_to_collection_names_map()``.

    Args:
        doc_id: The document identifier to classify.

    Returns:
        The collection name, or ``None`` when the ID has no ":" separator,
        the typecode is not known, or the class has no associated collection.
    """
    segments = doc_id.split(":")
    if len(segments) < 2:
        # An ID without a ":" carries no typecode. Report it as unresolved
        # (like any unknown typecode) instead of raising IndexError.
        return None
    typecode = segments[1].split("-")[0]

    class_map = {
        entry["name"]: entry["schema_class"].split(":")[1] for entry in typecodes()
    }
    class_name = class_map.get(typecode)
    if not class_name:
        return None

    return get_class_names_to_collection_names_map().get(class_name)
def _record_id(ids_by_collection, collection_name, value):
    """Append ``value`` under ``collection_name`` unless the name is unresolved (falsy)."""
    if collection_name:
        ids_by_collection.setdefault(collection_name, []).append(value)


def collect_all_associated_ids(mdb, study_id):
    """Collect, per biosample of a study, the IDs reachable downstream of it.

    For every biosample whose ``part_of`` matches the study, the function
    repeatedly queries ``mdb.alldocs`` for documents whose ``has_input``
    contains any of the current IDs (starting with the biosample itself),
    then continues from those documents' ``has_output`` IDs until a query
    returns nothing. Each document ID, input ID and output ID whose typecode
    resolves to a collection (via ``get_collection_from_typecode``) is
    recorded under that collection name. An output that some document lists
    as ``was_generated_by`` is additionally recorded with a " (generated)"
    suffix.

    IDs are not de-duplicated, so the same ID can appear several times in a
    collection's list.

    NOTE(review): the traversal keeps no visited set, so a cycle in the
    ``has_input``/``has_output`` links would make the inner loop run forever.
    This presumably cannot happen in the data; confirm.

    Args:
        mdb: Database handle exposing ``study_set``, ``biosample_set`` and
            ``alldocs`` collections with ``find``/``find_one``.
        study_id: The ``id`` of the study to roll up.

    Returns:
        list[dict]: one ``{"biosample_id": ..., "associated_ids_by_collection":
        {collection_name: [ids...]}}`` entry per biosample that had at least
        one associated ID; ``[]`` when the study has no biosamples.

    Raises:
        Whatever ``raise404_if_none`` raises when the study lookup returns
        ``None`` (presumably an HTTP 404 error; confirm).
    """
    biosample_associated_objects = []
    study = raise404_if_none(
        mdb.study_set.find_one({"id": study_id}, projection={"id": 1}),
        detail="Study not found",
    )
    if not study:
        # NOTE(review): presumably unreachable if raise404_if_none raises on
        # a missing study; kept as a defensive fallback.
        print("Study not found.")
        return []

    biosample_ids = [
        biosample["id"]
        for biosample in mdb.biosample_set.find(
            {"part_of": study["id"]}, projection={"id": 1}
        )
    ]
    if not biosample_ids:
        print("No biosamples found for the study.")
        return []

    for biosample_id in biosample_ids:
        current_ids = [biosample_id]
        collected_ids_by_collection = {}

        # Breadth-first walk: each round finds the documents that consume the
        # current IDs, then moves on to the IDs those documents produce.
        while current_ids:
            documents = list(
                mdb.alldocs.find(
                    {"has_input": {"$in": current_ids}},
                    projection={"id": 1, "has_input": 1, "has_output": 1},
                )
            )
            if not documents:
                print(f"No documents found for input IDs: {current_ids}")
                break

            new_current_ids = []
            for document in documents:
                document_id = document["id"]
                _record_id(
                    collected_ids_by_collection,
                    get_collection_from_typecode(document_id),
                    document_id,
                )

                if "has_input" in document:
                    for inp in document["has_input"]:
                        _record_id(
                            collected_ids_by_collection,
                            get_collection_from_typecode(inp),
                            inp,
                        )

                if "has_output" in document:
                    for out in document["has_output"]:
                        out_collection_name = get_collection_from_typecode(out)
                        _record_id(collected_ids_by_collection, out_collection_name, out)
                        # Outputs are followed even when their collection is
                        # unknown, so the walk can continue through them.
                        new_current_ids.append(out)

                        # Only tag outputs whose collection resolved: an
                        # unresolved name would otherwise create a None key
                        # in the result, unlike every other branch here.
                        if out_collection_name and mdb.alldocs.find_one(
                            {"was_generated_by": out}, projection={"id": 1}
                        ):
                            _record_id(
                                collected_ids_by_collection,
                                out_collection_name,
                                out + " (generated)",
                            )

            current_ids = new_current_ids

        if collected_ids_by_collection:
            formatted_document = {
                "biosample_id": biosample_id,
                "associated_ids_by_collection": collected_ids_by_collection,
            }
            biosample_associated_objects.append(formatted_document)

    return biosample_associated_objects
@sujaypatil96 sujaypatil96 self-assigned this Aug 15, 2024
@sujaypatil96 sujaypatil96 linked a pull request Aug 16, 2024 that will close this issue
10 tasks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant