Skip to content

Commit

Permalink
Merge pull request #141 from compomics/use-ms2rescore_rs
Browse files Browse the repository at this point in the history
Use Rust for parsing spectrum files
  • Loading branch information
RalfG authored Apr 7, 2024
2 parents 23dfa95 + dc86649 commit 4f1ff2e
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 128 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ jobs:
run: |
pip install .[dev]
# - name: Test with pytest
# run: |
# pytest
- name: Test with pytest
run: |
pytest
- name: Test installation
run: |
ms2rescore --help
Expand Down
2 changes: 1 addition & 1 deletion ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
Expand Down
15 changes: 7 additions & 8 deletions ms2rescore/parse_psms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import re
from itertools import chain
from typing import Dict, Union

import psm_utils.io
Expand Down Expand Up @@ -60,9 +59,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:

if config["psm_id_pattern"]:
pattern = re.compile(config["psm_id_pattern"])
logger.debug("Applying `psm_id_pattern`...")
logger.debug("Applying 'psm_id_pattern'...")
logger.debug(
f"Parsing `{psm_list['spectrum_id'][0]}` to `{_match_psm_ids(psm_list['spectrum_id'][0], pattern)}`"
f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'"
)
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
psm_list["spectrum_id"] = new_ids
Expand All @@ -86,7 +85,7 @@ def _read_psms(config, psm_list):
valid_psms = 0
for psm_file in config["psm_file"]:
logger.info(
f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..."
f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
)
try:
id_file_psm_list = psm_utils.io.read_file(
Expand All @@ -97,8 +96,8 @@ def _read_psms(config, psm_list):
)
except psm_utils.io.PSMUtilsIOException:
raise MS2RescoreConfigurationError(
"Error occurred while reading PSMs. Please check the `psm_file` and "
"`psm_file_type` settings. See "
"Error occurred while reading PSMs. Please check the 'psm_file' and "
"'psm_file_type' settings. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
" for more information."
)
Expand Down Expand Up @@ -129,7 +128,7 @@ def _find_decoys(config, psm_list):
if not any(psm_list["is_decoy"]):
raise MS2RescoreConfigurationError(
"No decoy PSMs found. Please check if decoys are present in the PSM file and that "
"the `id_decoy_pattern` option is correct. See "
"the 'id_decoy_pattern' option is correct. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
" for more information."
)
Expand All @@ -150,7 +149,7 @@ def _match_psm_ids(old_id, regex_pattern):
return match[1]
except (TypeError, IndexError):
raise MS2RescoreConfigurationError(
f"`psm_id_pattern` could not be extracted from PSM spectrum IDs (i.e. {old_id})."
f"'psm_id_pattern' could not be extracted from PSM spectrum IDs (i.e. {old_id})."
" Ensure that the regex contains a capturing group?"
)

Expand Down
144 changes: 30 additions & 114 deletions ms2rescore/parse_spectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
import logging
import re
from itertools import chain
from typing import Dict, Tuple

from ms2rescore_rs import get_precursor_info
from psm_utils import PSMList
from pyteomics.mgf import MGF
from pyteomics.mzml import MzML
from rich.progress import track

from ms2rescore.exceptions import MS2RescoreError
Expand All @@ -16,122 +14,40 @@
logger = logging.getLogger(__name__)


def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
def get_missing_values(
psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False
):
"""Get missing RT/IM features from spectrum file."""
logger.debug("Extracting missing RT/IM values from spectrum file(s).")

psm_dict = psm_list.get_psm_dict()
for runs in psm_dict.values():
for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
for run, psms in runs.items():
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
spectrum_file = infer_spectrum_path(config["spectrum_path"], run)

if spectrum_file.suffix.lower() == ".mzml":
rt_dict, im_dict = _parse_values_from_mzml(
spectrum_file, config, run, missing_rt, missing_im
)
elif spectrum_file.suffix.lower() == ".mgf":
rt_dict, im_dict = _parse_values_from_mgf(
spectrum_file, config, run, missing_rt, missing_im
)

for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]):
if value_dict:
try:
psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run]
except KeyError:
raise ParsingError(
f"Could not parse {value} values from spectrum file for run {run}."
)


def _parse_values_from_mgf(
spectrum_file, config, run, missing_rt, missing_im
) -> Tuple[Dict, Dict]:
"""
Parse retention time and/or ion mobility from an MGF file.
Notes
-----
- Extracting values (e.g., ion mobility) according to the Matrix documentation:
http://www.matrixscience.com/help/data_file_help.html
"""
rt_dict = {}
im_dict = {}

spectrum_id_pattern = re.compile(
config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
)

for spectrum in MGF(str(spectrum_file)):
matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group()
if missing_rt:
try:
rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"])
except KeyError:
raise ParsingError(
"Could not parse retention time (`rtinseconds`) from spectrum file for "
f"run {run}. Please make sure that the retention time key is present in the "
"spectrum file or disable the relevant feature generator."
)
if missing_im:
try:
im_dict[matched_id] = float(spectrum["params"]["ion_mobility"])
except KeyError:
raise ParsingError(
"Could not parse ion mobility (`ion_mobility`) from spectrum file "
f"for run {run}. Please make sure that the ion mobility key is present in the "
"spectrum file or disable the relevant feature generator."
)

return rt_dict, im_dict


def _parse_values_from_mzml(
spectrum_file, config, run, missing_rt, missing_im
) -> Tuple[Dict, Dict]:
"""Parse retention time and/or ion mobility from an mzML file."""
rt_dict = {}
im_dict = {}

spectrum_id_pattern = re.compile(
config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
)

for spectrum in MzML(str(spectrum_file)):
matched_id = spectrum_id_pattern.match(spectrum["id"]).group()
if missing_rt:
try:
rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"])
except KeyError:
raise ParsingError(
"Could not parse retention time (`scan start time`) from spectrum file for "
f"run {run}. Please make sure that the retention time key is present in the "
"spectrum file or disable the relevant feature generator."
)
if missing_im:
try:
im_dict[matched_id] = float(
spectrum["scanList"]["scan"][0]["reverse ion mobility"]
)
except KeyError:
raise ParsingError(
"Could not parse ion mobility (`reverse ion mobility`) from spectrum file "
f"for run {run}. Please make sure that the ion mobility key is present in the "
"spectrum file or disable the relevant feature generator."
)

return rt_dict, im_dict


class ParseMGFError(MS2RescoreError):
"""Error parsing MGF file."""

pass


class ParsingError(MS2RescoreError):
logger.debug("Reading spectrum file: '%s'", spectrum_file)
precursors = get_precursor_info(str(spectrum_file))

if config["spectrum_id_pattern"]:
spectrum_id_pattern = re.compile(config["spectrum_id_pattern"])
precursors = {
spectrum_id_pattern.search(spectrum_id).group(1): precursor
for spectrum_id, precursor in precursors.items()
}

for psm in psm_list_run:
try:
if rt_required:
psm.retention_time = precursors[psm.spectrum_id].rt
if im_required:
psm.ion_mobility = precursors[psm.spectrum_id].im
if not psm.precursor_mz:
psm.precursor_mz = precursors[psm.spectrum_id].mz
except KeyError as e:
raise SpectrumParsingError(
f"Could not extract missing RT/IM values from spectrum file for run {run}."
) from e


class SpectrumParsingError(MS2RescoreError):
"""Error parsing retention time from spectrum file."""

pass
4 changes: 4 additions & 0 deletions ms2rescore/rescoring_engines/mokapot.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ def convert_psm_list(
feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)

# Ensure filename for FlashLFQ txt output
if not combined_df["run"].notnull().all():
combined_df["run"] = "ms_run"

feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None

lin_psm_data = LinearPsmDataset(
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.8"
dependencies = [
"ms2rescore_rs",
"numpy>=1.16.0; python_version != '3.11'",
"numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
"numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
"pandas>=1.0",
"rich>=12",
"pyteomics>=4.1.0",
Expand All @@ -47,7 +48,7 @@ dependencies = [
"psm_utils>=0.4",
"customtkinter>=5,<6",
"mokapot>=0.9",
"pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
"pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
"jinja2>=3",
"plotly>=5",
]
Expand Down
13 changes: 13 additions & 0 deletions tests/test_data/test.mgf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN IONS
TITLE=peptide: peptide1
CHARGE=2+
PEPMASS=475.137295
ION_MOBILITY=42.42
RTINSECONDS=51.2
72.04439 100
148.06043 600
232.07504 300
263.08737 400
347.10198 500
423.11802 200
END IONS
23 changes: 23 additions & 0 deletions tests/test_parse_spectra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pytest
from psm_utils import PSM, PSMList

from ms2rescore.parse_spectra import get_missing_values


def test_get_missing_values():
psm_list = PSMList(
psm_list=[
PSM(peptidoform="PEPTIDEK/2", spectrum_id="peptide1"),
]
)
get_missing_values(
psm_list,
config={
"spectrum_path": "tests/test_data/test.mgf",
"spectrum_id_pattern": "peptide: (.*)",
},
rt_required=True,
im_required=True,
)
assert psm_list[0].retention_time == pytest.approx(0.853, 0.001)
assert psm_list[0].ion_mobility == pytest.approx(42.42, 0.01)

0 comments on commit 4f1ff2e

Please sign in to comment.