#!/usr/bin/env python3
#
# unknowns.py
"""
Metadata and pipeline for unknown samples.
"""
#
# Copyright © 2023 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# stdlib
import json
from operator import itemgetter
from typing import Dict, Iterator, List, Tuple, Type
# 3rd party
import attrs
import pyms_nist_search
import tomli_w
from dom_toml.config import Config
from dom_toml.config.fields import String
from domdf_python_tools.paths import PathPlus
from domdf_python_tools.typing import PathLike
from libgunshotmatch.consolidate import ConsolidatedPeakFilter
from libgunshotmatch.datafile import Repeat
from libgunshotmatch.method import Method
from libgunshotmatch.project import Project
from libgunshotmatch.search import identify_peaks
from libgunshotmatch.utils import _fix_init_annotations
from pyms.DPA.Alignment import exprl2alignment
from pyms.Experiment import Experiment
# this package
from gunshotmatch_pipeline import prepare_datafile
from gunshotmatch_pipeline.nist_ms_search import engine_on_demand
from gunshotmatch_pipeline.projects import LoaderMixin
from gunshotmatch_pipeline.utils import tomllib
# Names exported by ``from gunshotmatch_pipeline.unknowns import *``.
# ``Unknowns`` is a public class defined in this module, so it is listed here too.
__all__ = ("UnknownSettings", "filter_and_identify_peaks", "process_unknown", "Unknowns")
@_fix_init_annotations
@attrs.define
class UnknownSettings(Config, LoaderMixin):
    """
    Configuration describing a single unknown propellant or OGSR sample.

    Instances can be created from TOML or JSON text and written back out as TOML.

    .. autosummary-widths:: 28/100
    """

    # NOTE: the field order below is also the positional-argument order of ``__init__``.

    #: Name (or other identifier) of the unknown sample.
    name: str = String.field(default=attrs.NOTHING)

    #: Filename of the input datafile.
    datafile: str = String.field(default=attrs.NOTHING)

    #: Relative or absolute filename of the method TOML file. The table name is "method".
    method: str = String.field(default=attrs.NOTHING)

    #: Relative or absolute filename of the configuration TOML file. The table name is "config".
    config: str = String.field(default=attrs.NOTHING)

    #: Relative or absolute path of the directory the output files are written to.
    output_directory: str = String.field(default=attrs.NOTHING)

    #: Relative or absolute path of the directory holding the data files. Defaults to an empty string.
    data_directory: str = String.field(default='')

    @property
    def datafile_path(self) -> PathPlus:
        """
        Absolute path to the datafile, formed by joining ``data_directory`` and ``datafile``.
        """

        data_dir = PathPlus(self.data_directory)
        return (data_dir / self.datafile).abspath()

    @classmethod
    def from_toml(cls: Type["UnknownSettings"], toml_string: str) -> "UnknownSettings":
        """
        Construct an :class:`~.UnknownSettings` from a TOML document.

        Only the top-level keys matching the fields of this class are used;
        any other keys in the document are ignored.

        :param toml_string:
        """

        understood_keys = {"name", "datafile", "method", "config", "output_directory", "data_directory"}

        init_kwargs: Dict[str, str] = {}
        for key, value in tomllib.loads(toml_string).items():
            if key in understood_keys:
                init_kwargs[key] = value

        return cls(**init_kwargs)

    @classmethod
    def from_json(cls: Type["UnknownSettings"], json_string: str) -> "UnknownSettings":
        """
        Construct an :class:`~.UnknownSettings` from a JSON document.

        Every key of the decoded object is passed to the constructor as a keyword argument
        (unlike :meth:`from_toml`, no filtering of keys is performed).

        :param json_string:
        """

        return cls(**json.loads(json_string))

    def to_toml(self) -> str:
        """
        Serialise this :class:`~.UnknownSettings` as a TOML string.
        """

        as_dict = self.to_dict()
        return tomli_w.dumps(as_dict)
def filter_and_identify_peaks(
    repeat: Repeat,
    method: Method,
    engine: pyms_nist_search.Engine,
) -> None:
    """
    Select the peaks of interest in ``repeat`` and identify the compounds behind them.

    If ``method.alignment.top_n_peaks`` is non-zero only that many of the largest peaks (by area)
    are considered. Of the remaining peaks, those with an area below ``method.alignment.min_peak_area``
    are discarded. The identified peaks are stored on ``repeat.qualified_peaks``.

    :param repeat: The repeat whose peaks are to be identified. Modified in place.
    :param method: Provides the ``alignment.top_n_peaks`` and ``alignment.min_peak_area`` settings.
    :param engine: NIST MS Search engine.
    """

    alignment_settings = method.alignment
    peak_limit = alignment_settings.top_n_peaks
    area_threshold = alignment_settings.min_peak_area

    # (area, retention time) for every peak in the repeat
    area_rt_pairs: List[Tuple[float, float]] = []
    for peak in repeat.peaks:
        assert peak.area is not None
        area_rt_pairs.append((peak.area, peak.rt))

    if peak_limit:
        # A limit of ``0`` means no cap on the number of peaks.
        print(f"Filtering to the largest {peak_limit} peaks with a peak area above {area_threshold}")
        largest_first = sorted(area_rt_pairs, key=itemgetter(0), reverse=True)
        area_rt_pairs = largest_first[:peak_limit]
    else:
        print(f"Filtering to peaks with an average peak area above {area_threshold}")

    # Retention times are divided by 60 before searching
    # (presumably seconds -> minutes; confirm against ``identify_peaks``).
    selected_times = []
    for area, rt in area_rt_pairs:
        if area >= area_threshold:
            selected_times.append(rt / 60)

    print(f"Identifying Compounds for {repeat.name}")
    repeat.qualified_peaks = identify_peaks(engine, selected_times, repeat.peaks)
def process_unknown(
    unknown: UnknownSettings,
    output_dir: PathLike,
    recreate: bool = False,
) -> Project:
    """
    Run the pipeline for an "unknown" sample and return the resulting project.

    A previously exported ``.gsmp`` file in ``output_dir`` is loaded and returned as-is
    unless ``recreate`` is set. Otherwise the repeat is loaded from its ``.gsmr`` file
    (or parsed from the datafile and identified), wrapped in a single-repeat project,
    consolidated if necessary, and exported to ``output_dir``.

    :param unknown: Settings for the unknown sample.
    :param output_dir: Directory for the ``.gsmr`` and ``.gsmp`` files. Created if missing.
    :param recreate: Force regeneration of ``.gsmr`` and ``.gsmp`` files.
    """

    # NOTE(review): the nesting below was reconstructed from the data flow
    # (``search`` is only available inside the ``with`` block) -- confirm against the repository.

    output_dir = PathPlus(output_dir)
    output_dir.maybe_make()

    method = unknown.load_method()
    config = unknown.load_config()

    project_file = output_dir / f"{unknown.name}.gsmp"

    if project_file.exists() and not recreate:
        # Reuse the project from an earlier run.
        print(f"Loading Unknown from file {project_file.as_posix()!r}")
        return Project.from_file(project_file)

    with engine_on_demand(config.pyms_nist_search) as search:
        repeat_file = (output_dir / unknown.datafile).with_suffix(".gsmr")

        if repeat_file.exists() and not recreate:
            print(f"Loading Repeat from file {repeat_file.as_posix()!r}")
            repeat = Repeat.from_file(repeat_file)
            repeat.peaks.datafile_name = repeat.name
        else:
            print("\nParsing", unknown.datafile_path)
            # Only the repeat (first element of the returned pair) is needed here.
            repeat = prepare_datafile(unknown.datafile_path, method)[0]
            filter_and_identify_peaks(repeat, method, engine=search.engine)
            repeat.export(output_dir)

        # A project containing just this one repeat.
        experiment = Experiment(repeat.name, repeat.peaks)
        project = Project(
            name=unknown.name,
            alignment=exprl2alignment([experiment])[0],
            datafile_data={repeat.name: repeat},
        )

        if not project.consolidated_peaks:
            # The match factor threshold is 80% of the method's value,
            # and a peak need only appear once as there is a single repeat.
            relaxed_match_factor = int(method.consolidate.min_match_factor * 0.8)
            cp_filter = ConsolidatedPeakFilter(
                name_filter=method.consolidate.name_filter,
                min_match_factor=relaxed_match_factor,
                min_appearances=1,
            )
            project.consolidate(search.engine, cp_filter)

        export_filename = project.export(output_dir)
        print(f"Project saved to {export_filename!r}")

    return project
@_fix_init_annotations
@attrs.define
class Unknowns(Config):
    """
    Collection of the settings for several unknown samples, keyed by name.

    Analogue of :class:`gunshotmatch_pipeline.projects.Projects`.

    .. versionadded:: 0.11.0
    """

    #: Mapping of unknown name to the settings for that unknown.
    per_unknown_settings: Dict[str, UnknownSettings] = attrs.field(factory=dict)

    @classmethod
    def from_toml(cls: Type["Unknowns"], toml_string: str) -> "Unknowns":
        """
        Construct a :class:`~.Unknowns` from a TOML document.

        Each top-level table becomes an :class:`~.UnknownSettings`, with the table's name
        used as the unknown's name. Top-level values which are not tables are ignored.

        :param toml_string:
        """

        settings: Dict[str, UnknownSettings] = {}

        for table_name, table in tomllib.loads(toml_string).items():
            if isinstance(table, dict):
                settings[table_name] = UnknownSettings(table_name, **table)

        return cls(settings)

    def get_unknown_settings(self, unknown_name: str) -> UnknownSettings:
        """
        Look up the settings for the unknown called ``unknown_name``.

        :param unknown_name:

        :raises KeyError: If there are no settings for an unknown with that name.
        """

        settings = self.per_unknown_settings
        return settings[unknown_name]

    # ``unknowns[name]`` is shorthand for ``unknowns.get_unknown_settings(name)``.
    __getitem__ = get_unknown_settings

    def iter_unknown_settings(self) -> Iterator[UnknownSettings]:
        """
        Yield the settings for each unknown in turn.
        """

        for settings in self.per_unknown_settings.values():
            yield settings

    # Iterating over the object yields the per-unknown settings.
    __iter__ = iter_unknown_settings

    def load_unknown(self, unknown_name: str) -> Project:
        """
        Load the previously created project for the given unknown from disk.

        The file read is ``<unknown_name>.gsmp`` in the unknown's output directory.

        :param unknown_name:
        """

        settings = self.get_unknown_settings(unknown_name)
        gsmp_path = PathPlus(settings.output_directory) / f"{unknown_name}.gsmp"
        return Project.from_file(gsmp_path)

    def iter_loaded_unknowns(self) -> Iterator[Project]:
        """
        Yield the :class:`~libgunshotmatch.project.Project` loaded from disk for each unknown.
        """

        yield from (self.load_unknown(name) for name in self.per_unknown_settings)

    def __len__(self) -> int:
        """
        The number of unknowns with settings.
        """

        settings = self.per_unknown_settings
        return len(settings)