Source code for gunshotmatch_pipeline.results

#!/usr/bin/env python3
#
#  results.py
"""
Results presented in different formats.
"""
#
#  Copyright © 2020-2023 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#

# stdlib
import datetime
import string
from collections import defaultdict
from string import ascii_lowercase
from typing import Dict, List

# 3rd party
import pandas  # type: ignore[import-untyped]
from libgunshotmatch.project import Project
from typing_extensions import TypedDict

# Public API of this module: the names exported by a wildcard import.
# The result-building functions are listed first, followed by the TypedDict type hints.
__all__ = (
		"compounds",
		"compounds_from_matches",
		"machine_learning_data",
		"matches",
		"unknown",
		"unknown_machine_learning_data",
		"MatchesMetadata",
		"MatchesCompounds",
		"Matches",
		)


class MatchesMetadata(TypedDict):
	"""
	Type hint for the ``metadata`` key in :typeddict:`~.Matches`.
	"""

	# Name of the project the data was generated from.
	project: str
	# Original filename of each datafile (repeat) in the project.
	original_filenames: List[str]
	# Time the data was generated, as an ISO 8601 formatted string.
	created: str


#: Type hint for the ``compounds`` key in :typeddict:`~.Matches`.
# The functional TypedDict syntax is used because the keys contain spaces
# and so cannot be written as class attributes.
MatchesCompounds = TypedDict(
		"MatchesCompounds",
		{
				"Mean Retention Time": float,  # in minutes
				# Area of the consolidated peak.
				"Mean Peak Area": float,
				# CAS number of the matched compound.
				"CAS": str,
				# Individual retention times for the peak, in minutes.
				"Retention Times": List[float],
				# Individual peak areas for the peak.
				"Peak Areas": List[float],
				# Hit numbers, match factors and reverse match factors of the matched compound.
				# NOTE(review): presumably one entry per repeat -- confirm against libgunshotmatch.
				"Hit Numbers": List[int],
				"Match Factors": List[int],
				"Reverse Match Factors": List[int],
				},
		)


class Matches(TypedDict):
	"""
	Return type from :func:`~.matches`.
	"""

	# Information about the project the data was generated from.
	metadata: MatchesMetadata
	# Mapping of compound name to the data for the peak matched to that compound.
	compounds: Dict[str, MatchesCompounds]


def matches(project: Project) -> Matches:
	"""
	Returns data on the "best match" for each peak.

	The first hit of each consolidated peak is used as the match for that peak,
	and the output's ``compounds`` mapping is keyed by the name of that hit.
	Retention times are converted to minutes by dividing by 60.

	:param project: The project to report on. Its ``consolidated_peaks`` must not be :py:obj:`None`.

	:rtype:

	.. latex:clearpage::
	"""

	matches_json_data: Matches = {
			"metadata": {
					"project": project.name,
					"original_filenames": [
							repeat.datafile.original_filename for repeat in project.datafile_data.values()
							],
					"created": datetime.datetime.now().isoformat(),
					},
			"compounds": {},
			}

	assert project.consolidated_peaks is not None

	compound_table = matches_json_data["compounds"]

	for peak in project.consolidated_peaks:
		hit = peak.hits[0]

		# TODO: account for multiple peaks with same name
		# (currently a later peak overwrites an earlier one with the same hit name)
		compound_table[hit.name] = {
				"Mean Retention Time": peak.rt / 60,  # in minutes
				"Mean Peak Area": peak.area,
				"CAS": hit.cas,
				"Retention Times": [rt / 60 for rt in peak.rt_list],  # in minutes
				"Peak Areas": peak.area_list,
				"Hit Numbers": hit.hit_numbers,
				"Match Factors": hit.mf_list,
				"Reverse Match Factors": hit.rmf_list,
				}

	return matches_json_data


# Name of a compound, as used for the keys of the ``compounds`` mapping in ``Matches``.
_CompoundName = str
# Name of a project.
_ProjectName = str
# A single peak area value.
_PeakArea = float
# Mapping of compound name -> project name -> list of peak areas for that compound in that project.
_CompoundDataType = Dict[_CompoundName, Dict[_ProjectName, List[_PeakArea]]]
# Project name combined with a per-repeat identifier (``"<project>-<identifier>"``),
# as built by ``machine_learning_data``.
_PropellantNameAndID = str


def compounds_from_matches(*matches_data: Matches, normalize: bool = False) -> _CompoundDataType:
	r"""
	Prepares data on the compounds in each repeat from the output of :func:`~.matches` for each project.

	The output mapping gives the peak areas for each compound in the different projects, grouped by compound.

	:param \*matches_data: The output of :func:`~.matches` for each project.
	:param normalize: If :py:obj:`True`, each peak area is divided by the sum of the peak areas
		of all compounds in the same repeat, so the values for each repeat sum to ``1``.

	:raises ValueError: If ``normalize`` is :py:obj:`True` and the number of peak areas for a compound
		differs from the number of original filenames in the project's metadata.
	"""

	# Mapping of compound name -> project name -> peak areas (one value per repeat)
	compound_data: _CompoundDataType = defaultdict(dict)

	for loaded_data in matches_data:
		project_name = loaded_data["metadata"]["project"]
		project_compounds = loaded_data["compounds"]

		if normalize:
			repeat_count = len(loaded_data["metadata"]["original_filenames"])

			# One row per repeat, one column per compound.
			# A positional index is used (rather than letters of the alphabet)
			# so that projects with more than 26 repeats are supported.
			# The explicit index also makes pandas reject peak area lists of the wrong length.
			peak_area_table = pandas.DataFrame(
					{name: data["Peak Areas"] for name, data in project_compounds.items()},
					index=range(repeat_count),
					)

			# Divide each row by its total, giving each compound's share of the repeat's total peak area.
			normalized = peak_area_table.div(peak_area_table.sum(axis=1), axis=0)

			for compound in normalized.columns:
				compound_data[compound][project_name] = list(normalized[compound])
		else:
			for compound, data in project_compounds.items():
				compound_data[compound][project_name] = data["Peak Areas"]

	return compound_data


def compounds(*project: Project, normalize: bool = False) -> _CompoundDataType:
	r"""
	Returns data on the compounds in each repeat in the project(s).

	The output mapping gives the peak areas for each compound in the different projects, grouped by compound.

	:param \*project: The project(s) to report on.
	:param normalize: If :py:obj:`True`, each peak area is divided by the sum of the peak areas
		of all compounds in the same repeat.
	"""

	# Build the "best match" data for each project, then regroup the peak areas by compound.
	return compounds_from_matches(*(matches(p) for p in project), normalize=normalize)


def unknown(unknown_project: Project, normalize: bool = False) -> _CompoundDataType:
	"""
	Returns results for an unknown sample.

	The output mapping is formatted the same as that from :func:`~.compounds`, but with only one "project".

	:param unknown_project: The project for the unknown sample.
	:param normalize: If :py:obj:`True`, each peak area is divided by the sum of the peak areas
		of all compounds in the same repeat.
	"""

	return compounds_from_matches(matches(unknown_project), normalize=normalize)


def _repeat_identifier(index: int) -> str:
	"""
	Returns the lowercase alphabetic identifier for the repeat at position ``index``.

	``0`` to ``25`` give ``'a'`` to ``'z'``; later positions continue ``'aa'``, ``'ab'``, and so on.

	:param index: Zero-based position of the repeat.
	"""

	letters = []
	remaining = index + 1

	while remaining > 0:
		remaining, offset = divmod(remaining - 1, len(ascii_lowercase))
		letters.append(ascii_lowercase[offset])

	return ''.join(reversed(letters))


def machine_learning_data(
		*project: Project,
		normalize: bool = False,
		) -> Dict[_CompoundName, Dict[_PropellantNameAndID, _PeakArea]]:
	r"""
	Returns data formatted for training a decision tree or other machine learning model.

	The output maps each compound name to a mapping of ``"<project name>-<identifier>"`` to peak area,
	where the identifier is a lowercase letter (``a``, ``b``, ...) for each repeat.
	Repeats after the 26th are given the identifiers ``aa``, ``ab`` etc. rather than being discarded.

	:param \*project: The project(s) to report on.
	:param normalize: If :py:obj:`True`, each peak area is divided by the sum of the peak areas
		of all compounds in the same repeat.
	"""

	decision_tree_compound_data: Dict[_CompoundName, Dict[_PropellantNameAndID, _PeakArea]] = defaultdict(dict)

	for compound, propellant_peak_areas in compounds(*project, normalize=normalize).items():
		sample_areas = decision_tree_compound_data[compound]

		for propellant, peak_areas in propellant_peak_areas.items():
			# Flatten the per-project lists into one entry per repeat.
			for position, peak_area in enumerate(peak_areas):
				sample_areas[f"{propellant}-{_repeat_identifier(position)}"] = peak_area

	return decision_tree_compound_data


def unknown_machine_learning_data(
		unknown_project: Project,
		normalize: bool = False,
		) -> Dict[_CompoundName, Dict[_PropellantNameAndID, _PeakArea]]:
	"""
	Returns data for an unknown sample, formatted for a decision tree or other machine learning model.

	The output maps each compound name to a mapping of the project name to a single peak area.

	:param unknown_project: The project for the unknown sample.
	:param normalize: If :py:obj:`True`, each peak area is divided by the sum of the peak areas
		of all compounds in the same repeat.
	"""

	decision_tree_compound_data: Dict[_CompoundName, Dict[_PropellantNameAndID, _PeakArea]] = defaultdict(dict)

	for compound, propellant_peak_areas in unknown(unknown_project, normalize=normalize).items():
		# Only one project was supplied, so each compound has exactly one entry.
		assert len(propellant_peak_areas) == 1

		for propellant, peak_areas in propellant_peak_areas.items():
			# NOTE(review): every peak area is written to the same key, so only the last one is kept.
			# Presumably the unknown sample is expected to have a single repeat -- confirm.
			for peak_area in peak_areas:
				decision_tree_compound_data[compound][f"{propellant}"] = peak_area

	return decision_tree_compound_data