Source code for dimspy.process.peak_filters

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2017-2020 Ralf Weber, Albert Zhou.
#
# This file is part of DIMSpy.
#
# DIMSpy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# DIMSpy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with DIMSpy.  If not, see <https://www.gnu.org/licenses/>.
#


import logging
from functools import reduce
from typing import Union, Sequence, Tuple, Any

import numpy as np
from ..models.peak_matrix import PeakMatrix, mask_peakmatrix, unmask_peakmatrix
from ..models.peaklist import PeakList


# peaklist filters
def filter_attr(pl: PeakList, attr_name: str, max_threshold: Union[int, float, None] = None,
                min_threshold: Union[int, float, None] = None, flag_name: Union[str, None] = None,
                flag_index: Union[int, None] = None):
    """
    Peaklist attribute values filter.

    :param pl: The target peaklist
    :param attr_name: Name of the target attribute
    :param max_threshold: Maximum threshold. A peak will be unflagged if the value of its attr_name is larger
        than the threshold. Default = None, indicating no threshold
    :param min_threshold: Minimum threshold. A peak will be unflagged if the value of its attr_name is smaller
        than the threshold. Default = None, indicating no threshold
    :param flag_name: Name of the new flag attribute. Default = None, indicating using attr_name + '_flag'
    :param flag_index: Index of the new flag to be inserted into the peaklist. Default = None
    :raises ValueError: If neither min_threshold nor max_threshold is provided
    :rtype: PeakList object

    Both bounds are inclusive: a peak stays flagged when min_threshold <= value <= max_threshold.
    This filter accepts real value attributes only.

    """
    if min_threshold is None and max_threshold is None:
        raise ValueError('must specify minimum or maximum threshold value')

    if flag_name is None:
        flag_name = attr_name + '_flag'

    values = pl[attr_name]
    # A missing bound contributes a scalar True, which np.logical_and broadcasts
    # against the comparison array produced by the other bound.
    above_min = True if min_threshold is None else (min_threshold <= values)
    below_max = True if max_threshold is None else (values <= max_threshold)
    flags = np.logical_and(above_min, below_max)

    return pl.add_attribute(flag_name, flags, is_flag=True, on_index=flag_index)
def filter_ringing(pl: PeakList, threshold: float, bin_size: Union[int, float] = 1.0,
                   flag_name: str = 'ringing_flag', flag_index: Union[int, None] = None):
    """
    Peaklist ringing filter.

    :param pl: The target peaklist
    :param threshold: Intensity threshold ratio, must be within [0.0, 1.0]
    :param bin_size: Width of each mz chunk used for intensity filtering, expressed in the same units as the
        mz values. Default = 1.0
    :param flag_name: Name of the new flag attribute. Default = 'ringing_flag'
    :param flag_index: Index of the new flag to be inserted into the peaklist. Default = None
    :raises ValueError: If threshold is outside the range [0.0, 1.0]
    :rtype: PeakList object

    This filter will split the mz values into bin_size chunks, and search the highest intensity value for each
    chunk. A peak stays flagged only if its intensity is strictly larger than threshold x the highest intensity
    in its chunk; all other peaks will be unflagged.

    """
    if not 0 <= threshold <= 1:
        raise ValueError('mzr_remove: Provide a value in the range [0.0, 1.0]')

    mz = np.asarray(pl.mz)
    intensity = np.asarray(pl.intensity)

    # Bin edges start at floor(min mz) and are shifted down by 0.5, exactly as before.
    edges = np.arange(np.floor(np.min(mz)), np.ceil(np.max(mz)) + bin_size, bin_size) - 0.5
    bin_ids = np.digitize(mz, edges)

    # Scatter-reduce the per-bin maximum and gather it back per peak. Unlike
    # concatenating per-bin lists, this does not rely on the peaks being ordered
    # by mz and avoids the quadratic list concatenation.
    bin_max = np.full(bin_ids.max() + 1, -np.inf)
    np.maximum.at(bin_max, bin_ids, intensity)
    reference = bin_max[bin_ids]

    return pl.add_attribute(flag_name, intensity > (reference * threshold), is_flag=True, on_index=flag_index)
def filter_mz_ranges(pl: PeakList, mz_ranges: Sequence[Tuple[float, float]], flag_name: str = 'mz_ranges_flag',
                     flagged_only: bool = False, flag_index: Union[int, None] = None):
    """
    Peaklist mz range filter.

    :param pl: The target peaklist
    :param mz_ranges: The mz ranges to remove. Must be in the format of
        [(mz_min1, mz_max1), (mz_min2, mz_max2), ...]
    :param flag_name: Name of the new flag attribute. Default = 'mz_ranges_flag'
    :param flagged_only: Passed through to pl.get_attribute and pl.add_attribute. When True the flag array is
        sized by pl.shape[0], otherwise by pl.full_size. Default = False
    :param flag_index: Index of the new flag to be inserted into the peaklist. Default = None
    :raises ValueError: If a range does not have exactly two values, or its start is not smaller than its end
    :rtype: PeakList

    This filter will unflag all the peaks whose mz values are within any of the ranges in mz_ranges.
    Both range boundaries are inclusive.

    """
    size = pl.shape[0] if flagged_only else pl.full_size
    flags = np.ones(size, dtype=bool)

    # The mz values do not change between ranges, so look them up once.
    mz = pl.get_attribute("mz", flagged_only)

    for mzr in mz_ranges:
        if len(mzr) != 2:
            raise ValueError(
                'mzr_remove: Provide a list of "start" and "end" values for each m/z range that needs to be removed.')
        if mzr[0] >= mzr[1]:
            raise ValueError('mzr_remove: Start value cannot be larger then end value.')
        flags[(mz >= mzr[0]) & (mz <= mzr[1])] = False

    pl.add_attribute(flag_name, flags, flagged_only=flagged_only, is_flag=True, on_index=flag_index)
    return pl
# PeakMatrix filters
def filter_rsd(pm: PeakMatrix, rsd_threshold: Union[int, float], qc_tag: Any, on_attr: str = 'intensity',
               flag_name: str = 'rsd_flag'):
    """
    PeakMatrix RSD filter.

    :param pm: The target peak matrix
    :param rsd_threshold: Threshold of the RSD of the QC samples
    :param qc_tag: Tag (label) to unmask qc samples
    :param on_attr: Calculate RSD on given attribute. Default = "intensity"
    :param flag_name: Name of the new flag. Default = 'rsd_flag'
    :rtype: PeakMatrix

    The RSD values are obtained from pm.rsd(qc_tag, on_attr=on_attr). A peak is unflagged when its QC RSD value
    is nan or larger than the threshold. A warning is logged if any nan is present.

    """
    qc_rsd = pm.rsd(qc_tag, on_attr=on_attr)
    if np.isnan(qc_rsd).any():
        logging.warning('nan found in QC rsd values, filter might not work properly')

    keep = []
    for value in qc_rsd:
        # Keep the peak only when the RSD is a real number that does not exceed the threshold.
        keep.append((not np.isnan(value)) and (not value > rsd_threshold))

    pm.add_flag(flag_name, keep)
    return pm
def filter_fraction(pm: PeakMatrix, fraction_threshold: float, within_classes: bool = False,
                    class_tag_type: Any = None, flag_name: str = 'fraction_flag'):
    """
    PeakMatrix fraction filter.

    :param pm: The target peak matrix
    :param fraction_threshold: Threshold of the sample fractions
    :param within_classes: Whether to calculate the fraction array within each class. Default = False
    :param class_tag_type: Tag type to unmask samples within the same class (e.g. "classLabel"). Default = None
    :param flag_name: Name of the new flag. Default = 'fraction_flag'
    :raises KeyError: If within_classes is True but class_tag_type is None
    :raises AttributeError: If within_classes is True and some peaklist tags lack class_tag_type
    :rtype: PeakMatrix object

    Without classes, a peak stays flagged when pm.fraction reaches the threshold. Within classes, a peak stays
    flagged when the fraction reaches the threshold in at least one class (based on class_tag_type). All other
    peaks will be unflagged.

    """
    if not within_classes:
        # Simple case: one fraction array over every sample.
        pm.add_flag(flag_name, pm.fraction >= fraction_threshold)
        return pm

    if class_tag_type is None:
        raise KeyError('must provide class tag type for within classes filtering')

    untyped = [tags for tags in pm.peaklist_tags if not tags.has_tag_type(class_tag_type)]
    if untyped:
        raise AttributeError('not all tags have tag type [%s]' % class_tag_type)

    # Accumulate an element-wise OR of the per-class results.
    passed = np.zeros(pm.shape[1])
    for class_tag in pm.tags_of(class_tag_type):
        with unmask_peakmatrix(pm, class_tag) as unmasked:
            passed = np.logical_or(passed, unmasked.fraction >= fraction_threshold)

    pm.add_flag(flag_name, passed)
    return pm
def filter_blank_peaks(pm: PeakMatrix, blank_tag: Any, fraction_threshold: Union[int, float] = 1,
                       fold_threshold: Union[int, float] = 1, method: str = 'mean', rm_blanks: bool = True,
                       flag_name: str = 'blank_flag'):
    """
    PeakMatrix blank filter.

    :param pm: The target peak matrix
    :param blank_tag: Tag (label) to mask blank samples. e.g Tag("blank", "classLabel")
    :param fraction_threshold: Threshold of the sample fractions. Default = 1
    :param fold_threshold: Threshold of the blank sample intensity folds. Default = 1
    :param method: Method to calculate blank sample intensity array. Valid values include 'mean', 'median', and
        'max'. Default = 'mean'
    :param rm_blanks: Whether to remove (not mask) blank samples after filtering
    :param flag_name: Name of the new flag. Default = 'blank_flag'
    :raises ValueError: If blank_tag is not found in any of pm.peaklist_tags, or method is not one of the valid
        values
    :rtype: PeakMatrix object

    This filter will calculate the intensity array of the blanks using the "method", ignoring blank intensities
    that are not larger than zero, and compare with the intensities of the other samples. A peak will be
    unflagged if it has a blank intensity and fewer than fraction_threshold x (number of other samples) of its
    intensity values reach the blank intensity x fold_threshold. Peaks without any blank intensity stay flagged.

    """
    if not any(blank_tag in x for x in pm.peaklist_tags):
        raise ValueError('blank tag [%s] does not exist' % blank_tag)
    if method not in ('mean', 'median', 'max'):
        raise ValueError('filter method must be mean, median or max')

    with unmask_peakmatrix(pm, blank_tag) as m:
        # Hide non-positive intensities so they do not take part in the summary statistic.
        blank_ints = np.ma.masked_array(m.intensity_matrix, mask=~(m.intensity_matrix > 0))
        if blank_ints.shape[0] == 1:
            ints = blank_ints[0]
        else:
            # Use the numpy.ma implementations: the plain np.median does not honour
            # the mask, so hidden zero intensities would skew the median.
            ints = getattr(np.ma, method)(blank_ints, axis=0)
        # getmaskarray always yields a full boolean array, even when nothing is masked.
        imsk = np.ma.getmaskarray(ints)
        ints = np.array(ints) * fold_threshold

    with mask_peakmatrix(pm, blank_tag) as m:
        failed_int = np.sum(m.intensity_matrix >= ints, axis=0) < (fraction_threshold * m.shape[0])
        # Unflag only the peaks that have a blank intensity AND fail the fraction test.
        m.add_flag(flag_name, ~(~imsk & failed_int))

    if rm_blanks:
        pm = pm.remove_samples(np.where([x.has_tag(blank_tag) for x in pm.peaklist_tags])[0])
    return pm