Source code for chemistry_tools.spectrum_similarity

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
#
# Copyright 2019-2020 Dominic Davis-Foster
#
# Adapted from SpectrumSimilarity.R
# Part of OrgMassSpecR
# Copyright 2011-2017 Nathan Dodder <nathand@sccwrp.org>
# Available under the BSD 2-Clause License


__author__ = "Dominic Davis-Foster"
__copyright__ = "Copyright 2019 Dominic Davis-Foster"

__license__ = "LGPL"
__version__ = "0.1.0"
__email__ = "dominic@davis-foster.co.uk"

import numpy as np
import pandas as pd


[docs]def SpectrumSimilarity(
		spec_top, spec_bottom, t=0.25, b=10, top_label=None,
		bottom_label=None, xlim=(50, 1200), x_threshold=0, print_alignment=False,
		print_graphic=True, output_list=False,
		):
	"""

	:param spec_top: Array containing the experimental spectrum's peak list with the m/z values in the
		first column and corresponding intensities in the second
	:type spec_top: numpy.array
	:param spec_bottom: Array containing the reference spectrum's peak list with the m/z values in the
		first column and corresponding intensities in the second
	:type spec_bottom: numpy.array
	:param t: numeric value specifying the tolerance used to align the m/z values of the two spectra.
	:type t: float
	:param b: numeric value specifying the baseline threshold for peak identification. Expressed as a percent of the maximum intensity.
	:type b: float
	:param top_label: string to label the top spectrum.
	:type top_label: str
	:param bottom_label: string to label the bottom spectrum.
	:type bottom_label: str
	:param xlim: tuple of length 2, defining the beginning and ending values of the x-axis.
	:type xlim: tuple of int
	:param x_threshold: numeric value specifying
	:type x_threshold: float
	:param print_alignment:  whether the intensities should be printed
	:type print_alignment: bool
	:param print_graphic:
	:type print_graphic: bool
	:param output_list: whether the intensities should be returned
	:type output_list: bool
	:return:
	:rtype:
	"""

	if print_graphic:
		import matplotlib.pyplot as plt

	# format spectra and normalize intensitites
	top_tmp = pd.DataFrame(data=spec_top, columns=["mz", "intensity"])
	top_tmp["normalized"] = top_tmp.apply(normalize, args=(max(top_tmp["intensity"]),), axis=1)
	top_tmp = top_tmp[top_tmp["mz"].between(xlim[0], xlim[1])]
	top_plot = top_tmp[["mz", "normalized"]].copy()  # data frame for plotting spectrum
	top_plot.columns = ["mz", "intensity"]
	top = top_plot[top_plot["intensity"] >= b]  # data frame for similarity score calculation

	bottom_tmp = pd.DataFrame(data=spec_bottom, columns=["mz", "intensity"])
	bottom_tmp["normalized"] = bottom_tmp.apply(normalize, args=(max(bottom_tmp["intensity"]),), axis=1)
	bottom_tmp = bottom_tmp[bottom_tmp["mz"].between(xlim[0], xlim[1])]
	bottom_plot = bottom_tmp[["mz", "normalized"]].copy()  # data frame for plotting spectrum
	bottom_plot.columns = ["mz", "intensity"]
	bottom = bottom_plot[bottom_plot["intensity"] >= b]  # data frame for similarity score calculation

	# align the m/z axis of the two spectra, the bottom spectrum is used as the reference

	# Unimplemented R code
	#   for(i in 1:nrow(bottom))
	# 	top["mz"][bottom["mz"][i] >= top["mz"] - t & bottom["mz"][i] <= top["mz"] + t] = bottom["mz"][i]
	# 	top[,1][bottom[,1][i] >= top[,1] - t & bottom[,1][i] <= top[,1] + t] <- bottom[,1][i]
	#   alignment <- merge(top, bottom, by = 1, all = TRUE)
	#   if(length(unique(alignment[,1])) != length(alignment[,1])) warning("the m/z tolerance is set too high")
	# alignment[,c(2,3)][is.na(alignment[,c(2,3)])] <- 0   # convert NAs to zero (R-Help, Sept. 15, 2004, John Fox)
	# names(alignment) <- c("mz", "intensity.top", "intensity.bottom")
	#
	alignment = pd.merge(top, bottom, on="mz", how="outer")
	alignment.fillna(value=0, inplace=True)  # Convert NaN to 0
	alignment.columns = ["mz", "intensity_top", "intensity_bottom"]
	if print_alignment:
		with pd.option_context('display.max_rows', None, 'display.max_columns', None):
			print(alignment)

	# similarity score calculation

	if x_threshold < 0:
		print("Error: x_threshold argument must be zero or a positive number")
		return 1

	# Unimplemented R code
	# alignment <- alignment[alignment[,1] >= x.threshold, ]

	u = np.array(alignment.iloc[:, 1])
	v = np.array(alignment.iloc[:, 2])

	similarity_score = np.dot(u, v) / (np.sqrt(np.sum(np.square(u))) * np.sqrt(np.sum(np.square(v))))

	# Reverse Match
	reverse_alignment = pd.merge(top, bottom, on="mz", how="right")
	reverse_alignment = reverse_alignment.dropna()  # Remove rows containing NaN
	reverse_alignment.columns = ["mz", "intensity_top", "intensity_bottom"]
	u = np.array(reverse_alignment.iloc[:, 1])
	v = np.array(reverse_alignment.iloc[:, 2])

	reverse_similarity_score = np.dot(u, v) / (np.sqrt(np.sum(np.square(u))) * np.sqrt(np.sum(np.square(v))))

	# generate plot

	if print_graphic:
		fig, ax = plt.subplots()
		# fig.scatter(top_plot["mz"],top_plot["intensity"], s=0)
		ax.vlines(top_plot["mz"], 0, top_plot["intensity"], color="blue")
		ax.vlines(bottom["mz"], 0, -bottom["intensity"], color="red")
		ax.set_ylim(-125, 125)
		ax.set_xlim(xlim[0], xlim[1])
		ax.axhline(color="black", linewidth=0.5)
		ax.set_ylabel("Intensity (%)")
		ax.set_xlabel("m/z", style="italic", family="serif")

		h_centre = xlim[0] + (xlim[1] - xlim[0]) // 2

		ax.text(h_centre, 110, top_label, horizontalalignment="center", verticalalignment="center")
		ax.text(h_centre, -110, bottom_label, horizontalalignment="center", verticalalignment="center")
		plt.show()

	# Unimplemented R code
	# 	ticks <- c(-100, -50, 0, 50, 100)
	# 	plot.window(xlim = c(0, 20), ylim = c(-10, 10))
	#
	#
	#   if(output.list == TRUE) {
	#
	# 	# Grid graphics head to tail plot
	#
	# 	headTailPlot <- function() {
	#
	# 	  pushViewport(plotViewport(c(5, 5, 2, 2)))
	# 	  pushViewport(dataViewport(xscale = xlim, yscale = c(-125, 125)))
	#
	# 	  grid.rect()
	# 	  tmp <- pretty(xlim)
	# 	  xlabels <- tmp[tmp >= xlim[1] & tmp <= xlim[2]]
	# 	  grid.xaxis(at = xlabels)
	# 	  grid.yaxis(at = c(-100, -50, 0, 50, 100))
	#
	# 	  grid.segments(top_plot$mz,
	# 					top_plot$intensity,
	# 					top_plot$mz,
	# 					rep(0, length(top_plot$intensity)),
	# 					default.units = "native",
	# 					gp = gpar(lwd = 0.75, col = "blue"))
	#
	# 	  grid.segments(bottom_plot$mz,
	# 					-bottom_plot$intensity,
	# 					bottom_plot$mz,
	# 					rep(0, length(bottom_plot$intensity)),
	# 					default.units = "native",
	# 					gp = gpar(lwd = 0.75, col = "red"))
	#
	# 	  grid.abline(intercept = 0, slope = 0)
	#
	# 	  grid.text("intensity (%)", x = unit(-3.5, "lines"), rot = 90)
	# 	  grid.text("m/z", y = unit(-3.5, "lines"))
	#
	# 	  popViewport(1)
	# 	  pushViewport(dataViewport(xscale = c(0, 20), yscale = c(-10, 10)))
	# 	  grid.text(top.label, unit(10, "native"), unit(9, "native"))
	# 	  grid.text(bottom.label, unit(10, "native"), unit(-9, "native"))
	#
	# 	  popViewport(2)
	#
	# 	}
	#
	# 	p <- grid.grabExpr(headTailPlot())
	#
	#   }
	#
	#
	#

	#	with pd.option_context('display.max_rows', None, 'display.max_columns', None):
	#		print(similarity_score)

	if output_list:
		return similarity_score, reverse_similarity_score, alignment
	# Unimplemented R code
	#
	# return(list(similarity.score = similarity_score,
	# 	alignment = alignment,
	# 	plot = p))
	#
	else:
		return similarity_score, reverse_similarity_score


# simscore <- as.vector((u %*% v)^2 / (sum(u^2) * sum(v^2)))   # cos squared


def normalize(row, max_val):
	# http://jonathansoma.com/lede/foundations/classes/pandas%20columns%20and%20functions/apply-a-function-to-every-row-in-a-pandas-dataframe/
	return (row["intensity"] / float(max_val)) * 100.0


def create_array(intensities, mz):
	return np.column_stack((mz, intensities))