Source code for chemistry_tools.formulae.utils

#!/usr/bin/env python3
#
#  utils.py
"""
General utility functions.
"""
#
#  Copyright (c) 2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#  Based on ChemPy (https://github.com/bjodah/chempy)
#  |  Copyright (c) 2015-2018, Björn Dahlgren
#  |  All rights reserved.
#  |
#  |  Redistribution and use in source and binary forms, with or without modification,
#  |  are permitted provided that the following conditions are met:
#  |
#  |    Redistributions of source code must retain the above copyright notice, this
#  |    list of conditions and the following disclaimer.
#  |
#  |    Redistributions in binary form must reproduce the above copyright notice, this
#  |    list of conditions and the following disclaimer in the documentation and/or
#  |    other materials provided with the distribution.
#  |
#  |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#  |  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#  |  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#  |  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
#  |  ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#  |  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  |  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
#  |  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  |  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#  |  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#  Also based on Pyteomics (https://github.com/levitsky/pyteomics)
#  |  Copyright (c) 2011-2015, Anton Goloborodko & Lev Levitsky
#  |  Licensed under the Apache License, Version 2.0 (the "License");
#  |  you may not use this file except in compliance with the License.
#  |  You may obtain a copy of the License at
#  |
#  |    http://www.apache.org/licenses/LICENSE-2.0
#  |
#  |  Unless required by applicable law or agreed to in writing, software
#  |  distributed under the License is distributed on an "AS IS" BASIS,
#  |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  |  See the License for the specific language governing permissions and
#  |  limitations under the License.
#  |
#  |  See also:
#  |  Goloborodko, A.A.; Levitsky, L.I.; Ivanov, M.V.; and Gorshkov, M.V. (2013)
#  |  "Pyteomics - a Python Framework for Exploratory Data Analysis and Rapid Software
#  |  Prototyping in Proteomics", Journal of The American Society for Mass Spectrometry,
#  |  24(2), 301–304. DOI: `10.1007/s13361-012-0516-6 <http://dx.doi.org/10.1007/s13361-012-0516-6>`_
#  |
#  |  Levitsky, L.I.; Klein, J.; Ivanov, M.V.; and Gorshkov, M.V. (2018)
#  |  "Pyteomics 4.0: five years of development of a Python proteomics framework",
#  |  Journal of Proteome Research.
#  |  DOI: `10.1021/acs.jproteome.8b00717 <http://dx.doi.org/10.1021/acs.jproteome.8b00717>`_
#
#  Also based on molmass (https://github.com/cgohlke/molmass)
#  |  Copyright (c) 1990-2020, Christoph Gohlke
#  |  All rights reserved.
#  |  Licensed under the BSD 3-Clause License
#  |  Redistribution and use in source and binary forms, with or without
#  |  modification, are permitted provided that the following conditions are met:
#  |
#  |  1. Redistributions of source code must retain the above copyright notice,
#  |     this list of conditions and the following disclaimer.
#  |
#  |  2. Redistributions in binary form must reproduce the above copyright notice,
#  |     this list of conditions and the following disclaimer in the documentation
#  |     and/or other materials provided with the distribution.
#  |
#  |  3. Neither the name of the copyright holder nor the names of its
#  |     contributors may be used to endorse or promote products derived from
#  |     this software without specific prior written permission.
#  |
#  |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#  |  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  |  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
#  |  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
#  |  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#  |  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#  |  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
#  |  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
#  |  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
#  |  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  |  POSSIBILITY OF SUCH DAMAGE.
#  |

# stdlib
import re
from functools import lru_cache
from typing import Dict, Iterator, List, Sequence, Tuple

# this package
from chemistry_tools.elements import ELEMENTS

__all__ = [
		"GROUPS",
		"split_isotope",
		"hill_order",
		]

#: Common chemical groups
GROUPS: Dict[str, str] = {
		"Abu": "C4H7NO",
		"Acet": "C2H3O",
		"Acm": "C3H6NO",
		"Adao": "C10H15O",
		"Aib": "C4H7NO",
		"Ala": "C3H5NO",
		"Arg": "C6H12N4O",
		"Argp": "C6H11N4O",
		"Asn": "C4H6N2O2",
		"Asnp": "C4H5N2O2",
		"Asp": "C4H5NO3",
		"Aspp": "C4H4NO3",
		"Asu": "C8H13NO3",
		"Asup": "C8H12NO3",
		"Boc": "C5H9O2",
		"Bom": "C8H9O",
		"Bpy": "C10H8N2",  # Bipyridine
		"Brz": "C8H6BrO2",
		"Bu": "C4H9",
		"Bum": "C5H11O",
		"Bz": "C7H5O",
		"Bzl": "C7H7",
		"Bzlo": "C7H7O",
		"Cha": "C9H15NO",
		"Chxo": "C6H11O",
		"Cit": "C6H11N3O2",
		"Citp": "C6H10N3O2",
		"Clz": "C8H6ClO2",
		"Cp": "C5H5",
		"Cy": "C6H11",
		"Cys": "C3H5NOS",
		"Cysp": "C3H4NOS",
		"Dde": "C10H13O2",
		"Dnp": "C6H3N2O4",
		"Et": "C2H5",
		"Fmoc": "C15H11O2",
		"For": "CHO",
		"Gln": "C5H8N2O2",
		"Glnp": "C5H7N2O2",
		"Glp": "C5H5NO2",
		"Glu": "C5H7NO3",
		"Glup": "C5H6NO3",
		"Gly": "C2H3NO",
		"Hci": "C7H13N3O2",
		"Hcip": "C7H12N3O2",
		"His": "C6H7N3O",
		"Hisp": "C6H6N3O",
		"Hser": "C4H7NO2",
		"Hserp": "C4H6NO2",
		"Hx": "C6H11",
		"Hyp": "C5H7NO2",
		"Hypp": "C5H6NO2",
		"Ile": "C6H11NO",
		"Ivdde": "C14H21O2",
		"Leu": "C6H11NO",
		"Lys": "C6H12N2O",
		"Lysp": "C6H11N2O",
		"Mbh": "C15H15O2",
		"Me": "CH3",
		"Mebzl": "C8H9",
		"Meobzl": "C8H9O",
		"Met": "C5H9NOS",
		"Mmt": "C20H17O",
		"Mtc": "C14H19O3S",
		"Mtr": "C10H13O3S",
		"Mts": "C9H11O2S",
		"Mtt": "C20H17",
		"Nle": "C6H11NO",
		"Npys": "C5H3N2O2S",
		"Nva": "C5H9NO",
		"Odmab": "C20H26NO3",
		"Orn": "C5H10N2O",
		"Ornp": "C5H9N2O",
		"Pbf": "C13H17O3S",
		"Pen": "C5H9NOS",
		"Penp": "C5H8NOS",
		"Ph": "C6H5",
		"Phe": "C9H9NO",
		"Phepcl": "C9H8ClNO",
		"Phg": "C8H7NO",
		"Pmc": "C14H19O3S",
		"Ppa": "C8H7O2",
		"Pro": "C5H7NO",
		"Prop": "C3H7",
		"Py": "C5H5N",
		"Pyr": "C5H5NO2",
		"Sar": "C3H5NO",
		"Ser": "C3H5NO2",
		"Serp": "C3H4NO2",
		"Sta": "C8H15NO2",
		"Stap": "C8H14NO2",
		"Tacm": "C6H12NO",
		"Tbdms": "C6H15Si",
		"Tbu": "C4H9",
		"Tbuo": "C4H9O",
		"Tbuthio": "C4H9S",
		"Tfa": "C2F3O",
		"Thi": "C7H7NOS",
		"Thr": "C4H7NO2",
		"Thrp": "C4H6NO2",
		"Tips": "C9H21Si",
		"Tms": "C3H9Si",
		"Tos": "C7H7O2S",
		"Trp": "C11H10N2O",
		"Trpp": "C11H9N2O",
		"Trt": "C19H15",
		"Tyr": "C9H9NO2",
		"Tyrp": "C9H8NO2",
		"Val": "C5H9NO",
		"Valoh": "C5H9NO2",
		"Valohp": "C5H8NO2",
		"Xan": "C13H9O",
		}

_isotope_regex_1 = re.compile(r"^([A-z]+)(\[\d*])$")
_isotope_regex_2 = re.compile(r"^(\[[A-z]+)(\d*])$")
_isotope_regex_3 = re.compile(r"^(\[\d*)([A-z]+])$")
_iso_bracket_regex = re.compile(r"^(\[)(\d+)(])$")
_hill_carbon_re = re.compile(r"(C(?:\[[0-9]+])?|\[[0-9]+C])")
_hill_hydrogen_re = re.compile(r"(H(?:\[[0-9]+])?|\[[0-9]+H])")


[docs]@lru_cache() def split_isotope(string: str) -> Tuple[str, int]: """ Returns the symbol and mass number for the isotope represented by ``string``. Valid isotopes include ``'[C12]'``, ``'C[12]'`` and ``'[12C]'``. :param string: :return: Tuple representing the element and the isotope number. """ isotope = '0' iso_re_1 = _isotope_regex_1.findall(string) iso_re_2 = _isotope_regex_2.findall(string) iso_re_3 = _isotope_regex_3.findall(string) if iso_re_1: elem, isotope = iso_re_1[0] isotope = _iso_bracket_regex.findall(isotope)[0][1] elif iso_re_2: elem, isotope = iso_re_2[0] elem = elem.lstrip('[') isotope = isotope.rstrip(']') elif iso_re_3: isotope, elem = iso_re_3[0] isotope = isotope.lstrip('[') elem = elem.rstrip(']') else: elem = string if elem not in ELEMENTS: raise ValueError(f"Unknown chemical element with symbol {elem}") return ELEMENTS[elem].symbol, int(isotope)
[docs]def hill_order(symbols: Sequence[str]) -> Iterator[str]: """ Returns an iterator over the given element symbols in order of Hill notation. :bold-title:`Example:` .. code-block:: python >>> for i in hill_order("H", "C[12]", "O"): print(i, end='') CHO """ symbols_list: List[str] = list(set(symbols)) carbon_isotopes = list(filter(_hill_carbon_re.findall, symbols_list)) # print(carbon_isotopes) if carbon_isotopes: for isotope in sorted(carbon_isotopes): symbols_list.remove(isotope) yield isotope hydrogen_isotopes = list(filter(_hill_hydrogen_re.findall, symbols_list)) for isotope in sorted(hydrogen_isotopes): symbols_list.remove(isotope) yield isotope yield from sorted(symbols_list)