Source code for chemistry_tools.names

#!/usr/bin/env python3
#
#  names.py
"""
Functions for working with IUPAC names for chemicals.
"""
#
#  Copyright (c) 2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#

# stdlib
import re
from typing import Any, Dict, List, Pattern, Sequence, Tuple

# 3rd party
from pandas import DataFrame  # type: ignore[import]

# this package
from chemistry_tools import cached_requests
from chemistry_tools.constants import prefixes
from chemistry_tools.pubchem.errors import HTTP_ERROR_CODES

__all__ = [
		"get_IUPAC_parts",
		"sort_IUPAC_names",
		"get_IUPAC_sort_order",
		"get_sorted_parts",
		"sort_array_by_name",
		"sort_dataframe_by_name",
		"iupac_name_from_cas",
		"cas_from_iupac_name",
		"multiplier_regex",
		"re_strings",
		]

#: Regular expression to match "multiple" prefixes such as **mono-**.
#: Built as ``(p1)*(p2)*...(pn)*`` from the values of ``prefixes``, so every group is optional
#: and the pattern can also produce empty matches.
multiplier_regex = re.compile('*'.join(f"({prefix})" for prefix in prefixes.values()) + '*')

#: List of regular expressions to decompose an IUPAC name.
re_strings: List[Pattern] = [
		# Runs of locants followed by a hyphen, such as ``2,4,6-``
		re.compile(r"((\d+),?)+(\d+)-"),
		multiplier_regex,
		# Literal name fragments; the name is split at the boundaries of every match.
		*map(
				re.compile,
				(
						r"nitro",
						r"phenyl",
						r"aniline",
						r"anisole",
						r"benzene",
						r"centralite",
						r"formamide",
						r"glycerine",
						r"nitrate",
						r"glycol",
						r"phthalate",
						r"picrate",
						r"toluene",
						r"methyl",
						r"(?<!m)ethyl",  # "ethyl", but not the tail of "methyl"
						r"propyl",
						r"butyl",
						r" ",
						r"\(",
						r"\)",
						r"hydroxyl",
						r"amin[oe]",
						r"amide",
						),
				),
		]

# Substitutions that rewrite alternative spellings of a name into the form the splitter expects.
# The entries are applied in the order listed, so ``dipicryl`` and ``picryl$`` must stay
# ahead of the bare ``picryl`` rule.
_iupac_subs: List[Tuple[Pattern, str]] = [
		(re.compile(pattern), replacement)
		for pattern, replacement in (
				# (regex source, replacement)

				# e.g. bis(2-nitrophenyl)amine -> 2,2'-Dinitrodiphenylamine
				(r"^(bis)(\()(\d)(-)(.*)(phenyl)(\))", r"\3,\3'-Di\5di\6"),
				# e.g. 2-nitro-n-(4-nitrophenyl)aniline -> 2,4'-Dinitro-N-phenylaniline
				# (the "-N-phenylaniline" tail is then rewritten by the next rule)
				(r"^(\d)(-nitro-n-\()(\d)(-nitro)(.*)(\))", r"\1,\3'-Dinitro-N-\5"),
				(r"-?[Nn]-phenylaniline", "diphenylamine"),
				(r"carbanilide", "-1,3-diphenylurea"),
				(r"(glycerol)(-)(\d)(-nitrate)", r"\3-mononitroglycerin"),
				(r"n,n'-", "1,3-"),
				(r"dipicryl", "hexanitrodiphenyl"),
				(r"picryl$", "-1,3,5-trinitrobenzene"),
				(r"picryl", "-1,3,5-trinitrophenyl"),
				)
		]

# Matches a single digit enclosed by hyphens (e.g. ``-2-``).
# get_IUPAC_parts adds a split point just after the first hyphen of each match.
_hyphen_digit_hyphen = re.compile(r"(-)(\d)(-)")


def get_IUPAC_parts(string: str) -> List[str]:
	"""
	Splits an IUPAC name for a compound into its constituent parts.

	The name is lower-cased and rewritten with the substitutions in ``_iupac_subs``.
	It is then cut at both ends of every non-empty match of the patterns in ``re_strings``,
	and just after the first hyphen of every ``-<digit>-`` sequence.
	Pieces that are empty or a single space are dropped.

	:param string: The IUPAC name to split.

	:returns: A list of constituent parts, in the order they occur in the rewritten name.
		The list is empty if no piece survives (e.g. for an empty string).
	"""

	string = string.lower()

	for regex, sub in _iupac_subs:
		string = regex.sub(sub.lower(), string)

	split_points = set()

	for regex in re_strings:
		for match in regex.finditer(string):
			start, end = match.span()
			# Patterns made only of optional groups yield empty matches; those carry no boundary.
			if start != end:
				split_points.update((start, end))

	for match in _hyphen_digit_hyphen.finditer(string):
		start, end = match.span()
		if start != end:
			# Cut after the leading hyphen so it stays attached to the preceding part.
			split_points.add(start + 1)

	# A cut at position 0 would only produce an empty leading piece.
	split_points.discard(0)

	boundaries = [0, *sorted(split_points), len(string)]
	elements = [string[begin:end] for begin, end in zip(boundaries, boundaries[1:])]

	# Fixups: re-join runs of pieces that should not have been separated
	# (presumably "guanidine" is cut around its "di" by the multiplier pattern).
	fixups = [
			["guani", "di", "ne"],
			]

	for fixup in fixups:
		length = len(fixup)
		joined = ''.join(fixup)

		# The list may shrink while scanning; slices past the end simply never compare equal.
		for i in range(len(elements)):
			if elements[i:i + length] == fixup:
				elements[i:i + length] = [joined]

	# Remove null elements. Nothing falsy can remain afterwards, so no trailing clean-up is
	# needed, and an input with no usable parts yields an empty list instead of an IndexError.
	null_elements = {' ', ''}
	return [part for part in elements if part not in null_elements]
# # from string import ascii_letters # alphabet = ascii_letters + "0123456789" + ",'" + "- " + '!"#$%&()*+./:;<=>?@[\\]^_`{|}~'
def sort_IUPAC_names(iupac_names: Sequence[str]) -> List[str]:
	"""
	Sort a list of IUPAC names into order.

	The rank of each name is taken from :func:`get_IUPAC_sort_order`.

	:param iupac_names: The list of IUPAC names to sort

	:return: The list of sorted IUPAC names.
	"""

	ranks = get_IUPAC_sort_order(iupac_names)
	return sorted(iupac_names, key=ranks.__getitem__)
def get_IUPAC_sort_order(iupac_names: Sequence[str]) -> Dict[str, int]:
	"""
	Returns the order the given IUPAC names should be sorted in.

	Useful when sorting arrays containing data in addition to the name.
	e.g.

	.. code-block:: python

		>>> sort_order = get_IUPAC_sort_order([row[0] for row in data])
		>>> sorted_data = sorted(data, key=lambda row: sort_order[row[0]])

	where row[0] would be the name of the compound

	Every distinct name receives an entry, including names that differ
	only in a way the splitter ignores (such as letter case).
	If the exact same string occurs more than once, it maps to the rank of its last occurrence.

	:param iupac_names: The list of IUPAC names to sort.

	:return: Dictionary mapping the IUPAC names to the order in which they should be sorted.
	"""

	# Materialise the input so names are addressed by position, whatever container was passed.
	names = list(iupac_names)
	split_names, _ = _get_split_and_sorted_lists(names)

	# Rank positions instead of searching for each sorted entry with ``list.index``:
	# ``index`` always finds the first equal entry, which left other names that split
	# identically without a rank.
	# ``sorted`` is stable, so equal entries keep their input order.
	ranked_positions = sorted(range(len(names)), key=split_names.__getitem__)

	return {names[position]: rank for rank, position in enumerate(ranked_positions)}
def get_sorted_parts(iupac_names: Sequence[str]) -> List[List[str]]:
	"""
	Returns the constituent parts of the IUPAC names sorted into order.

	The parts returned are in reverse order, last part of the name first
	(i.e. the list for ``'diphenylamine'`` starts with ``'amine'``).
	For a name whose first part is one of the values of ``prefixes``, a ``' '`` placeholder
	is added in front before reversing, so it ends up as the last element.

	:param iupac_names: The IUPAC names to split and sort.

	:return: One list of reversed parts per name, in sorted order.
	"""

	# The helper already returns the sorted lists; looking each one up again in the unsorted
	# list would just return an equal element at quadratic cost.
	_, sorted_names = _get_split_and_sorted_lists(iupac_names)
	return sorted_names
def _get_split_and_sorted_lists(iupac_names: Sequence[str]) -> Tuple[List[List[str]], List[List[str]]]:
	"""
	Splits each IUPAC name into parts and returns the reversed parts, unsorted and sorted.

	:param iupac_names: The IUPAC names to split.

	:return: A tuple of two lists of reversed parts lists.
		The first is in the order of ``iupac_names``; the second is the same lists sorted.
	"""

	reversed_parts = []

	for iupac_name in iupac_names:
		parts = get_IUPAC_parts(iupac_name.lower())

		if parts[0].lower() in prefixes.values():
			# no positional information at beginning
			parts = [' '] + parts

		reversed_parts.append(list(reversed(parts)))

	return reversed_parts, sorted(reversed_parts)
def sort_array_by_name(array: List[List[Any]], name_col: int = 0, reverse: bool = False) -> List[List[Any]]:
	"""
	Sort a list of lists by the IUPAC name in each row.

	:param array: The rows to sort; each row holds an IUPAC name at index ``name_col``.
	:param name_col: The index of the column containing the IUPAC names
	:param reverse: Whether the names should be sorted in reverse order.
		Default is :py:obj:`False`, which sorts from A-Z.
	:no-default reverse:

	:return: The sorted array
	"""

	ranks = get_IUPAC_sort_order([row[name_col] for row in array])

	def rank_of(row: List[Any]) -> int:
		# Sort key: the rank assigned to this row's name.
		return ranks[row[name_col]]

	return sorted(array, key=rank_of, reverse=reverse)
def sort_dataframe_by_name(df: DataFrame, name_col: str, reverse: bool = False) -> DataFrame:
	"""
	Sorts a :class:`pandas.DataFrame` by the IUPAC name in each row.

	Rows are selected by position, so the result is correct for any index,
	including non-default or non-unique index labels. The original index labels are kept.

	:param df: The DataFrame to sort.
	:param name_col: The name of the column containing the IUPAC names
	:param reverse: Whether the names should be sorted in reverse order.
		Default is :py:obj:`False`, which sorts from A-Z
	:no-default reverse:

	:return: The sorted :class:`~pandas.DataFrame`
	"""

	# Pass a plain list so the names are looked up by position,
	# independent of the DataFrame's index labels.
	sort_order = get_IUPAC_sort_order(df[name_col].tolist())

	# Replace the index with 0..n-1 so that, after sorting the ranks, the index *is* the
	# row position. Selecting by label instead would repeat rows whose label is not unique.
	ranks = df[name_col].map(sort_order).reset_index(drop=True)
	positions = ranks.sort_values(ascending=(not reverse)).index.tolist()

	return df.iloc[positions]
def iupac_name_from_cas(cas_number: str) -> str:
	"""
	Returns the corresponding IUPAC name for the given CAS registry number.

	The name is requested from the ``cactus.nci.nih.gov`` chemical structure service
	via ``cached_requests``.

	:param cas_number: The cas number to search

	:return: The IUPAC name

	:raises ValueError: If the status code of the response is one of ``HTTP_ERROR_CODES``.
	"""

	url = f"https://cactus.nci.nih.gov/chemical/structure/{cas_number}/iupac_name"
	response = cached_requests.get(url)

	if response.status_code not in HTTP_ERROR_CODES:
		return response.text

	raise ValueError(f"No compound found for CAS registry number {cas_number}.")
def cas_from_iupac_name(iupac_name: str) -> str:
	"""
	Returns the corresponding CAS registry number for the given IUPAC name.

	The number is requested from the ``cactus.nci.nih.gov`` chemical structure service
	via ``cached_requests``. Only the first line of the response body is returned.

	:param iupac_name: The IUPAC name to search.

	:return: The CAS registry number.

	:raises ValueError: If the status code of the response is one of ``HTTP_ERROR_CODES``.
	"""

	url = f"https://cactus.nci.nih.gov/chemical/structure/{iupac_name}/cas"
	response = cached_requests.get(url)

	if response.status_code not in HTTP_ERROR_CODES:
		first_line, _, _ = response.text.partition('\n')
		return first_line

	raise ValueError(f"No compound found for name {iupac_name}.")