#!/usr/bin/env python3
#
# names.py
"""
Functions for working with IUPAC names for chemicals.
"""
#
# Copyright (c) 2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
# stdlib
import re
from typing import Any, Dict, List, Pattern, Sequence, Tuple
# 3rd party
from pandas import DataFrame # type: ignore[import]
# this package
from chemistry_tools import cached_requests
from chemistry_tools.constants import prefixes
from chemistry_tools.pubchem.errors import HTTP_ERROR_CODES
# Public names exported by a star-import of this module.
__all__ = [
"get_IUPAC_parts",
"sort_IUPAC_names",
"get_IUPAC_sort_order",
"get_sorted_parts",
"sort_array_by_name",
"sort_dataframe_by_name",
"iupac_name_from_cas",
"cas_from_iupac_name",
"multiplier_regex",
"re_strings",
]
#: Regular expression to match "multiple" prefixes such as **mono-**.
#:
#: The pattern is one repeatable capture group per entry of ``prefixes.values()``,
#: concatenated in iteration order (i.e. ``(a)*(b)*...``).
#: Because every group is optional, the pattern can also match the empty string.
multiplier_regex = re.compile('*'.join(map("({})".format, prefixes.values())) + '*')
#: Regular expressions whose matches mark where an IUPAC name is cut into parts.
re_strings: List[Pattern] = [
    # Comma-separated locants followed by a hyphen, e.g. ``2,4-``.
    re.compile(r"((\d+),?)+(\d+)-"),
    multiplier_regex,
    # Literal name fragments.
    *(
        re.compile(fragment)
        for fragment in (
            r"nitro",
            r"phenyl",
            r"aniline",
            r"anisole",
            r"benzene",
            r"centralite",
            r"formamide",
            r"glycerine",
            r"nitrate",
            r"glycol",
            r"phthalate",
            r"picrate",
            r"toluene",
            r"methyl",
            # The lookbehind stops this matching the "ethyl" inside "methyl".
            r"(?<!m)ethyl",
            r"propyl",
            r"butyl",
            r" ",
            r"\(",
            r"\)",
            r"hydroxyl",
            r"amin[oe]",
            r"amide",
        )
    ),
]
# Rewrite rules as (compiled pattern, replacement template) pairs, listed in the
# order they are meant to be applied; later rules may act on the output of earlier ones.
_iupac_subs: List[Tuple[Pattern, str]] = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # e.g. bis(2-nitrophenyl)amine -> 2,2'-dinitrodiphenylamine
        (r"^(bis)(\()(\d)(-)(.*)(phenyl)(\))", r"\3,\3'-Di\5di\6"),
        # e.g. 2-nitro-n-(4-nitrophenyl)aniline -> 2,4'-dinitro-n-phenylaniline
        # (which the following rule then turns into 2,4'-dinitrodiphenylamine)
        (r"^(\d)(-nitro-n-\()(\d)(-nitro)(.*)(\))", r"\1,\3'-Dinitro-N-\5"),
        (r"-?[Nn]-phenylaniline", "diphenylamine"),
        (r"carbanilide", "-1,3-diphenylurea"),
        (r"(glycerol)(-)(\d)(-nitrate)", r"\3-mononitroglycerin"),
        (r"n,n'-", "1,3-"),
        (r"dipicryl", "hexanitrodiphenyl"),
        # "picryl" at the very end of the name is treated differently from elsewhere.
        (r"picryl$", "-1,3,5-trinitrobenzene"),
        (r"picryl", "-1,3,5-trinitrophenyl"),
    )
]

# A single digit enclosed by hyphens, e.g. the "-2-" in "a-2-b".
_hyphen_digit_hyphen = re.compile(r"(-)(\d)(-)")
def get_IUPAC_parts(string: str) -> List[str]:
    """
    Splits an IUPAC name for a compound into its constituent parts.

    The name is lower-cased and rewritten with the substitutions in ``_iupac_subs``.
    It is then cut at the start and end of every non-empty match of the patterns
    in ``re_strings``, and after the first hyphen of each ``-<digit>-`` sequence.

    :param string: The IUPAC name to split.

    :returns: A list of constituent parts, with empty and single-space parts removed.
        The list is empty if nothing remains (e.g. for an empty string).
    """

    string = string.lower()

    for regex, sub in _iupac_subs:
        string = regex.sub(sub.lower(), string)

    split_points = set()

    for regex in re_strings:
        for match in regex.finditer(string):
            start, end = match.span()
            # Zero-width matches mark no boundary and are ignored.
            if start != end:
                split_points.update((start, end))

    # For "-<digit>-" cut just after the first hyphen.
    for match in _hyphen_digit_hyphen.finditer(string):
        split_points.add(match.start() + 1)

    # A cut at position 0 would only produce an empty leading part.
    split_points.discard(0)

    boundaries = [0, *sorted(split_points), len(string)]
    elements = [string[begin:end] for begin, end in zip(boundaries, boundaries[1:])]

    # Fixups: sequences of parts that were split too eagerly and must be re-joined.
    fixups = [
        ["guani", "di", "ne"],
    ]

    for fixup in fixups:
        length = len(fixup)
        i = 0
        while i < len(elements):
            if elements[i:i + length] == fixup:
                elements[i:i + length] = [''.join(fixup)]
            i += 1

    # Remove null elements. No trailing clean-up is needed afterwards, and none
    # is attempted, so an input that yields no parts returns ``[]`` instead of
    # raising IndexError.
    null_elements = {' ', ''}
    return [x for x in elements if x not in null_elements]
#
# from string import ascii_letters
# alphabet = ascii_letters + "0123456789" + ",'" + "- " + '!"#$%&()*+./:;<=>?@[\\]^_`{|}~'
def sort_IUPAC_names(iupac_names: Sequence[str]) -> List[str]:
    """
    Return the given IUPAC names as a new, sorted list.

    The ordering is the one computed by :func:`get_IUPAC_sort_order`.

    :param iupac_names: The IUPAC names to sort.

    :return: A new list containing the names in sorted order.
    """

    ranking = get_IUPAC_sort_order(iupac_names)

    ordered = list(iupac_names)
    ordered.sort(key=ranking.__getitem__)
    return ordered
def get_IUPAC_sort_order(iupac_names: Sequence[str]) -> Dict[str, int]:
    """
    Returns the order the given IUPAC names should be sorted in.

    Useful when sorting arrays containing data in addition to the name.

    e.g.

    .. code-block:: python

        >>> sort_order = get_IUPAC_sort_order([row[0] for row in data])
        >>> sorted_data = sorted(data, key=lambda row: sort_order[row[0]])

    where ``row[0]`` would be the name of the compound.

    :param iupac_names: The list of IUPAC names to sort.

    :return: Dictionary mapping the IUPAC names to the order in which they should be sorted.
        Every distinct name receives an entry. A name that occurs more than once
        maps to the position of its last occurrence in the sorted sequence.
    """

    # Materialise the input so the lookups below are positional whatever
    # container was passed (indexing e.g. a pandas Series with an integer is a
    # label lookup).
    names = list(iupac_names)
    split_names, _ = _get_split_and_sorted_lists(names)

    # Sort the *positions* by their split name instead of locating each sorted
    # entry with ``list.index``. That lookup is quadratic and always resolves to
    # the first of several names with identical parts, leaving the others
    # without an entry.
    ranked_positions = sorted(range(len(names)), key=split_names.__getitem__)

    return {names[position]: rank for rank, position in enumerate(ranked_positions)}
def get_sorted_parts(iupac_names: Sequence[str]) -> List[List[str]]:
    """
    Returns the constituent parts of the IUPAC names sorted into order.

    The parts returned are in reverse order
    (i.e. ``'diphenylamine'`` becomes ``['amine', 'phenyl', 'di']``).
    A name whose first part is one of ``prefixes.values()`` additionally
    ends with a single-space placeholder element.

    :param iupac_names: The IUPAC names to split and sort.

    :return: One list of (reversed) parts per name, in sorted order.
    """

    # The sorted list already holds exactly the lists required. Looking each one
    # up again in the unsorted list was a quadratic no-op that also returned the
    # same list object repeatedly for names with identical parts.
    _, sorted_names = _get_split_and_sorted_lists(iupac_names)
    return sorted_names
def _get_split_and_sorted_lists(iupac_names: Sequence[str]) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Split each name into parts, reverse the parts, and sort the results.

    :param iupac_names: The IUPAC names to process.

    :return: A tuple of the reversed part lists in input order,
        and the same part lists in sorted order.
    """

    known_prefixes = prefixes.values()
    reversed_parts = []

    for name in iupac_names:
        parts = get_IUPAC_parts(name.lower())
        leading = parts[0].lower()

        if leading in known_prefixes:
            # No positional information at the beginning of the name:
            # put a placeholder in front of the prefix.
            parts.insert(0, ' ')

        parts.reverse()
        reversed_parts.append(parts)

    return reversed_parts, sorted(reversed_parts)
def sort_array_by_name(array: List[List[Any]], name_col: int = 0, reverse: bool = False) -> List[List[Any]]:
    """
    Return the rows of a list of lists ordered by the IUPAC name held in each row.

    :param array: The rows to sort. Each row is indexed with ``name_col`` to obtain its name.
    :param name_col: The index of the column containing the IUPAC names.
    :param reverse: Whether the names should be sorted in reverse order. Default is :py:obj:`False`, which sorts from A-Z.
    :no-default reverse:

    :return: A new list containing the rows in sorted order.
    """

    ranking = get_IUPAC_sort_order([entry[name_col] for entry in array])

    def rank_of(row: List[Any]) -> int:
        return ranking[row[name_col]]

    return sorted(array, key=rank_of, reverse=reverse)
def sort_dataframe_by_name(df: DataFrame, name_col: str, reverse: bool = False) -> DataFrame:
    """
    Sorts a :class:`pandas.DataFrame` by the IUPAC name in each row.

    The rows keep their original index labels. The index does not need to be
    the default ``0..n-1`` range, nor unique.

    :param df: The DataFrame to sort.
    :param name_col: The name of the column containing the IUPAC names.
    :param reverse: Whether the names should be sorted in reverse order. Default is :py:obj:`False`, which sorts from A-Z.
    :no-default reverse:

    :return: The sorted :class:`~pandas.DataFrame`.
    """

    # Hand over a plain list: the sort-order helper looks names up by integer
    # position, which on a Series would be a lookup by index *label*.
    sort_order = get_IUPAC_sort_order(df[name_col].tolist())

    # Rank every row, then order the row *positions* by rank. Working with
    # positions (``iloc``) rather than labels (``loc``) avoids repeating rows
    # when index labels are duplicated. A stable sort algorithm is requested so
    # rows with equal rank come out in a deterministic order.
    ranks = df[name_col].map(sort_order).reset_index(drop=True)
    positions = ranks.sort_values(ascending=(not reverse), kind="mergesort").index.tolist()

    return df.iloc[positions]
def iupac_name_from_cas(cas_number: str) -> str:
    """
    Look up the IUPAC name for a CAS registry number.

    The lookup is an HTTP GET, made through ``cached_requests``, against the
    ``cactus.nci.nih.gov`` chemical structure service.

    :param cas_number: The CAS registry number to search for.

    :return: The body of the response, as text.

    :raises ValueError: If the response status code is in ``HTTP_ERROR_CODES``.
    """

    url = f"https://cactus.nci.nih.gov/chemical/structure/{cas_number}/iupac_name"
    response = cached_requests.get(url)

    if response.status_code not in HTTP_ERROR_CODES:
        return response.text

    raise ValueError(f"No compound found for CAS registry number {cas_number}.")
def cas_from_iupac_name(iupac_name: str) -> str:
    """
    Returns the corresponding CAS registry number for the given IUPAC name.

    The lookup is an HTTP GET, made through ``cached_requests``, against the
    ``cactus.nci.nih.gov`` chemical structure service.

    :param iupac_name: The IUPAC name to search.

    :return: The CAS registry number. Only the first line of the response is returned.

    :raises ValueError: If the response status code is in ``HTTP_ERROR_CODES``.
    """

    from urllib.parse import quote

    # Percent-encode the name before placing it in the URL path, so characters
    # such as ``#``, ``?`` or ``%`` are sent as part of the identifier instead
    # of being parsed as URL syntax. ``/`` is left as-is (the default of ``quote``).
    identifier = quote(iupac_name)

    r = cached_requests.get(f"https://cactus.nci.nih.gov/chemical/structure/{identifier}/cas")

    if r.status_code in HTTP_ERROR_CODES:
        raise ValueError(f"No compound found for name {iupac_name}.")

    return r.text.split('\n')[0]