# Source code for chemistry_tools.pubchem.compound

#!/usr/bin/env python
#
#  compound.py
"""
Represents a chemical compound.
"""
#
#  Copyright (c) 2019-2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as
#  published by the Free Software Foundation; either version 3 of the
#  License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#  Based on PubChemPy https://github.com/mcs07/PubChemPy/blob/master/LICENSE
#  |  Copyright 2014 Matt Swain <m.swain@me.com>
#  |  Licensed under the MIT License
#  |
#  |  Permission is hereby granted, free of charge, to any person obtaining a copy
#  |  of this software and associated documentation files (the "Software"), to deal
#  |  in the Software without restriction, including without limitation the rights
#  |  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  |  copies of the Software, and to permit persons to whom the Software is
#  |  furnished to do so, subject to the following conditions:
#
#  |  The above copyright notice and this permission notice shall be included in
#  |  all copies or substantial portions of the Software.
#
#  |  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  |  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  |  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  |  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  |  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  |  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#  |  THE SOFTWARE.
#

# stdlib
from typing import Any, Dict, FrozenSet, List, Optional, Sequence, Type, TypeVar, Union

# 3rd party
from domdf_python_tools.bases import Dictable
from domdf_python_tools.doctools import prettify_docstrings
from pandas import DataFrame, Series  # type: ignore[import]

# this package
from chemistry_tools._memoized_property import memoized_property
from chemistry_tools.formulae import Formula
from chemistry_tools.pubchem.atom import Atom, parse_atoms
from chemistry_tools.pubchem.bond import Bond, parse_bonds
from chemistry_tools.pubchem.enums import CoordinateType
from chemistry_tools.pubchem.full_record import parse_full_record, rest_get_full_record
from chemistry_tools.pubchem.properties import (
		force_valid_properties,
		parse_properties,
		rest_get_properties_json,
		valid_properties
		)
from chemistry_tools.pubchem.synonyms import get_synonyms

# Names exported by ``from chemistry_tools.pubchem.compound import *``.
__all__ = ["Compound", "compounds_to_frame", 'C']

# Type variable bound to Compound; used to annotate ``cls`` in ``Compound.from_cid``.
C = TypeVar('C', bound="Compound")


@prettify_docstrings
class Compound(Dictable):
	"""
	Represents a single record from the PubChem Compound database.

	The PubChem Compound database is constructed from the Substance database
	using a standardization and deduplication process.
	Each Compound is uniquely identified by a CID.

	Properties, synonyms and the full record are retrieved lazily, the first time they are
	requested, and cached on the instance afterwards.

	:param title: The title of the compound record (usually the name of the compound)
	:param CID: The PubChem compound ID (CID) identifying the record.
	:param description: The description of the compound record.

	Any additional keyword arguments are accepted and ignored.
	"""

	def __init__(self, title: str, CID: int, description: str, **_):
		super().__init__()

		self.title: str = str(title)
		self.CID: int = int(CID)
		self.description: str = str(description)

		# Property cache. ``None`` marks a property which has not been retrieved yet.
		self._properties: Dict = {prop: None for prop in valid_properties}
		self.record_type: str = "2d"
		self._synonyms: Optional[List[str]] = None

		# Pre-cache all properties
		# self.get_properties("all")

		self._has_full_record: bool = False

	@property
	def __dict__(self):  # noqa: MAN002
		# NOTE: reading ``_record``, ``_atoms`` and ``_bonds`` requests the full record
		# if it has not been retrieved yet.
		return dict(
				title=self.title,
				CID=self.CID,
				description=self.description,
				properties=self._properties,
				record_type=self.record_type,
				counts=self._record["counts"],
				atoms=self._atoms,
				bonds=self._bonds,
				)

	def __repr__(self) -> str:
		return f"Compound({self.cid})" if self.cid else "Compound()"

	def to_series(self) -> Series:
		"""
		Return a pandas :class:`~pandas.Series` containing Compound data.
		"""

		return Series(dict(self))

	@property
	def cid(self) -> int:
		"""
		Returns the ID of this compound.
		"""

		return self.CID

	@property
	def has_full_record(self) -> bool:
		"""
		Returns whether this compound has a full record available.
		"""

		return self._has_full_record

	@memoized_property
	def _record(self) -> Dict[str, Any]:
		"""
		The full record for this compound.

		Retrieving the record also fills in any properties which are present
		in the record but not yet in the property cache.
		"""

		# Only requested when required
		record = parse_full_record(rest_get_full_record(self.CID, "cid", self.record_type))[0]
		self._has_full_record = True

		# Maps the (label, name) pair of a record property
		# to the key it is cached under in ``self._properties``.
		record_property_keys = {
				("SMILES", "Canonical"): "CanonicalSMILES",
				("SMILES", "Isomeric"): "IsomericSMILES",
				("Topological", "Polar Surface Area"): "TPSA",
				("InChIKey", "Standard"): "InChIKey",
				("InChI", "Standard"): "InChI",
				("Count", "Hydrogen Bond Acceptor"): "HBondAcceptorCount",
				("Count", "Hydrogen Bond Donor"): "HBondDonorCount",
				("Count", "Rotatable Bond"): "RotatableBondCount",
				("Molecular Weight", None): "MolecularWeight",
				}

		for prop in record["properties"]:
			key = record_property_keys.get((prop.label, prop.name))

			# Only fill gaps: a value which is already cached (including ``0``) is left untouched,
			# matching the ``is None`` check used by get_properties() and get_property().
			if key is not None and self._properties[key] is None:
				self._properties[key] = prop.value

		# TODO: label='Weight', name='MonoIsotopic',
		#  label='Molecular Formula', name=None,
		#  label='Mass', name='Exact',
		#  label='Log P', name='XLogP3'

		return record

	@memoized_property
	def _atoms(self) -> Optional[Dict[FrozenSet[int], Atom]]:
		"""
		Derive Atom objects from the record.

		:return: The parsed atoms, or :py:obj:`None` if the record has no ``atoms`` section.
		"""

		if "atoms" not in self._record:
			return None

		atoms_dict = self._record["atoms"]
		coords_dict = self._record.get("coords", None)

		return parse_atoms(atoms_dict, coords_dict)

	@memoized_property
	def _bonds(self) -> Optional[Dict[FrozenSet[int], Bond]]:
		"""
		Derive Bond objects from the record.

		:return: The parsed bonds, or :py:obj:`None` if the record has no ``bonds`` section.
		"""

		if "bonds" not in self._record:
			return None

		bonds_dict = self._record["bonds"]
		coords_dict = self._record.get("coords", None)

		return parse_bonds(bonds_dict, coords_dict)

	def precache(self) -> None:
		"""
		Precache all properties for this compound.
		"""

		self.get_properties("all")
		_ = self._atoms
		_ = self._bonds

	@property
	def atoms(self) -> List[Atom]:
		"""
		List of :class:`Atoms <chemistry_tools.pubchem.atom.Atom>` in this Compound, sorted by atom ID.

		The list is empty if the record does not contain any atoms.
		"""

		atoms = self._atoms

		if atoms is None:
			# The record has no "atoms" section.
			return []

		return sorted(atoms.values(), key=lambda x: x.aid)

	@property
	def bonds(self) -> List[Bond]:
		"""
		List of :class:`Bonds <chemistry_tools.pubchem.bond.Bond>` between
		:class:`Atoms <chemistry_tools.pubchem.atom.Atom>` in this Compound,
		sorted by the IDs of the bonded atoms.

		The list is empty if the record does not contain any bonds.
		"""  # noqa: D400

		bonds = self._bonds

		if bonds is None:
			# The record has no "bonds" section.
			return []

		return sorted(bonds.values(), key=lambda x: (x.aid1, x.aid2))

	@property
	def coordinate_type(self) -> Optional[str]:
		"""
		The coordinate type of this compound.

		:return: ``"2d"`` or ``"3d"``, or :py:obj:`None` if the record has no coordinates
			or they are of neither type.
		"""

		# "coords" is optional in the record (see _atoms / _bonds).
		coords = self._record.get("coords", None)

		if not coords:
			return None

		coord_types = coords[0]["type"]

		if CoordinateType.TWO_D in coord_types:
			return "2d"
		elif CoordinateType.THREE_D in coord_types:
			return "3d"

		return None

	@property
	def elements(self) -> List[str]:
		"""
		List of element symbols for atoms in this Compound.
		"""

		return [a.element for a in self.atoms]

	def get_properties(self, properties: Union[Sequence[str], str]) -> Dict[str, Any]:
		"""
		Returns the requested properties for the Compound.

		Properties which have been retrieved before are served from the cache;
		only the remaining ones are requested from the API.

		:param properties: The properties to retrieve for the compound.
			Can be either a comma-separated string or a list.
			The string ``"all"`` (case-insensitive) selects every valid property.
			See :ref:`the table at the start of this chapter <properties table>` for a list of valid properties.

		:return: Dictionary mapping the property names to their values
		"""

		if isinstance(properties, str) and properties.lower() == "all":
			properties = list(valid_properties.keys())

		properties = force_valid_properties(properties)

		cached_properties = []
		properties_to_get = []

		for prop in properties:
			if self._properties[prop] is not None:
				cached_properties.append(prop)
			else:
				properties_to_get.append(prop)

		output = {}

		if properties_to_get:
			# print("Getting from API")
			# Only ask the API for what is missing from the cache.
			data = rest_get_properties_json(self.CID, "cid", properties_to_get)
			new_properties = parse_properties(data)[0]

			for prop in properties_to_get:
				self._properties[prop] = new_properties[prop]
				output[prop] = new_properties[prop]

		for prop in cached_properties:
			# print("Getting from cache")
			output[prop] = self._properties[prop]

		return output

	def get_property(self, prop: str) -> Any:
		"""
		Get a single property for the compound.

		The value is served from the cache if it has been retrieved before.

		:param prop: The property to retrieve for the compound.
			See :ref:`the table at the start of this chapter <properties table>` for a list of valid properties.

		:raises ValueError: If ``prop`` is not a known property.
		"""

		prop = str(prop)

		if prop not in self._properties:
			raise ValueError(f"Unknown property '{prop}'")

		if self._properties[prop] is not None:
			# print("Getting from cache")
			return self._properties[prop]

		# print("Getting from API")
		data = rest_get_properties_json(self.CID, "cid", prop)
		new_properties = parse_properties(data)[0]
		self._properties[prop] = new_properties[prop]

		return new_properties[prop]

	@property
	def synonyms(self) -> Optional[List[str]]:
		"""
		Returns a list of synonyms for the Compound.

		:raises ValueError: If the synonyms returned are for a different CID.
		"""

		# ``is None`` rather than truthiness, so an empty list of synonyms
		# is cached too instead of being requested again on every access.
		if self._synonyms is None:
			data = get_synonyms(self.CID, "cid")[0]

			if data["CID"] != self.CID:
				raise ValueError("Wrong compound returned")

			self._synonyms = data["synonyms"]

		return self._synonyms

	@classmethod
	def from_cid(cls: Type['C'], cid: Union[str, int], record_type: str = "2d") -> "Compound":
		"""
		Returns the Compound object for the compound with the given CID.

		:param cid: The PubChem compound ID to look up.
		:param record_type: The record type to assign to the returned compound.

		:raises ValueError: If the compound returned by the lookup has a different CID.
		"""

		# this package
		from chemistry_tools.pubchem.lookup import get_compounds

		comp = get_compounds(cid, "cid")[0]

		if comp.CID != int(cid):
			raise ValueError("Wrong compound returned")

		comp.record_type = record_type

		return comp

	# Convenience attributes for some properties

	@property
	def molecular_formula(self) -> Formula:
		"""
		Molecular formula.
		"""

		return self.get_property("MolecularFormula")

	@property
	def canonical_smiles(self) -> str:
		"""
		Canonical SMILES, with no stereochemistry information.
		"""

		return self.get_property("CanonicalSMILES")

	smiles = canonical_smiles

	@property
	def charge(self) -> int:
		"""
		The charge of the compound.
		"""

		return self.get_property("Charge")

	@property
	def molecular_weight(self) -> float:
		"""
		Molecular Weight.
		"""

		return float(self.get_property("MolecularWeight"))

	molecular_mass = molecular_weight

	@memoized_property
	def canonicalized(self) -> bool:
		"""
		Whether the compound is canonicalized.
		"""

		for prop in self._record["properties"]:
			if prop.label == "Compound" and prop.name == "Canonicalized":
				return bool(prop.value)

		return False

	def get_iupac_name(self, type_: str = "Systematic") -> Optional[str]:
		r"""
		Return the IUPAC name of this compound.

		:param type\_: The type of IUPAC name. Matched case-insensitively.

		:return: The name of the requested type, or :py:obj:`None` if the record does not contain one.
		"""

		# Allowed, CAS-like Style, Markup, Preferred, Systematic, Traditional

		# Compare case-insensitively: ``str.capitalize()`` lowercases everything after
		# the first character, so it can never produce e.g. "CAS-like Style".
		wanted = type_.casefold()

		for prop in self._record["properties"]:
			if prop.label != "IUPAC Name":
				continue

			if isinstance(prop.name, str) and prop.name.casefold() == wanted:
				return prop.value

		return None

	@memoized_property
	def iupac_name(self) -> Optional[str]:
		"""
		The preferred IUPAC name of this compound.
		"""

		return self.get_iupac_name("Preferred")

	@memoized_property
	def systematic_name(self) -> Optional[str]:
		"""
		The systematic IUPAC name of this compound.
		"""

		return self.get_iupac_name("Systematic")

	@property
	def fingerprint(self) -> Optional[str]:
		"""
		Raw padded and hex-encoded fingerprint, as returned by the PUG REST API.
		"""

		for prop in self._record["properties"]:
			if prop.label == "Fingerprint" and prop.name == "SubStructure Keys":
				return prop.value

		return None

	@property
	def cactvs_fingerprint(self) -> Optional[str]:
		"""
		PubChem CACTVS fingerprint.

		Each bit in the fingerprint represents the presence or absence of one of 881 chemical substructures.

		.. seealso:: ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
		"""  # noqa: D403

		# Read the property once; each access scans the record's properties.
		fingerprint = self.fingerprint

		# Skip first 4 bytes (contain length of fingerprint) and last 7 bits (padding) then re-pad to 881 bits
		if fingerprint:
			return f"{int(fingerprint[8:], 16):020b}"[:-7].zfill(881)
		else:
			return None

	# @memoized_property
	# def hill_formula(self):
	# 	element_count = Counter(self.elements)
	# 	hill = []
	#
	# 	alphabet = sorted(ELEMENTS.symbols)
	#
	# 	if "C" in element_count:
	# 		hill.append("C")
	# 		alphabet.remove("C")
	# 		count = element_count["C"]
	# 		if count > 1:
	# 			hill.append(f"<sub>{count}</sub>")
	# 		if "H" in element_count:
	# 			hill.append("H")
	# 			alphabet.remove("H")
	# 			count = element_count["H"]
	# 			if count > 1:
	# 				hill.append(f"<sub>{count}</sub>")
	#
	# 	for element in alphabet:
	# 		if element in element_count:
	# 			hill.append(element)
	# 			count = element_count[element]
	# 			if count > 1:
	# 				hill.append(f"<sub>{count}</sub>")
	#
	# 	return "".join(hill)

	# TODO from record:
	#  charge
	#  properties
	#    label='Compound', name='Canonicalized'
	#    label='Compound Complexity', name=None
	#  cid
	#  counts

	# TODO:
def compounds_to_frame(compounds: Union[Compound, List[Compound]]) -> DataFrame:
	"""
	Build a :class:`~.pandas.DataFrame` from one or more
	:class:`~chemistry_tools.pubchem.compound.Compound` objects.

	:param compounds: Either a single compound or a list of compounds.
		A single compound is handled as a list containing just that compound.

	:return: A DataFrame built from the ``dict()`` of each compound, indexed by ``CID``.
	"""  # noqa: D400

	compound_list = [compounds] if isinstance(compounds, Compound) else compounds
	records = [dict(compound) for compound in compound_list]

	return DataFrame.from_records(records, index="CID")