diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..059eb39
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,414 @@
+
+# Created by https://www.toptal.com/developers/gitignore/api/python,intellij,pycharm,macos,windows,linux,vim
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,intellij,pycharm,macos,windows,linux,vim
+
+### Intellij ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Intellij Patch ###
+# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
+
+# *.iml
+# modules.xml
+# .idea/misc.xml
+# *.ipr
+
+# Sonarlint plugin
+# https://plugins.jetbrains.com/plugin/7973-sonarlint
+.idea/**/sonarlint/
+
+# SonarQube Plugin
+# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
+.idea/**/sonarIssues.xml
+
+# Markdown Navigator plugin
+# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
+.idea/**/markdown-navigator.xml
+.idea/**/markdown-navigator-enh.xml
+.idea/**/markdown-navigator/
+
+# Cache file creation bug
+# See https://youtrack.jetbrains.com/issue/JBR-2257
+.idea/$CACHE_FILE$
+
+# CodeStream plugin
+# https://plugins.jetbrains.com/plugin/12206-codestream
+.idea/codestream.xml
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### PyCharm ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+
+# AWS User-specific
+
+# Generated files
+
+# Sensitive or high-churn files
+
+# Gradle
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+
+# Mongo Explorer plugin
+
+# File-based project format
+
+# IntelliJ
+
+# mpeltonen/sbt-idea plugin
+
+# JIRA plugin
+
+# Cursive Clojure plugin
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+
+# Editor-based Rest Client
+
+# Android studio 3.1+ serialized cache file
+
+### PyCharm Patch ###
+# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
+
+# *.iml
+# modules.xml
+# .idea/misc.xml
+# *.ipr
+
+# Sonarlint plugin
+# https://plugins.jetbrains.com/plugin/7973-sonarlint
+
+# SonarQube Plugin
+# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
+
+# Markdown Navigator plugin
+# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
+
+# Cache file creation bug
+# See https://youtrack.jetbrains.com/issue/JBR-2257
+
+# CodeStream plugin
+# https://plugins.jetbrains.com/plugin/12206-codestream
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+### Vim ###
+# Swap (comment out the "!*.svg" line if you don't need vector files)
+[._]*.s[a-v][a-z]
+!*.svg
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+
+# Session
+Session.vim
+Sessionx.vim
+
+# Temporary
+.netrwhist
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
+
+### Windows ###
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+# End of https://www.toptal.com/developers/gitignore/api/python,intellij,pycharm,macos,windows,linux,vim
+
+# Don't commit example results
+ChEMBL_1614027_*
diff --git a/ChEMBL_1614027.gz b/ChEMBL_1614027.gz
deleted file mode 100644
index c1feac3..0000000
Binary files a/ChEMBL_1614027.gz and /dev/null differ
diff --git a/NAA_Workflow_ChEMBL.ipynb b/NAA_Workflow_ChEMBL.ipynb
index 67775d9..ece81b0 100644
--- a/NAA_Workflow_ChEMBL.ipynb
+++ b/NAA_Workflow_ChEMBL.ipynb
@@ -8,7 +8,7 @@
"\n",
"1. Data curation and clean up\n",
"2. Run Nonadditivity Analysis\n",
- "3. Generate Plots\n"
+ "3. Generate Plots"
]
},
{
@@ -28,28 +28,28 @@
"from rdkit.Chem.MolStandardize.standardize import canonicalize_tautomer_smiles\n",
"from rdkit.Chem import rdFMCS\n",
"rdBase.DisableLog('rdApp.info')\n",
- "\n",
+ "import os\n",
+ "from textwrap import dedent\n",
"import sys\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
- "\n",
+ "from rdkit.Chem import PandasTools\n",
"from scipy import stats\n",
"from scipy.stats import normaltest\n",
"from PIL import Image\n",
"from PIL import ImageFont\n",
"from PIL import ImageDraw\n",
- "\n",
+ "from tqdm.auto import tqdm\n",
"import multiprocessing as mp\n",
- "from multiprocessing import Process, Pipe\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Set path for reading and writing data"
+ "from multiprocessing import Process, Pipe\n",
+ "\n",
+ "import pystow\n",
+ "import chembl_downloader\n",
+ "from nonadditivity_az.utils import get_processed_assay_df\n",
+ "from nonadditivity_az.plotting import NA_distribution, plot_outliers, draw_image\n",
+ "from nonadditivity.api import run_nonadd_calculation_helper"
]
},
{
@@ -58,24 +58,7 @@
"metadata": {},
"outputs": [],
"source": [
- "my_path = './naa/ChEMBL_1614027/'\n",
- "my_name = 'ChEMBL_1614027'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# STEP I"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Data Curation\n",
- "\n",
- "* Data example for ChEMBL1614027"
+ "sns.set_style(\"white\")"
]
},
{
@@ -84,14 +67,9 @@
"metadata": {},
"outputs": [],
"source": [
- "data = pd.read_csv(my_path+'ChEMBL_1614027.gz', compression='gzip', header=0, sep=';', error_bad_lines=False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Rearrange columns and keep the ones necessary"
+ "# https://www.rdkit.org/docs/Cookbook.html\n",
+ "from rdkit.Chem.Draw import IPythonConsole\n",
+ "IPythonConsole.ipython_useSVG = True"
]
},
{
@@ -100,18 +78,8 @@
"metadata": {},
"outputs": [],
"source": [
- "def rearrange(df):\n",
- " df = df.rename(columns=({'Molecule ChEMBL ID':'COMPOUND_NAME', 'Smiles': 'SMILES', 'Standard Value': 'VALUE', 'Standard Value' : 'VALUE', 'Standard Units': 'UNIT', 'Standard Type': 'ENDPOINT'}))\n",
- " df = df[['SMILES', 'COMPOUND_NAME', 'ENDPOINT', 'Standard Relation', 'VALUE', 'UNIT']]\n",
- " \n",
- " return df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Remove NaNs from SMILES column"
+ "import matplotlib_inline\n",
+ "matplotlib_inline.backend_inline.set_matplotlib_formats(\"svg\")"
]
},
{
@@ -120,17 +88,14 @@
"metadata": {},
"outputs": [],
"source": [
- "def discard_nan_smiles(df):\n",
- " df = df.dropna(subset = ['SMILES'])\n",
- " \n",
- " return(df)"
+ "PandasTools.ChangeMoleculeRendering(renderer=\"SVG\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Deleting uncertain values"
+ "### Select the ChEMBL assay to analyze"
]
},
{
@@ -139,431 +104,690 @@
"metadata": {},
"outputs": [],
"source": [
- "def discard_uncertain_values(df):\n",
- " df = df[df['Standard Relation'] != \"'>'\"]\n",
- " df = df[df['Standard Relation'] != \"'<'\"]\n",
- " df['VALUE'] = df['VALUE'].astype(float)\n",
- " df = df[df['VALUE'] > 0] #in case one needs to delete negative values\n",
- " df = df.drop(columns=['Standard Relation'])\n",
- "\n",
- " return(df)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Converting values to logged ones"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "unit_conversion = {\n",
- " 'M': 1,\n",
- " 'mM': 1000,\n",
- " 'uM': 1000000,\n",
- " 'nM': 1000000000,\n",
- " 'pM': 1000000000000,\n",
- " 'fM': 1000000000000000 \n",
- "}\n",
- "\n",
- "def log_converstion(x, UNIT):\n",
- " if UNIT not in unit_conversion:\n",
- " return x\n",
- " x= -1 * np.log10(x / unit_conversion[UNIT])\n",
- " return x\n",
- "\n",
- "def create_conversion_column(df):\n",
- " arr = [log_converstion(x['VALUE'], x['UNIT']) for idx, x in df.iterrows()]\n",
- "\n",
- " df['NEW_VALUE'] = arr\n",
- " df = df[df['NEW_VALUE'] > 0]\n",
- " df = df.drop(columns=['VALUE'])\n",
- " \n",
- " # deleting the values that are more than 10 mM and less than 1 fM\n",
- " \n",
- " df = df[df['NEW_VALUE'] > 2]\n",
- " df = df[df['NEW_VALUE'] < 11]\n",
- " \n",
- " return df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Calculating the avarage of the activity and calculating the median"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "def calculate_average(df):\n",
- " df['Median_Value'] = df.groupby(['COMPOUND_NAME'])['NEW_VALUE'].transform('median')\n",
- " df['max_value'] = df.groupby(['COMPOUND_NAME'])['NEW_VALUE'].transform('max')\n",
- " df['min_value'] = df.groupby(['COMPOUND_NAME'])['NEW_VALUE'].transform('min')\n",
- " df['difference'] = df.max_value - df.min_value\n",
- " \n",
- " df = df.drop_duplicates(subset=['COMPOUND_NAME'], keep= 'first')\n",
- "\n",
- " return(df)"
+ "assay_chembl_id = 'CHEMBL1614027'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Delete compounds that have been measured several times in one test and differ more than 2.5 log units"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "def discard_ambiguous_compound_measurements(df, max_thrs = 2.5):\n",
- " \n",
- " df = df[df.difference < max_thrs] \n",
- " \n",
- " df = df[['SMILES', 'COMPOUND_NAME', 'ENDPOINT', 'Median_Value', 'MEASUREMENT']]\n",
- " df = df.rename(columns=({'Median_Value':'VALUE'}))\n",
- " \n",
- " return(df)"
+ "# STEP I"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Standardize molecules using RDkit"
+ "# Data Curation"
]
},
{
"cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "def standardize_rdkit(row, col):\n",
- " smi = row[col]\n",
- "\n",
- " try:\n",
- " mol = Chem.MolFromSmiles(smi) # sanitization is done by default\n",
- " fmol = rdMolStandardize.FragmentParent(mol) # returns largest fragment\n",
- " cmol = rdMolStandardize.ChargeParent(fmol) # uncharges the largest fragment\n",
- " smi = Chem.MolToSmiles(cmol) \n",
- " ssmi = MolStandardize.canonicalize_tautomer_smiles(smi) # returns the canonicalized tautomer\n",
- " tsmi = MolStandardize.rdMolStandardize.StandardizeSmiles(ssmi) # standardize \n",
- " except:\n",
- " tsmi = 'none'\n",
- " \n",
- " return tsmi"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ID | \n",
+ " SMILES | \n",
+ " VALUE | \n",
+ " MEASUREMENT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CHEMBL598952 | \n",
+ " O=c1onc2cnc3ccccc3n12 | \n",
+ " 4.40000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CHEMBL1358313 | \n",
+ " N#CCCn1c(=O)c(-c2cccc(C#N)c2)nc2cnc(Oc3ccccc3)... | \n",
+ " 4.40000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CHEMBL11684 | \n",
+ " CC1(C)Oc2ccc(C#N)cc2[C@H](N2CCCC2=O)[C@H]1O | \n",
+ " 4.40000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CHEMBL1357940 | \n",
+ " CS(=O)(=O)N1CCC2(CCN(c3ccccc3)CC2)CC1 | \n",
+ " 4.40000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " CHEMBL302213 | \n",
+ " NC[C@@H](CC(=O)O)c1ccc(Cl)cc1 | \n",
+ " 4.40000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2871 | \n",
+ " CHEMBL1568083 | \n",
+ " O=C(O)c1cscc1Cc1cccs1 | \n",
+ " 8.60206 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2872 | \n",
+ " CHEMBL1517793 | \n",
+ " C[C@H]1CCC[C@@H](C)N1 | \n",
+ " 8.60206 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2873 | \n",
+ " CHEMBL1513940 | \n",
+ " COc1ccc2c3c([nH]c2c1)[C@@H]1C[C@H]2C(C(=O)O)[C... | \n",
+ " 8.60206 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2874 | \n",
+ " CHEMBL1513508 | \n",
+ " CCOC(=O)OCC1OC(C#Cc2ccc(C(C)(C)C)cc2)C=CC1OC(=... | \n",
+ " 8.60206 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2875 | \n",
+ " CHEMBL1337541 | \n",
+ " CCCCCCCCCC(=O)N[C@@H](CN1CCOCC1)[C@@H](O)c1ccccc1 | \n",
+ " 8.79588 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2876 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ID SMILES \\\n",
+ "0 CHEMBL598952 O=c1onc2cnc3ccccc3n12 \n",
+ "1 CHEMBL1358313 N#CCCn1c(=O)c(-c2cccc(C#N)c2)nc2cnc(Oc3ccccc3)... \n",
+ "2 CHEMBL11684 CC1(C)Oc2ccc(C#N)cc2[C@H](N2CCCC2=O)[C@H]1O \n",
+ "3 CHEMBL1357940 CS(=O)(=O)N1CCC2(CCN(c3ccccc3)CC2)CC1 \n",
+ "4 CHEMBL302213 NC[C@@H](CC(=O)O)c1ccc(Cl)cc1 \n",
+ "... ... ... \n",
+ "2871 CHEMBL1568083 O=C(O)c1cscc1Cc1cccs1 \n",
+ "2872 CHEMBL1517793 C[C@H]1CCC[C@@H](C)N1 \n",
+ "2873 CHEMBL1513940 COc1ccc2c3c([nH]c2c1)[C@@H]1C[C@H]2C(C(=O)O)[C... \n",
+ "2874 CHEMBL1513508 CCOC(=O)OCC1OC(C#Cc2ccc(C(C)(C)C)cc2)C=CC1OC(=... \n",
+ "2875 CHEMBL1337541 CCCCCCCCCC(=O)N[C@@H](CN1CCOCC1)[C@@H](O)c1ccccc1 \n",
+ "\n",
+ " VALUE MEASUREMENT \n",
+ "0 4.40000 1 \n",
+ "1 4.40000 1 \n",
+ "2 4.40000 1 \n",
+ "3 4.40000 1 \n",
+ "4 4.40000 1 \n",
+ "... ... ... \n",
+ "2871 8.60206 1 \n",
+ "2872 8.60206 1 \n",
+ "2873 8.60206 1 \n",
+ "2874 8.60206 1 \n",
+ "2875 8.79588 1 \n",
+ "\n",
+ "[2876 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "def generateStandarizedSmiles(smilesfile, smiles_column):\n",
- "\n",
- " pool = mp.Pool(8) # set number of cores for parallelization\n",
- " stsmi_list = pool.starmap(standardize_rdkit, [(smi, smiles_column) for idx, smi in smilesfile.iterrows()])\n",
- " pool.close()\n",
- " \n",
- " smilesfile[smilesfile.columns[smiles_column]] = stsmi_list\n",
- "\n",
- " return (smilesfile)"
+ "df, infile = get_processed_assay_df(assay_chembl_id)\n",
+ "df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Discard duplicate SMILES\n",
- "\n",
- "- Keep the one with the highest value, i.e. most active one"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "def merge_duplicate_smiles(df):\n",
- " df = df.sort_values('VALUE').drop_duplicates(subset=['SMILES'], keep='last') \n",
- " \n",
- " return(df)"
+ "# STEP II"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Remove molecules with > 70 heavy atoms"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "def discarding_heavy_mols(smi, min_size = 0, max_size = 70):\n",
- " try:\n",
- " mol = Chem.MolFromSmiles(smi, sanitize=False)\n",
- " if min_size <= mol.GetNumHeavyAtoms() <= max_size:\n",
- " return False\n",
- " else:\n",
- " return True\n",
- " except:\n",
- " return True \n",
- " \n",
- "def removeHeavyMols(df, smiles_column):\n",
- " idx = []\n",
- " discard = []\n",
- " for index, row in df.iterrows():\n",
- " #for smi in df.iloc[:,smiles_column]:\n",
- " if discarding_heavy_mols(row[smiles_column]):\n",
- " idx.append(index)\n",
- " discard.append(row.values.tolist())\n",
+ "# NAA\n",
"\n",
- " df.drop(idx, inplace=True)\n",
- " return (df)"
+ "- Code available from GitHub by Christian Kramer: https://github.com/KramerChristian/NonadditivityAnalysis\n",
+ "- Corresponding publication: https://pubs.acs.org/doi/abs/10.1021/acs.jcim.9b00631"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Apply all of the above functionalities and safe file"
+ "### Run NAA on the cleaned-up data set"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "#cmpds: 3082\n",
- "#unique cmpds: 2933\n",
- "#cpds with SMILES: 3047\n",
- "#cpds with values: 3047\n",
- "#cpds after merging multi measurements: 2893\n"
+ "Identifier Column found: ID\n",
+ "Smiles column found: SMILES\n",
+ "Activity column #1: VALUE\n",
+ "Generating MMP Fragments\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "RDKit ERROR: [17:00:49] Can't kekulize mol. Unkekulized atoms: 0 3 6\n",
- "RDKit ERROR: \n",
- "RDKit ERROR: [17:00:49] Can't kekulize mol. Unkekulized atoms: 0 3 6\n",
- "RDKit ERROR: \n",
- "RDKit ERROR: [17:00:49] Can't kekulize mol. Unkekulized atoms: 0 3 6\n",
- "RDKit ERROR: \n",
- "RDKit ERROR: [17:00:49] Can't kekulize mol. Unkekulized atoms: 0 3 6\n",
- "RDKit ERROR: \n",
- "RDKit ERROR: [17:00:49] Can't kekulize mol. Unkekulized atoms: 0 3 6\n",
- "RDKit ERROR: \n",
- "RDKit ERROR: [17:00:49] Can't kekulize mol. Unkekulized atoms: 0 3 6\n",
- "RDKit ERROR: \n"
+ " \r"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "#cpds after merging duplicate SMILES: 2877\n",
- "#cpds < 70 HA: 2876\n"
+ "Indexing MMP Fragments\n"
]
- }
- ],
- "source": [
- "df = rearrange(data)\n",
- "print('#cmpds: ', len(df['COMPOUND_NAME']))\n",
- "print('#unique cmpds: ', len(df['COMPOUND_NAME'].value_counts()))\n",
- "\n",
- "# Counting how many times compounds were measured in tests\n",
- "df['MEASUREMENT'] = df.groupby(['COMPOUND_NAME'])['COMPOUND_NAME'].transform('count')\n",
- "\n",
- "# Discard cmpds without SMILES\n",
- "df = discard_nan_smiles(df)\n",
- "print('#cpds with SMILES: ', len(df.iloc[:,0]))\n",
- "\n",
- "# Discard ambiguous data\n",
- "df = discard_uncertain_values(df)\n",
- "print('#cpds with values: ', len(df.iloc[:,0]))\n",
- "\n",
- "# Convert IC50 to pIC50\n",
- "df = create_conversion_column(df) \n",
- "\n",
- "# Calculate average values and discard cpds with > 2.5 log unit measurement differences\n",
- "df = calculate_average(df)\n",
- "df = discard_ambiguous_compound_measurements(df)\n",
- "print('#cpds after merging multi measurements: ', len(df.iloc[:,0]))\n",
- "\n",
- "# standardize SMILES, merge duplicates and retain higher active one\n",
- "smiles_column = 0\n",
- "df = generateStandarizedSmiles(df, smiles_column)\n",
- "df = df[df['SMILES'] != 'none']\n",
- "df = merge_duplicate_smiles(df)\n",
- "print('#cpds after merging duplicate SMILES: ', len(df.iloc[:,0]))\n",
- "\n",
- "# Remove cpds with > 70 HA\n",
- "smiles_column = 0\n",
- "df = removeHeavyMols(df, smiles_column)\n",
- "print('#cpds < 70 HA: ', len(df.iloc[:,0]))\n",
- "\n",
- "# Rename columns for subsequent NAA\n",
- "df = df.rename(columns=({'COMPOUND_NAME':'ID'}))\n",
- "df = df[['ID', 'SMILES', 'VALUE', 'MEASUREMENT']]\n",
- "\n",
- "# safe file\n",
- "df.to_csv(my_path+my_name+'_curated.csv', index = False)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# STEP II"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# NAA\n",
- "\n",
- "- Code available from gitHub by Christian Kramer: https://github.com/KramerChristian/NonadditivityAnalysis\n",
- "- Corresponoding publication: https://pubs.acs.org/doi/abs/10.1021/acs.jcim.9b00631"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In order to run NAA out of the jupyter notebook, the system variable for the NAA path has to be set.\n",
- "\n",
- "If this is not done automatically, do the following in your commandline (Linux):\n",
- "> cd $CONDA_PREFIX\n",
- "\n",
- "> mkdir -p ./etc/conda/activate.d\n",
- "\n",
- "> mkdir -p ./etc/conda/deactivate.d\n",
- "\n",
- "> touch ./etc/conda/activate.d/env_vars.sh\n",
- "\n",
- "> touch ./etc/conda/deactivate.d/env_vars.sh\n",
- "\n",
- "- change activate.d/env_vars.sh\n",
- "> export NNA=/path/to/nna/\n",
- "- change deactivate.d/env_vars.sh\n",
- "> unset NNA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "def init_naa():\n",
- " # save system variable in a local python variable, otherwise the !python call doesn't work\n",
- " naapath = !echo $NAA\n",
- " naapath = naapath[0]\n",
- " \n",
- " return naapath"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [],
- "source": [
- "def naa(infile, outfile, myprops, myunit, mydelim='comma', myseries='', naapath=init_naa()):\n",
- " print (\"\\nanalyzing: \", infile)\n",
- "\n",
- " if (myseries != '') :\n",
- " print ('\\n\\n', naapath+'Nonadditivity_Analysis.py -in ', infile ,' -delimiter ', mydelim , ' -series_column ', myseries, ' -props ', myprops ,' -units ', myunit ,' -out ', outfile, '\\n\\n')\n",
- " !python {naapath}/Nonadditivity_Analysis.py -in {infile} -delimiter {mydelim} -series_column {myseries} -props {myprops} -units {myunit} -out {outfile}\n",
- " else: \n",
- " print ('\\n\\n', naapath+'Nonadditivity_Analysis.py -in ', infile ,' -delimiter ', mydelim ,' -props ', myprops ,' -units ', myunit ,' -out ', outfile, '\\n\\n')\n",
- " !python {naapath}/Nonadditivity_Analysis.py -in {infile} -delimiter {mydelim} -props {myprops} -units {myunit} -out {outfile}\n",
- " \n",
- " print (\"Done analysing.\\n\")\n",
- " return outfile"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Run NAA on cleanup data set"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {},
- "outputs": [
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \r"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
- "analyzing: ./naa/ChEMBL_1614027/ChEMBL_1614027_curated.csv\n",
- "Identifier Column found: ID\n",
- "Smiles column found: SMILES\n",
- "Activity column #1: VALUE\n",
- "Generating MMP Fragments for ./naa/ChEMBL_1614027/ChEMBL_1614027_curated.csv\n",
- "Indexing MMP Fragments for ./naa/ChEMBL_1614027/ChEMBL_1614027_curated.csv\n",
- "WARNING: Neither ujson nor cjson installed. Falling back to Python's slower built-in json decoder.\n",
- "Analyzing neighborhoods \n",
+ "Analyzing neighborhoods\n",
"Assembling circles\n",
"Writing Output.\n",
"Estimated Experimental Uncertainty\n",
- "for property: VALUE\n",
+ "for property: VALUE\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Output: 100%|██████████████████████████████████████████████████████████| 4086/4086 [00:00<00:00, 53079.47it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"based on 4086 cycles.\n",
"0.36 from normal SD\n",
- "0.30 from MAD\n",
+ "0.30 from MAD\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"0.30 from Median of Medians\n",
- "\n",
- "Done analysing.\n",
"\n"
]
},
{
"data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Compound1 | \n",
+ " Compound2 | \n",
+ " Compound3 | \n",
+ " Compound4 | \n",
+ " SMILES1 | \n",
+ " SMILES2 | \n",
+ " SMILES3 | \n",
+ " SMILES4 | \n",
+ " Series | \n",
+ " Transformation1 | \n",
+ " Transformation2 | \n",
+ " Property | \n",
+ " Prop_Cpd1 | \n",
+ " Prop_Cpd2 | \n",
+ " Prop_Cpd3 | \n",
+ " Prop_Cpd4 | \n",
+ " Nonadditivity | \n",
+ " Circle_ID | \n",
+ " Theo_Quantile | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CHEMBL1531070 | \n",
+ " CHEMBL1442087 | \n",
+ " CHEMBL1555369 | \n",
+ " CHEMBL1330718 | \n",
+ " CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(C(F)(F)F)cc1)c1... | \n",
+ " CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(C(F)(F)F)cc1)c1... | \n",
+ " CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(Cl)cc1)c1ccc(C(... | \n",
+ " CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(Cl)cc1)c1ccccc1 | \n",
+ " | \n",
+ " [*:1][H]>>[*:1]C(=O)OC | \n",
+ " [*:1]C(F)(F)F>>[*:1]Cl | \n",
+ " VALUE | \n",
+ " 5.1 | \n",
+ " 5.5 | \n",
+ " 5.5 | \n",
+ " 5.6 | \n",
+ " -0.5 | \n",
+ " CHEMBL1531070_CHEMBL1442087_CHEMBL1555369_CHEM... | \n",
+ " -0.872 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CHEMBL1566556 | \n",
+ " CHEMBL1398066 | \n",
+ " CHEMBL1486399 | \n",
+ " CHEMBL1396358 | \n",
+ " CCCC/C=C/C(NC(=O)c1ccccc1)c1ccccc1 | \n",
+ " CCCC[C@@H]1C[C@H]1C(NC(=O)c1ccccc1)c1ccccc1 | \n",
+ " CCCC[C@@H]1C[C@H]1C(NC(=O)c1ccco1)c1ccccc1 | \n",
+ " CCCC/C=C/C(NC(=O)c1ccco1)c1ccccc1 | \n",
+ " | \n",
+ " [*:1]/C=C/CCCC>>[*:1][C@@H]1C[C@H]1CCCC | \n",
+ " [*:1]c1ccccc1>>[*:1]c1ccco1 | \n",
+ " VALUE | \n",
+ " 5.0 | \n",
+ " 5.1 | \n",
+ " 4.8 | \n",
+ " 4.8 | \n",
+ " -0.1 | \n",
+ " CHEMBL1566556_CHEMBL1398066_CHEMBL1486399_CHEM... | \n",
+ " -0.160 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CHEMBL1592533 | \n",
+ " CHEMBL1359291 | \n",
+ " CHEMBL1437906 | \n",
+ " CHEMBL1403280 | \n",
+ " COc1ccc(C(=O)N2CCC3(CC2)CCN(c2ccccn2)CC3)cc1 | \n",
+ " Cn1cccc1C(=O)N1CCC2(CC1)CCN(c1ccccn1)CC2 | \n",
+ " Cn1cccc1C(=O)N1CCC2(CCN(Cc3ccncc3)CC2)CC1 | \n",
+ " COc1ccc(C(=O)N2CCC3(CCN(Cc4ccncc4)CC3)CC2)cc1 | \n",
+ " | \n",
+ " [*:1]c1ccc(OC)cc1>>[*:1]c1cccn1C | \n",
+ " [*:1]c1ccccn1>>[*:1]Cc1ccncc1 | \n",
+ " VALUE | \n",
+ " 4.4 | \n",
+ " 4.5 | \n",
+ " 5.0 | \n",
+ " 5.1 | \n",
+ " -0.2 | \n",
+ " CHEMBL1592533_CHEMBL1359291_CHEMBL1437906_CHEM... | \n",
+ " -0.327 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CHEMBL1592533 | \n",
+ " CHEMBL1315700 | \n",
+ " CHEMBL1564545 | \n",
+ " CHEMBL1403280 | \n",
+ " COc1ccc(C(=O)N2CCC3(CC2)CCN(c2ccccn2)CC3)cc1 | \n",
+ " O=C(c1ccncc1)N1CCC2(CC1)CCN(c1ccccn1)CC2 | \n",
+ " O=C(c1ccncc1)N1CCC2(CCN(Cc3ccncc3)CC2)CC1 | \n",
+ " COc1ccc(C(=O)N2CCC3(CCN(Cc4ccncc4)CC3)CC2)cc1 | \n",
+ " | \n",
+ " [*:1]c1ccc(OC)cc1>>[*:1]c1ccncc1 | \n",
+ " [*:1]c1ccccn1>>[*:1]Cc1ccncc1 | \n",
+ " VALUE | \n",
+ " 4.4 | \n",
+ " 5.0 | \n",
+ " 4.7 | \n",
+ " 5.1 | \n",
+ " -1.0 | \n",
+ " CHEMBL1592533_CHEMBL1315700_CHEMBL1564545_CHEM... | \n",
+ " -1.530 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " CHEMBL1437906 | \n",
+ " CHEMBL1359291 | \n",
+ " CHEMBL1315700 | \n",
+ " CHEMBL1564545 | \n",
+ " Cn1cccc1C(=O)N1CCC2(CCN(Cc3ccncc3)CC2)CC1 | \n",
+ " Cn1cccc1C(=O)N1CCC2(CC1)CCN(c1ccccn1)CC2 | \n",
+ " O=C(c1ccncc1)N1CCC2(CC1)CCN(c1ccccn1)CC2 | \n",
+ " O=C(c1ccncc1)N1CCC2(CCN(Cc3ccncc3)CC2)CC1 | \n",
+ " | \n",
+ " [*:1]Cc1ccncc1>>[*:1]c1ccccn1 | \n",
+ " [*:1]c1cccn1C>>[*:1]c1ccncc1 | \n",
+ " VALUE | \n",
+ " 5.0 | \n",
+ " 4.5 | \n",
+ " 5.0 | \n",
+ " 4.7 | \n",
+ " 0.8 | \n",
+ " CHEMBL1437906_CHEMBL1359291_CHEMBL1315700_CHEM... | \n",
+ " 1.330 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 4081 | \n",
+ " CHEMBL1515287 | \n",
+ " CHEMBL1592760 | \n",
+ " CHEMBL1494529 | \n",
+ " CHEMBL1365979 | \n",
+ " O=c1c(CCc2ccccc2)nc2cncnc2n1-c1ccccc1 | \n",
+ " O=c1c(CCc2ccccc2)nc2cncnc2n1C1CC1 | \n",
+ " Cc1nc2cncnc2n(C2CC2)c1=O | \n",
+ " Cc1nc2cncnc2n(-c2ccccc2)c1=O | \n",
+ " | \n",
+ " [*:1]c1ccccc1>>[*:1]C1CC1 | \n",
+ " [*:1]CCc1ccccc1>>[*:1]C | \n",
+ " VALUE | \n",
+ " 4.6 | \n",
+ " 4.6 | \n",
+ " 5.1 | \n",
+ " 4.4 | \n",
+ " 0.7 | \n",
+ " CHEMBL1515287_CHEMBL1592760_CHEMBL1494529_CHEM... | \n",
+ " 1.180 | \n",
+ "
\n",
+ " \n",
+ " 4082 | \n",
+ " CHEMBL1494529 | \n",
+ " CHEMBL1365979 | \n",
+ " CHEMBL1490139 | \n",
+ " CHEMBL1358588 | \n",
+ " Cc1nc2cncnc2n(C2CC2)c1=O | \n",
+ " Cc1nc2cncnc2n(-c2ccccc2)c1=O | \n",
+ " O=c1c(-c2cccs2)nc2cncnc2n1-c1ccccc1 | \n",
+ " O=c1c(-c2cccs2)nc2cncnc2n1C1CC1 | \n",
+ " | \n",
+ " [*:1]C1CC1>>[*:1]c1ccccc1 | \n",
+ " [*:1]C>>[*:1]c1cccs1 | \n",
+ " VALUE | \n",
+ " 5.1 | \n",
+ " 4.4 | \n",
+ " 5.0 | \n",
+ " 4.8 | \n",
+ " 0.9 | \n",
+ " CHEMBL1494529_CHEMBL1365979_CHEMBL1490139_CHEM... | \n",
+ " 1.420 | \n",
+ "
\n",
+ " \n",
+ " 4083 | \n",
+ " CHEMBL1490139 | \n",
+ " CHEMBL1358588 | \n",
+ " CHEMBL1592760 | \n",
+ " CHEMBL1515287 | \n",
+ " O=c1c(-c2cccs2)nc2cncnc2n1-c1ccccc1 | \n",
+ " O=c1c(-c2cccs2)nc2cncnc2n1C1CC1 | \n",
+ " O=c1c(CCc2ccccc2)nc2cncnc2n1C1CC1 | \n",
+ " O=c1c(CCc2ccccc2)nc2cncnc2n1-c1ccccc1 | \n",
+ " | \n",
+ " [*:1]c1ccccc1>>[*:1]C1CC1 | \n",
+ " [*:1]c1cccs1>>[*:1]CCc1ccccc1 | \n",
+ " VALUE | \n",
+ " 5.0 | \n",
+ " 4.8 | \n",
+ " 4.6 | \n",
+ " 4.6 | \n",
+ " 0.2 | \n",
+ " CHEMBL1490139_CHEMBL1358588_CHEMBL1592760_CHEM... | \n",
+ " 0.357 | \n",
+ "
\n",
+ " \n",
+ " 4084 | \n",
+ " CHEMBL1326100 | \n",
+ " CHEMBL1512693 | \n",
+ " CHEMBL1358588 | \n",
+ " CHEMBL1494529 | \n",
+ " Cc1nc2cnc(N(C)C)nc2n(C2CC2)c1=O | \n",
+ " CN(C)c1ncc2nc(-c3cccs3)c(=O)n(C3CC3)c2n1 | \n",
+ " O=c1c(-c2cccs2)nc2cncnc2n1C1CC1 | \n",
+ " Cc1nc2cncnc2n(C2CC2)c1=O | \n",
+ " | \n",
+ " [*:1]C>>[*:1]c1cccs1 | \n",
+ " [*:1]N(C)C>>[*:1][H] | \n",
+ " VALUE | \n",
+ " 4.8 | \n",
+ " 5.5 | \n",
+ " 4.8 | \n",
+ " 5.1 | \n",
+ " -1.0 | \n",
+ " CHEMBL1326100_CHEMBL1512693_CHEMBL1358588_CHEM... | \n",
+ " -1.490 | \n",
+ "
\n",
+ " \n",
+ " 4085 | \n",
+ " CHEMBL1316562 | \n",
+ " CHEMBL1355909 | \n",
+ " CHEMBL1490139 | \n",
+ " CHEMBL1358588 | \n",
+ " O=c1c(-c2cccs2)nc2cnc(N3CCNCC3)nc2n1C1CC1 | \n",
+ " O=c1c(-c2cccs2)nc2cnc(N3CCNCC3)nc2n1-c1ccccc1 | \n",
+ " O=c1c(-c2cccs2)nc2cncnc2n1-c1ccccc1 | \n",
+ " O=c1c(-c2cccs2)nc2cncnc2n1C1CC1 | \n",
+ " | \n",
+ " [*:1]C1CC1>>[*:1]c1ccccc1 | \n",
+ " [*:1]N1CCNCC1>>[*:1][H] | \n",
+ " VALUE | \n",
+ " 6.9 | \n",
+ " 4.7 | \n",
+ " 5.0 | \n",
+ " 4.8 | \n",
+ " 2.4 | \n",
+ " CHEMBL1316562_CHEMBL1355909_CHEMBL1490139_CHEM... | \n",
+ " 2.590 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
4086 rows × 19 columns
\n",
+ "
"
+ ],
"text/plain": [
- "'./naa/ChEMBL_1614027/ChEMBL_1614027_NAA_output.csv'"
+ " Compound1 Compound2 Compound3 Compound4 \\\n",
+ "0 CHEMBL1531070 CHEMBL1442087 CHEMBL1555369 CHEMBL1330718 \n",
+ "1 CHEMBL1566556 CHEMBL1398066 CHEMBL1486399 CHEMBL1396358 \n",
+ "2 CHEMBL1592533 CHEMBL1359291 CHEMBL1437906 CHEMBL1403280 \n",
+ "3 CHEMBL1592533 CHEMBL1315700 CHEMBL1564545 CHEMBL1403280 \n",
+ "4 CHEMBL1437906 CHEMBL1359291 CHEMBL1315700 CHEMBL1564545 \n",
+ "... ... ... ... ... \n",
+ "4081 CHEMBL1515287 CHEMBL1592760 CHEMBL1494529 CHEMBL1365979 \n",
+ "4082 CHEMBL1494529 CHEMBL1365979 CHEMBL1490139 CHEMBL1358588 \n",
+ "4083 CHEMBL1490139 CHEMBL1358588 CHEMBL1592760 CHEMBL1515287 \n",
+ "4084 CHEMBL1326100 CHEMBL1512693 CHEMBL1358588 CHEMBL1494529 \n",
+ "4085 CHEMBL1316562 CHEMBL1355909 CHEMBL1490139 CHEMBL1358588 \n",
+ "\n",
+ " SMILES1 \\\n",
+ "0 CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(C(F)(F)F)cc1)c1... \n",
+ "1 CCCC/C=C/C(NC(=O)c1ccccc1)c1ccccc1 \n",
+ "2 COc1ccc(C(=O)N2CCC3(CC2)CCN(c2ccccn2)CC3)cc1 \n",
+ "3 COc1ccc(C(=O)N2CCC3(CC2)CCN(c2ccccn2)CC3)cc1 \n",
+ "4 Cn1cccc1C(=O)N1CCC2(CCN(Cc3ccncc3)CC2)CC1 \n",
+ "... ... \n",
+ "4081 O=c1c(CCc2ccccc2)nc2cncnc2n1-c1ccccc1 \n",
+ "4082 Cc1nc2cncnc2n(C2CC2)c1=O \n",
+ "4083 O=c1c(-c2cccs2)nc2cncnc2n1-c1ccccc1 \n",
+ "4084 Cc1nc2cnc(N(C)C)nc2n(C2CC2)c1=O \n",
+ "4085 O=c1c(-c2cccs2)nc2cnc(N3CCNCC3)nc2n1C1CC1 \n",
+ "\n",
+ " SMILES2 \\\n",
+ "0 CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(C(F)(F)F)cc1)c1... \n",
+ "1 CCCC[C@@H]1C[C@H]1C(NC(=O)c1ccccc1)c1ccccc1 \n",
+ "2 Cn1cccc1C(=O)N1CCC2(CC1)CCN(c1ccccn1)CC2 \n",
+ "3 O=C(c1ccncc1)N1CCC2(CC1)CCN(c1ccccn1)CC2 \n",
+ "4 Cn1cccc1C(=O)N1CCC2(CC1)CCN(c1ccccn1)CC2 \n",
+ "... ... \n",
+ "4081 O=c1c(CCc2ccccc2)nc2cncnc2n1C1CC1 \n",
+ "4082 Cc1nc2cncnc2n(-c2ccccc2)c1=O \n",
+ "4083 O=c1c(-c2cccs2)nc2cncnc2n1C1CC1 \n",
+ "4084 CN(C)c1ncc2nc(-c3cccs3)c(=O)n(C3CC3)c2n1 \n",
+ "4085 O=c1c(-c2cccs2)nc2cnc(N3CCNCC3)nc2n1-c1ccccc1 \n",
+ "\n",
+ " SMILES3 \\\n",
+ "0 CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(Cl)cc1)c1ccc(C(... \n",
+ "1 CCCC[C@@H]1C[C@H]1C(NC(=O)c1ccco1)c1ccccc1 \n",
+ "2 Cn1cccc1C(=O)N1CCC2(CCN(Cc3ccncc3)CC2)CC1 \n",
+ "3 O=C(c1ccncc1)N1CCC2(CCN(Cc3ccncc3)CC2)CC1 \n",
+ "4 O=C(c1ccncc1)N1CCC2(CC1)CCN(c1ccccn1)CC2 \n",
+ "... ... \n",
+ "4081 Cc1nc2cncnc2n(C2CC2)c1=O \n",
+ "4082 O=c1c(-c2cccs2)nc2cncnc2n1-c1ccccc1 \n",
+ "4083 O=c1c(CCc2ccccc2)nc2cncnc2n1C1CC1 \n",
+ "4084 O=c1c(-c2cccs2)nc2cncnc2n1C1CC1 \n",
+ "4085 O=c1c(-c2cccs2)nc2cncnc2n1-c1ccccc1 \n",
+ "\n",
+ " SMILES4 Series \\\n",
+ "0 CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(Cl)cc1)c1ccccc1 \n",
+ "1 CCCC/C=C/C(NC(=O)c1ccco1)c1ccccc1 \n",
+ "2 COc1ccc(C(=O)N2CCC3(CCN(Cc4ccncc4)CC3)CC2)cc1 \n",
+ "3 COc1ccc(C(=O)N2CCC3(CCN(Cc4ccncc4)CC3)CC2)cc1 \n",
+ "4 O=C(c1ccncc1)N1CCC2(CCN(Cc3ccncc3)CC2)CC1 \n",
+ "... ... ... \n",
+ "4081 Cc1nc2cncnc2n(-c2ccccc2)c1=O \n",
+ "4082 O=c1c(-c2cccs2)nc2cncnc2n1C1CC1 \n",
+ "4083 O=c1c(CCc2ccccc2)nc2cncnc2n1-c1ccccc1 \n",
+ "4084 Cc1nc2cncnc2n(C2CC2)c1=O \n",
+ "4085 O=c1c(-c2cccs2)nc2cncnc2n1C1CC1 \n",
+ "\n",
+ " Transformation1 Transformation2 \\\n",
+ "0 [*:1][H]>>[*:1]C(=O)OC [*:1]C(F)(F)F>>[*:1]Cl \n",
+ "1 [*:1]/C=C/CCCC>>[*:1][C@@H]1C[C@H]1CCCC [*:1]c1ccccc1>>[*:1]c1ccco1 \n",
+ "2 [*:1]c1ccc(OC)cc1>>[*:1]c1cccn1C [*:1]c1ccccn1>>[*:1]Cc1ccncc1 \n",
+ "3 [*:1]c1ccc(OC)cc1>>[*:1]c1ccncc1 [*:1]c1ccccn1>>[*:1]Cc1ccncc1 \n",
+ "4 [*:1]Cc1ccncc1>>[*:1]c1ccccn1 [*:1]c1cccn1C>>[*:1]c1ccncc1 \n",
+ "... ... ... \n",
+ "4081 [*:1]c1ccccc1>>[*:1]C1CC1 [*:1]CCc1ccccc1>>[*:1]C \n",
+ "4082 [*:1]C1CC1>>[*:1]c1ccccc1 [*:1]C>>[*:1]c1cccs1 \n",
+ "4083 [*:1]c1ccccc1>>[*:1]C1CC1 [*:1]c1cccs1>>[*:1]CCc1ccccc1 \n",
+ "4084 [*:1]C>>[*:1]c1cccs1 [*:1]N(C)C>>[*:1][H] \n",
+ "4085 [*:1]C1CC1>>[*:1]c1ccccc1 [*:1]N1CCNCC1>>[*:1][H] \n",
+ "\n",
+ " Property Prop_Cpd1 Prop_Cpd2 Prop_Cpd3 Prop_Cpd4 Nonadditivity \\\n",
+ "0 VALUE 5.1 5.5 5.5 5.6 -0.5 \n",
+ "1 VALUE 5.0 5.1 4.8 4.8 -0.1 \n",
+ "2 VALUE 4.4 4.5 5.0 5.1 -0.2 \n",
+ "3 VALUE 4.4 5.0 4.7 5.1 -1.0 \n",
+ "4 VALUE 5.0 4.5 5.0 4.7 0.8 \n",
+ "... ... ... ... ... ... ... \n",
+ "4081 VALUE 4.6 4.6 5.1 4.4 0.7 \n",
+ "4082 VALUE 5.1 4.4 5.0 4.8 0.9 \n",
+ "4083 VALUE 5.0 4.8 4.6 4.6 0.2 \n",
+ "4084 VALUE 4.8 5.5 4.8 5.1 -1.0 \n",
+ "4085 VALUE 6.9 4.7 5.0 4.8 2.4 \n",
+ "\n",
+ " Circle_ID Theo_Quantile \n",
+ "0 CHEMBL1531070_CHEMBL1442087_CHEMBL1555369_CHEM... -0.872 \n",
+ "1 CHEMBL1566556_CHEMBL1398066_CHEMBL1486399_CHEM... -0.160 \n",
+ "2 CHEMBL1592533_CHEMBL1359291_CHEMBL1437906_CHEM... -0.327 \n",
+ "3 CHEMBL1592533_CHEMBL1315700_CHEMBL1564545_CHEM... -1.530 \n",
+ "4 CHEMBL1437906_CHEMBL1359291_CHEMBL1315700_CHEM... 1.330 \n",
+ "... ... ... \n",
+ "4081 CHEMBL1515287_CHEMBL1592760_CHEMBL1494529_CHEM... 1.180 \n",
+ "4082 CHEMBL1494529_CHEMBL1365979_CHEMBL1490139_CHEM... 1.420 \n",
+ "4083 CHEMBL1490139_CHEMBL1358588_CHEMBL1592760_CHEM... 0.357 \n",
+ "4084 CHEMBL1326100_CHEMBL1512693_CHEMBL1358588_CHEM... -1.490 \n",
+ "4085 CHEMBL1316562_CHEMBL1355909_CHEMBL1490139_CHEM... 2.590 \n",
+ "\n",
+ "[4086 rows x 19 columns]"
]
},
- "execution_count": 43,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "infile = my_path+my_name+'_curated.csv'\n",
- "outfile = my_path+my_name+'_NAA_output.csv'\n",
- "\n",
- "naa(infile, outfile, 'VALUE', 'noconv', 'comma')"
+ "MAIN, PC, _ = run_nonadd_calculation_helper(\n",
+ " infile=infile,\n",
+ " props=['VALUE'],\n",
+ " units=['noconv'],\n",
+ ")\n",
+ "MAIN"
]
},
{
@@ -580,16 +804,6 @@
"### Generate plots for analysing NAA output"
]
},
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "curated = my_path+my_name+'_curated.csv'\n",
- "curated = pd.read_csv(curated, sep=',')"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -601,185 +815,2785 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"add_thrs = 0\n",
"exp_noise = 0.5\n",
- "significant_thrs = 2*exp_noise\n",
- "strong_thrs = 2*significant_thrs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "MAIN = my_path+my_name+'_NAA_output.csv'\n",
- "MAIN = pd.read_csv(MAIN, sep='\\t')\n",
+ "significant_thrs = 2 * exp_noise\n",
+ "strong_thrs = 2 * significant_thrs\n",
"\n",
"MAIN['Nonadditivity_abs'] = MAIN['Nonadditivity'].abs()\n",
- "MAIN_log0 = MAIN[MAIN['Nonadditivity_abs'] > add_thrs]\n",
- "MAIN_log1 = MAIN[MAIN['Nonadditivity_abs'] > significant_thrs]\n",
- "MAIN_log2 = MAIN_log1[MAIN_log1['Nonadditivity_abs'] > strong_thrs]"
+ "MAIN_log0 = MAIN[MAIN['Nonadditivity'].abs() > add_thrs]\n",
+ "MAIN_log1 = MAIN[MAIN['Nonadditivity'].abs() > significant_thrs]\n",
+ "MAIN_log2 = MAIN[MAIN['Nonadditivity'].abs() > strong_thrs]"
]
},
{
- "cell_type": "code",
- "execution_count": 21,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "PC = my_path+my_name+'_NAA_output_perCompound.txt'\n",
- "PC = pd.read_csv(PC, sep='\\t')\n",
- "\n",
- "PC['Nonadd_abs'] = PC['Nonadd_pC'].abs()\n",
- "PC['CI'] = (2*2*exp_noise/(np.sqrt(PC['nOccurence'])))\n",
- "PC['CI_2'] = (2*2*exp_noise/(np.sqrt(PC['nOccurence'])))*3\n",
- "PC = PC.sort_values(by=['CI'], ascending=False)\n",
- "\n",
- "PC_log0 = PC[PC['Nonadd_abs'] > add_thrs]\n",
- "PC_log1 = PC[PC['Nonadd_abs'] > significant_thrs]\n",
- "PC_log2 = PC_log1[PC_log1['Nonadd_abs'] > strong_thrs]"
+ "### Check for normality "
]
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2876 compounds\n",
- "4086 cycles\n",
- "486 cycles with significant NA score ( 11.9 % )\n",
- "76 unique compounds show significant NA shift ( 2.6 % )\n",
- "13 unique compounds show strong NA ( 0.5 % )\n"
- ]
+ "data": {
+ "text/plain": [
+ "DescribeResult(nobs=4086, minmax=(-4.2, 3.4), mean=-0.006044999812457169, variance=0.528509343779737, skewness=-0.24855197856455374, kurtosis=3.212947149974866)"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "print(len(curated.iloc[:,0]), 'compounds')\n",
- "print(len(MAIN.iloc[:,0]), 'cycles')\n",
- "print(len(MAIN_log1.iloc[:,0]), 'cycles with significant NA score', '(',round(len(MAIN_log1.iloc[:,0])/len(MAIN.iloc[:,0])*100,1), '% )')\n",
- "print(len(PC_log1['Compound_ID'].value_counts()), 'unique compounds show significant NA shift', '(',round(len(PC_log1['Compound_ID'])/len(curated.iloc[:,0])*100,1), '% )')\n",
- "print(len(PC_log2['Compound_ID'].value_counts()), 'unique compounds show strong NA', '(',round(len(PC_log2['Compound_ID'])/len(curated.iloc[:,0])*100,1), '% )')"
+ "MAIN_array = MAIN['Nonadditivity'].values\n",
+ "stats.describe(MAIN_array, axis=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Check for normality "
+ "### Nonadditivity Distribution\n",
+ "\n",
+    "normal distribution parameters depend on significant threshold"
]
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 11,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/cthoyt/.virtualenvs/cheminf/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
+ " warnings.warn(msg, FutureWarning)\n",
+ "/Users/cthoyt/.virtualenvs/cheminf/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).\n",
+ " warnings.warn(msg, FutureWarning)\n"
+ ]
+ },
{
"data": {
+ "image/svg+xml": [
+ "\n",
+ "\n",
+ "\n"
+ ],
"text/plain": [
- "DescribeResult(nobs=4086, minmax=(-4.2, 3.7), mean=-0.008981839302667646, variance=0.5284652015619267, skewness=-0.18666235101379203, kurtosis=3.212527050178692)"
+ ""
]
},
- "execution_count": 23,
"metadata": {},
- "output_type": "execute_result"
+ "output_type": "display_data"
}
],
"source": [
- "MAIN_array = MAIN['Nonadditivity'].values\n",
- "stats.describe(MAIN_array, axis=0)"
+ "fig, ax = NA_distribution(MAIN['Nonadditivity'], significant_thrs)\n",
+ "ax.set_title(f'{assay_chembl_id}\\n{ax.get_title()}')\n",
+ "# fig.savefig('nonadditivity_distribution.png', dpi=300)\n",
+ "plt.show()"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Statistics=335.764, p=0.000\n",
- "Sample does not look Gaussian (reject H0)\n"
- ]
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Compound_ID | \n",
+ " SMILES | \n",
+ " Series | \n",
+ " Property | \n",
+ " Operator | \n",
+ " Measured | \n",
+ " Nonadd_pC | \n",
+ " nOccurence | \n",
+ " Nonadd_SD | \n",
+ " Nonadd_abs | \n",
+ " CI | \n",
+ " CI_2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CHEMBL1531070 | \n",
+ " CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(C(F)(F)F)cc1)c1... | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.1 | \n",
+ " -0.500000 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 0.500000 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 297 | \n",
+ " CHEMBL1568735 | \n",
+ " COc1cccc(-c2nc(=NCc3ccccc3OC)cc[nH]2)c1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.9 | \n",
+ " -1.800000 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 1.800000 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 281 | \n",
+ " CHEMBL1552519 | \n",
+ " c1ccc(CN=c2[nH]cnc3ccc(-c4ccoc4)cc23)cc1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.8 | \n",
+ " 0.000047 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 0.000047 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 282 | \n",
+ " CHEMBL1370296 | \n",
+ " Cc1cccc(CN=c2[nH]cnc3ccc(-c4ccoc4)cc23)c1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " -0.000047 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 0.000047 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 283 | \n",
+ " CHEMBL491771 | \n",
+ " Cc1cccc(CN=c2[nH]cnc3ccc(-c4cccc(NS(C)(=O)=O)c... | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 6.3 | \n",
+ " 1.700000 | \n",
+ " 1 | \n",
+ " 0.00 | \n",
+ " 1.700000 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 531 | \n",
+ " CHEMBL1490528 | \n",
+ " COCCn1c(=O)c(-c2cccs2)nc2cnc(N3CCNCC3)nc21 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.7 | \n",
+ " 0.310000 | \n",
+ " 67 | \n",
+ " 0.76 | \n",
+ " 0.310000 | \n",
+ " 0.244339 | \n",
+ " 0.733017 | \n",
+ "
\n",
+ " \n",
+ " 464 | \n",
+ " CHEMBL1433704 | \n",
+ " COCCn1c(=O)c(-c2ccccc2)nc2cnc(Oc3ccccc3)nc21 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.0 | \n",
+ " -0.210000 | \n",
+ " 67 | \n",
+ " 0.39 | \n",
+ " 0.210000 | \n",
+ " 0.244339 | \n",
+ " 0.733017 | \n",
+ "
\n",
+ " \n",
+ " 428 | \n",
+ " CHEMBL1481510 | \n",
+ " O=c1c(-c2ccccc2)nc2cnc(N3CCNCC3)nc2n1C1CC1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 6.8 | \n",
+ " 0.970000 | \n",
+ " 67 | \n",
+ " 0.84 | \n",
+ " 0.970000 | \n",
+ " 0.244339 | \n",
+ " 0.733017 | \n",
+ "
\n",
+ " \n",
+ " 470 | \n",
+ " CHEMBL1472732 | \n",
+ " COCCn1c(=O)c(-c2ccccc2)nc2cnc(N3CCNCC3)nc21 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.5 | \n",
+ " 0.002900 | \n",
+ " 69 | \n",
+ " 0.68 | \n",
+ " 0.002900 | \n",
+ " 0.240772 | \n",
+ " 0.722315 | \n",
+ "
\n",
+ " \n",
+ " 545 | \n",
+ " CHEMBL1396809 | \n",
+ " COc1cccc(Cn2c(=O)c(CCc3ccccc3)nc3cncnc32)c1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 4.9 | \n",
+ " -0.047000 | \n",
+ " 73 | \n",
+ " 0.52 | \n",
+ " 0.047000 | \n",
+ " 0.234082 | \n",
+ " 0.702247 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
941 rows × 12 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Compound_ID SMILES Series \\\n",
+ "0 CHEMBL1531070 CCC/C=C(\\CCC)C(NS(=O)(=O)c1ccc(C(F)(F)F)cc1)c1... NaN \n",
+ "297 CHEMBL1568735 COc1cccc(-c2nc(=NCc3ccccc3OC)cc[nH]2)c1 NaN \n",
+ "281 CHEMBL1552519 c1ccc(CN=c2[nH]cnc3ccc(-c4ccoc4)cc23)cc1 NaN \n",
+ "282 CHEMBL1370296 Cc1cccc(CN=c2[nH]cnc3ccc(-c4ccoc4)cc23)c1 NaN \n",
+ "283 CHEMBL491771 Cc1cccc(CN=c2[nH]cnc3ccc(-c4cccc(NS(C)(=O)=O)c... NaN \n",
+ ".. ... ... ... \n",
+ "531 CHEMBL1490528 COCCn1c(=O)c(-c2cccs2)nc2cnc(N3CCNCC3)nc21 NaN \n",
+ "464 CHEMBL1433704 COCCn1c(=O)c(-c2ccccc2)nc2cnc(Oc3ccccc3)nc21 NaN \n",
+ "428 CHEMBL1481510 O=c1c(-c2ccccc2)nc2cnc(N3CCNCC3)nc2n1C1CC1 NaN \n",
+ "470 CHEMBL1472732 COCCn1c(=O)c(-c2ccccc2)nc2cnc(N3CCNCC3)nc21 NaN \n",
+ "545 CHEMBL1396809 COc1cccc(Cn2c(=O)c(CCc3ccccc3)nc3cncnc32)c1 NaN \n",
+ "\n",
+ " Property Operator Measured Nonadd_pC nOccurence Nonadd_SD \\\n",
+ "0 VALUE NaN 5.1 -0.500000 1 0.00 \n",
+ "297 VALUE NaN 5.9 -1.800000 1 0.00 \n",
+ "281 VALUE NaN 5.8 0.000047 1 0.00 \n",
+ "282 VALUE NaN 6.0 -0.000047 1 0.00 \n",
+ "283 VALUE NaN 6.3 1.700000 1 0.00 \n",
+ ".. ... ... ... ... ... ... \n",
+ "531 VALUE NaN 5.7 0.310000 67 0.76 \n",
+ "464 VALUE NaN 5.0 -0.210000 67 0.39 \n",
+ "428 VALUE NaN 6.8 0.970000 67 0.84 \n",
+ "470 VALUE NaN 5.5 0.002900 69 0.68 \n",
+ "545 VALUE NaN 4.9 -0.047000 73 0.52 \n",
+ "\n",
+ " Nonadd_abs CI CI_2 \n",
+ "0 0.500000 2.000000 6.000000 \n",
+ "297 1.800000 2.000000 6.000000 \n",
+ "281 0.000047 2.000000 6.000000 \n",
+ "282 0.000047 2.000000 6.000000 \n",
+ "283 1.700000 2.000000 6.000000 \n",
+ ".. ... ... ... \n",
+ "531 0.310000 0.244339 0.733017 \n",
+ "464 0.210000 0.244339 0.733017 \n",
+ "428 0.970000 0.244339 0.733017 \n",
+ "470 0.002900 0.240772 0.722315 \n",
+ "545 0.047000 0.234082 0.702247 \n",
+ "\n",
+ "[941 rows x 12 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "stat, p = normaltest(MAIN_array)\n",
- "print('Statistics=%.3f, p=%.3f' % (stat, p))\n",
- "\n",
- "alpha = 0.05\n",
- "if p > alpha:\n",
- " print('Sample looks Gaussian (fail to reject H0)')\n",
- "else:\n",
- " print('Sample does not look Gaussian (reject H0)')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Nonadditivity Distribution\n",
- "\n",
- "normal distribution parameters depend on significant treshold"
+ "PC['Nonadd_abs'] = PC['Nonadd_pC'].abs()\n",
+ "PC['CI'] = (2*2*exp_noise/(np.sqrt(PC['nOccurence'])))\n",
+ "PC['CI_2'] = (2*2*exp_noise/(np.sqrt(PC['nOccurence']))) * 3\n",
+ "PC = PC.sort_values(by=['CI'], ascending=False)\n",
+ "PC"
]
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
- "def NA_distribution(x, hist=True, kde=True, ins=int(100/5), color='crimson', kde_kws={'shade': True, 'linewidth': 2}):\n",
- " sns.set_style('ticks')\n",
- " fig, ax = plt.subplots(1,1)\n",
- " fig.set_size_inches(10, 8)\n",
- " sns.distplot(x, hist=True, kde=True,\n",
- " bins=int(100/5), color='crimson',\n",
- " kde_kws={'shade': True, 'linewidth': 2})\n",
- " sns.distplot(normal_dist, hist=False, kde=True,\n",
- " color='grey',\n",
- " kde_kws={'shade': True, 'linewidth': 2})\n",
- "\n",
- " legend = ['Real','Theoretical']\n",
- "\n",
- " plt.legend(legend, prop={'size': 20}) #title = '')\n",
- " plt.title('', size=30)\n",
- " plt.xlabel('Nonadditivity', size=25)\n",
- " plt.ylabel('Density', size=25)\n",
- " \n",
- " plt.show()"
+ "PC_log0 = PC[PC['Nonadd_pC'].abs() > add_thrs]\n",
+ "PC_log1 = PC[PC['Nonadd_pC'].abs() > significant_thrs]\n",
+ "PC_log2 = PC[PC['Nonadd_pC'].abs() > strong_thrs]"
]
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2876 compounds\n",
+ "4086 cycles\n",
+ "486 cycles with significant NA score ( 11.9 % )\n",
+ "76 unique compounds show significant NA shift ( 2.6 % )\n",
+ "13 unique compounds show strong NA ( 0.5 % )\n"
+ ]
}
],
"source": [
- "n_of_cycles = len(MAIN.iloc[:,0])\n",
- "normal_dist = np.random.normal(loc=0,scale=significant_thrs,size=n_of_cycles)\n",
- "\n",
- "NA_distribution(MAIN['Nonadditivity'])"
+ "print(len(df.iloc[:,0]), 'compounds')\n",
+ "print(len(MAIN.iloc[:,0]), 'cycles')\n",
+ "print(len(MAIN_log1.iloc[:,0]), 'cycles with significant NA score', '(',round(len(MAIN_log1.iloc[:,0])/len(MAIN.iloc[:,0])*100,1), '% )')\n",
+ "print(len(PC_log1['Compound_ID'].value_counts()), 'unique compounds show significant NA shift', '(',round(len(PC_log1['Compound_ID'])/len(df.iloc[:,0])*100,1), '% )')\n",
+ "print(len(PC_log2['Compound_ID'].value_counts()), 'unique compounds show strong NA', '(',round(len(PC_log2['Compound_ID'])/len(df.iloc[:,0])*100,1), '% )')"
]
},
{
@@ -791,39 +3605,528 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "395 compounds\n"
- ]
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Compound_ID | \n",
+ " SMILES | \n",
+ " Series | \n",
+ " Property | \n",
+ " Operator | \n",
+ " Measured | \n",
+ " Nonadd_pC | \n",
+ " nOccurence | \n",
+ " Nonadd_SD | \n",
+ " Nonadd_abs | \n",
+ " CI | \n",
+ " CI_2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 862 | \n",
+ " CHEMBL1396862 | \n",
+ " O=S(=O)(c1ccccc1)N1CCC2(CCCN(Cc3ccncc3)C2)CC1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 7.1 | \n",
+ " 2.80 | \n",
+ " 1 | \n",
+ " 0.000000e+00 | \n",
+ " 2.80 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 863 | \n",
+ " CHEMBL1473753 | \n",
+ " O=S(=O)(c1ccccc1)N1CCC2(CCCN(c3ccncc3)C2)CC1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.4 | \n",
+ " -2.80 | \n",
+ " 1 | \n",
+ " 0.000000e+00 | \n",
+ " 2.80 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 864 | \n",
+ " CHEMBL1553056 | \n",
+ " CS(=O)(=O)N1CCC2(CCCN(c3ccncc3)C2)CC1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 7.1 | \n",
+ " 2.80 | \n",
+ " 1 | \n",
+ " 0.000000e+00 | \n",
+ " 2.80 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 865 | \n",
+ " CHEMBL1435702 | \n",
+ " CS(=O)(=O)N1CCC2(CCCN(Cc3ccncc3)C2)CC1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " -2.80 | \n",
+ " 1 | \n",
+ " 0.000000e+00 | \n",
+ " 2.80 | \n",
+ " 2.000000 | \n",
+ " 6.000000 | \n",
+ "
\n",
+ " \n",
+ " 612 | \n",
+ " CHEMBL1316759 | \n",
+ " COc1ncc2nc(C)c(=O)n(CCC#N)c2n1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 6.5 | \n",
+ " 2.80 | \n",
+ " 2 | \n",
+ " 3.100000e-07 | \n",
+ " 2.80 | \n",
+ " 1.414214 | \n",
+ " 4.242641 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 532 | \n",
+ " CHEMBL1554236 | \n",
+ " COCCn1c(=O)c(-c2cc(F)cc(F)c2)nc2cnc(N3CCNCC3)nc21 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.9 | \n",
+ " 0.66 | \n",
+ " 63 | \n",
+ " 7.800000e-01 | \n",
+ " 0.66 | \n",
+ " 0.251976 | \n",
+ " 0.755929 | \n",
+ "
\n",
+ " \n",
+ " 447 | \n",
+ " CHEMBL1434801 | \n",
+ " O=c1c(CCc2ccccc2)nc2cnc(N3CCNCC3)nc2n1C1CC1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.0 | \n",
+ " -0.50 | \n",
+ " 63 | \n",
+ " 9.900000e-01 | \n",
+ " 0.50 | \n",
+ " 0.251976 | \n",
+ " 0.755929 | \n",
+ "
\n",
+ " \n",
+ " 559 | \n",
+ " CHEMBL1405464 | \n",
+ " COc1cccc(Cn2c(=O)c(-c3cccs3)nc3cncnc32)c1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.6 | \n",
+ " 0.36 | \n",
+ " 64 | \n",
+ " 7.600000e-01 | \n",
+ " 0.36 | \n",
+ " 0.250000 | \n",
+ " 0.750000 | \n",
+ "
\n",
+ " \n",
+ " 531 | \n",
+ " CHEMBL1490528 | \n",
+ " COCCn1c(=O)c(-c2cccs2)nc2cnc(N3CCNCC3)nc21 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 5.7 | \n",
+ " 0.31 | \n",
+ " 67 | \n",
+ " 7.600000e-01 | \n",
+ " 0.31 | \n",
+ " 0.244339 | \n",
+ " 0.733017 | \n",
+ "
\n",
+ " \n",
+ " 428 | \n",
+ " CHEMBL1481510 | \n",
+ " O=c1c(-c2ccccc2)nc2cnc(N3CCNCC3)nc2n1C1CC1 | \n",
+ " NaN | \n",
+ " VALUE | \n",
+ " NaN | \n",
+ " 6.8 | \n",
+ " 0.97 | \n",
+ " 67 | \n",
+ " 8.400000e-01 | \n",
+ " 0.97 | \n",
+ " 0.244339 | \n",
+ " 0.733017 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
168 rows × 12 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Compound_ID SMILES Series \\\n",
+ "862 CHEMBL1396862 O=S(=O)(c1ccccc1)N1CCC2(CCCN(Cc3ccncc3)C2)CC1 NaN \n",
+ "863 CHEMBL1473753 O=S(=O)(c1ccccc1)N1CCC2(CCCN(c3ccncc3)C2)CC1 NaN \n",
+ "864 CHEMBL1553056 CS(=O)(=O)N1CCC2(CCCN(c3ccncc3)C2)CC1 NaN \n",
+ "865 CHEMBL1435702 CS(=O)(=O)N1CCC2(CCCN(Cc3ccncc3)C2)CC1 NaN \n",
+ "612 CHEMBL1316759 COc1ncc2nc(C)c(=O)n(CCC#N)c2n1 NaN \n",
+ ".. ... ... ... \n",
+ "532 CHEMBL1554236 COCCn1c(=O)c(-c2cc(F)cc(F)c2)nc2cnc(N3CCNCC3)nc21 NaN \n",
+ "447 CHEMBL1434801 O=c1c(CCc2ccccc2)nc2cnc(N3CCNCC3)nc2n1C1CC1 NaN \n",
+ "559 CHEMBL1405464 COc1cccc(Cn2c(=O)c(-c3cccs3)nc3cncnc32)c1 NaN \n",
+ "531 CHEMBL1490528 COCCn1c(=O)c(-c2cccs2)nc2cnc(N3CCNCC3)nc21 NaN \n",
+ "428 CHEMBL1481510 O=c1c(-c2ccccc2)nc2cnc(N3CCNCC3)nc2n1C1CC1 NaN \n",
+ "\n",
+ " Property Operator Measured Nonadd_pC nOccurence Nonadd_SD \\\n",
+ "862 VALUE NaN 7.1 2.80 1 0.000000e+00 \n",
+ "863 VALUE NaN 5.4 -2.80 1 0.000000e+00 \n",
+ "864 VALUE NaN 7.1 2.80 1 0.000000e+00 \n",
+ "865 VALUE NaN 6.0 -2.80 1 0.000000e+00 \n",
+ "612 VALUE NaN 6.5 2.80 2 3.100000e-07 \n",
+ ".. ... ... ... ... ... ... \n",
+ "532 VALUE NaN 5.9 0.66 63 7.800000e-01 \n",
+ "447 VALUE NaN 5.0 -0.50 63 9.900000e-01 \n",
+ "559 VALUE NaN 5.6 0.36 64 7.600000e-01 \n",
+ "531 VALUE NaN 5.7 0.31 67 7.600000e-01 \n",
+ "428 VALUE NaN 6.8 0.97 67 8.400000e-01 \n",
+ "\n",
+ " Nonadd_abs CI CI_2 \n",
+ "862 2.80 2.000000 6.000000 \n",
+ "863 2.80 2.000000 6.000000 \n",
+ "864 2.80 2.000000 6.000000 \n",
+ "865 2.80 2.000000 6.000000 \n",
+ "612 2.80 1.414214 4.242641 \n",
+ ".. ... ... ... \n",
+ "532 0.66 0.251976 0.755929 \n",
+ "447 0.50 0.251976 0.755929 \n",
+ "559 0.36 0.250000 0.750000 \n",
+ "531 0.31 0.244339 0.733017 \n",
+ "428 0.97 0.244339 0.733017 \n",
+ "\n",
+ "[168 rows x 12 columns]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "ambiguous_compounds = PC.loc[PC['Nonadd_abs'] > PC['CI']]\n",
- "print(len(ambiguous_compounds.iloc[:,0]), 'compounds')\n",
- "outliers = ambiguous_compounds[['SMILES']]\n",
- "\n",
- "#outliers.to_csv(my_path+'outliers.csv', index = False)"
+ "ambiguous_compounds = PC.loc[PC['Nonadd_pC'].abs() > PC['CI']]\n",
+ "ambiguous_compounds"
]
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "\n",
+ "image/png": "\n",
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
"text/plain": [
- ""
+ ""
]
},
+ "execution_count": 16,
"metadata": {},
- "output_type": "display_data"
+ "output_type": "execute_result"
}
],
"source": [
@@ -831,19 +4134,7172 @@
"SMILES = list(ambiguous_compounds['SMILES'])\n",
"strangest = SMILES[-2]\n",
"\n",
- "display(Chem.MolFromSmiles(strangest))"
+ "Chem.MolFromSmiles(strangest)"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "\n",
+ "image/svg+xml": [
+ "\n",
+ "\n",
+ "