Predicting Drug Solubility with Machine Learning


What drug solubility means:
Solubility refers to the maximum amount of a drug that dissolves, at saturation, in a given quantity of solvent at a given temperature (and, for gases, at a given pressure); it is an important measure of how readily a drug dissolves. Solubility is commonly expressed as the maximum number of grams of solute dissolved in 100 g of solvent (or in 100 g or 100 ml of solution) at a given temperature. For example, the solubility of caffeine in water at 20℃ is 1.46%, meaning the solution becomes saturated when 1.46 g of caffeine is dissolved in 100 ml of water. Solubility can also be expressed as a molar concentration in mol/L.

The Chinese Pharmacopoeia (2000 edition) uses seven descriptive terms for drug solubility: very soluble, freely soluble, soluble, sparingly soluble, slightly soluble, very slightly soluble, and practically insoluble or insoluble. These terms only indicate approximate solubility; the exact solubility is usually stated as one part of solute (1 g or 1 ml) dissolving in a given number of milliliters of solvent, and the pharmacopoeia records these values under each drug's monograph. Solubility data can be found in the Merck Index, national pharmacopoeias, and specialized physicochemical handbooks; for drugs with no published value, solubility can be measured experimentally. A drug's solubility is not simply "the larger the better" or "the smaller the better"; the appropriate value depends on the actual application.
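As a quick worked example of these units, the sketch below converts the caffeine figure above (1.46 g per 100 ml of water at 20℃) into a molar concentration. The molar mass of caffeine (about 194.19 g/mol for C8H10N4O2) is supplied here only for illustration and is not taken from the original post.

# Illustrative unit conversion: g per 100 ml -> mol/L (assumes caffeine molar mass ~194.19 g/mol)
caffeine_molar_mass = 194.19              # g/mol, C8H10N4O2
solubility_g_per_100ml = 1.46             # caffeine in water at 20 C
solubility_mol_per_l = solubility_g_per_100ml * 10 / caffeine_molar_mass
print(round(solubility_mol_per_l, 4))     # -> 0.0752 mol/L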

Below, machine learning is used to build regression models that predict drug solubility.
The first algorithm is a random forest. It performs very well: with 1000 trees the r^2 score reaches 0.9116131032510899 (the run log is reproduced in the docstring below).

# -*- coding: utf-8 -*-
"""
Created on Tue Sep 4 09:39:29 2018
@author: 231469242@qq.com; WeChat official account: pythonEducation
Random forest with 100 trees:
RF RMS 0.6057144333891424
('RF r^2 score', 0.9114913707148344)
Random forest with 1000 trees:
('RF RMS', 0.5891965582822096)
('RF r^2 score', 0.9116131032510899)
"""
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from rdkit.Chem.EState import Fingerprinter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import gaussian_process
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel, RBF
# Descriptor-calculation function: full RDKit descriptor list plus EState fingerprint
def get_fps(mol):
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    ds = np.asarray(calc.CalcDescriptors(mol))
    arr = Fingerprinter.FingerprintMol(mol)[0]
    return np.append(arr, ds)
# Read in the data (space-separated file with 'smiles' and 'solubility' columns)
data = pd.read_table('smi_sol.dat', sep=' ')
data.to_excel("all_data.xlsx")
# Add molecule objects and descriptor columns
data['Mol'] = data['smiles'].apply(Chem.MolFromSmiles)
data['Descriptors'] = data['Mol'].apply(get_fps)
# Inspect the first five rows
data.head(5)
# Convert descriptors and labels to numpy arrays and export them for later reuse
X = np.array(list(data['Descriptors']))
df_x = pd.DataFrame(X)
df_x.to_excel("data.xlsx")
y = data['solubility'].values
df_y = pd.DataFrame(y)
df_y.to_excel("label.xlsx")
st = StandardScaler()
X = st.fit_transform(X)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
'''
#Gaussian process regression
kernel=1.0 * RBF(length_scale=1) + WhiteKernel(noise_level=1)
gp = gaussian_process.GaussianProcessRegressor(kernel=kernel,n_restarts_optimizer=0,normalize_y=True)
gp.fit(X_train, y_train)
y_pred, sigma = gp.predict(X_test, return_std=True)
rms = (np.mean((y_test - y_pred)**2))**0.5
#s = np.std(y_test -y_pred)
print ("GP RMS", rms)
print ("GP r^2 score",r2_score(y_test,y_pred))
'''
# Random forest model (100 trees; the 0.9116 in the docstring was obtained with n_estimators=1000)
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
# Note: the metrics below are computed over the full dataset (training + test), as in the run log above
y_pred = rf.predict(X)
rms = (np.mean((y - y_pred)**2))**0.5
print("RF RMS", rms)
print("RF r^2 score", r2_score(y, y_pred))
plt.scatter(y_train,rf.predict(X_train), label = 'Train', c='blue')
plt.title('RF Predictor')
plt.xlabel('Measured Solubility')
plt.ylabel('Predicted Solubility')
plt.scatter(y_test,rf.predict(X_test),c='lightgreen', label='Test', alpha = 0.8)
plt.legend(loc=4)
plt.savefig('RF Predictor.png', dpi=600)
plt.show()
df_validation=pd.DataFrame({"test":y,"predict":y_pred})
df_validation.to_excel("validation.xlsx")
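The run log in the docstring above was produced by scoring on the full dataset, which is what the code does. For comparison, a minimal sketch of scoring the same fitted forest only on the 30% held-out test split (reusing the variables already defined above) could look like this:

# Score the fitted random forest on the held-out test split only
y_test_pred = rf.predict(X_test)
test_rms = (np.mean((y_test - y_test_pred)**2))**0.5
print("RF test RMS", test_rms)
print("RF test r^2 score", r2_score(y_test, y_test_pred))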
The second algorithm is Gaussian process regression. With the tuned kernel and normalize_y=True, its r^2 score reaches about 0.91 (see the run log in the docstring below), comparable to the random forest. The Python source code follows.

# -*- coding: utf-8 -*-
"""
Created on Tue Sep 4 15:53:57 2018
@author: 231469242@qq.com; WeChat official account: pythonEducation
Default parameters:
GP RMS label 2.98575
GP r^2 score -1.26973055888
With the kernel changed to kernel=1.0 * RBF(length_scale=1) + WhiteKernel(noise_level=1):
GP RMS label 0.651688
GP r^2 score 0.891869923330719
With the modified kernel and normalize_y=True:
GP RMS label 0.597042
GP r^2 score 0.9092436176966117
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import gaussian_process
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel, RBF
# Read back the descriptor matrix and labels exported by the first script (index_col=0 drops the saved index column)
data = pd.read_excel('data.xlsx', index_col=0)
y = pd.read_excel('label.xlsx', index_col=0)
st = StandardScaler()
X = st.fit_transform(data)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# (Note: fitting the Gaussian process can freeze a low-memory work computer)
kernel = 1.0 * RBF(length_scale=1) + WhiteKernel(noise_level=1)
#gp = gaussian_process.GaussianProcessRegressor()
#gp = gaussian_process.GaussianProcessRegressor(kernel=kernel)
gp = gaussian_process.GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=0, normalize_y=True)
gp.fit(X_train, y_train)
y_pred, sigma = gp.predict(X_test, return_std=True)
rms = (np.mean((y_test - y_pred)**2))**0.5
#s = np.std(y_test -y_pred)
print("GP RMS", rms)
print("GP r^2 score", r2_score(y_test, y_pred))
plt.scatter(y_train, gp.predict(X_train), label='Train', c='blue')
plt.title('GP Predictor')
plt.xlabel('Measured Solubility')
plt.ylabel('Predicted Solubility')
plt.scatter(y_test, gp.predict(X_test), c='lightgreen', label='Test', alpha=0.8)
plt.legend(loc=4)
plt.savefig('GP Predictor.png', dpi=300)
plt.show()
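Since the run log in the docstring shows how strongly the kernel choice affects the score, it can help to check which hyperparameters the optimizer converged to after fitting. A minimal sketch using the fitted gp object from above (these are standard scikit-learn attributes, not part of the original post):

# Inspect the optimized kernel hyperparameters (RBF length scale, noise level) and model evidence
print("Optimized kernel:", gp.kernel_)
print("Log marginal likelihood:", gp.log_marginal_likelihood(gp.kernel_.theta))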
