Sklearn快速指南

2017年9月1日 - 花瓣数据

Scikit-learn

Scikit-learn 是一个开源的 Python 库,它通过统一的接口实现了一系列机器学习、预处理、交叉验证和可视化算法。

一个基础的例子

In [1]:
# Quick end-to-end example: classify iris samples with k-nearest neighbours.
from sklearn import neighbors, datasets, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the iris dataset; keep only the first two feature columns.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Split into training and test sets (fixed random_state for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

# Standardize the features: fit the scaler on the training set only,
# then apply the same fitted scaler to the test set.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train a 5-nearest-neighbours classifier.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict the test set and evaluate the accuracy of the predictions.
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)
Out[1]:
0.63157894736842102

加载数据——同样可以参见NumPy和Pandas

In [2]:
import numpy as np

# Feature matrix: 10 samples x 5 features of uniform random values.
X = np.random.random(size=(10, 5))
# Target vector: one 'M'/'F' label per sample.
y = np.array(list('MMFFMFMMFF'))
# Zero out every entry below 0.7 so that most of the matrix is empty.
X = np.where(X < 0.7, 0, X)
print(X)
print(y)
[[ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.85899326]
 [ 0.          0.          0.          0.84703891  0.        ]
 [ 0.          0.          0.          0.99939435  0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.78877465  0.          0.          0.70542769  0.        ]
 [ 0.          0.          0.78805685  0.90289882  0.        ]
 [ 0.          0.90173673  0.7551219   0.78177544  0.79135819]
 [ 0.          0.          0.          0.          0.        ]]
['M' 'M' 'F' 'F' 'M' 'F' 'M' 'M' 'F' 'F']

分割训练集和测试集

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Print each part of the split with its caption.
for caption, part in (
    ("训练特征:", X_train),
    ("测试特征:", X_test),
    ("训练目标:", y_train),
    ("测试目标", y_test),
):
    print(caption, part)
训练特征: [[ 0.          0.          0.          0.          0.        ]
 [ 0.          0.90173673  0.7551219   0.78177544  0.79135819]
 [ 0.          0.          0.78805685  0.90289882  0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.78877465  0.          0.          0.70542769  0.        ]
 [ 0.          0.          0.          0.          0.85899326]
 [ 0.          0.          0.          0.          0.        ]]
测试特征: [[ 0.          0.          0.          0.99939435  0.        ]
 [ 0.          0.          0.          0.84703891  0.        ]
 [ 0.          0.          0.          0.          0.        ]]
训练目标: ['F' 'F' 'M' 'F' 'M' 'F' 'M']
测试目标 ['M' 'F' 'M']

预处理数据

标准化

In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features and transform them in one step.
scaler = StandardScaler()
standardized_X = scaler.fit_transform(X_train)
# Reuse the already-fitted scaler on the test features.
standardized_X_test = scaler.transform(X_test)

规范化

In [9]:
from sklearn.preprocessing import Normalizer

# Fit the normalizer on the training features and transform them in one step.
scaler = Normalizer()
normalized_X = scaler.fit_transform(X_train)
# Apply the same normalizer to the test features.
normalized_X_test = scaler.transform(X_test)

二值化

In [10]:
from sklearn.preprocessing import Binarizer

# Binarize the full feature matrix using a threshold of 0.0.
binarizer = Binarizer(threshold=0.0)
binary_X = binarizer.fit_transform(X)

编码分类特征

In [7]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of class labels in y, then replace y with its integer codes.
enc = LabelEncoder()
enc.fit(y)
y = enc.transform(y)
y
Out[7]:
array([1, 1, 0, 0, 1, 0, 1, 1, 0, 0], dtype=int64)

填补缺失值

In [11]:
# Replace missing values (encoded as 0 in this data) with the column mean.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement `sklearn.impute.SimpleImputer` (available since 0.20) always
# imputes column-wise, so the old `axis=0` argument is not needed.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    # Fallback for scikit-learn < 0.20, where only the old class exists.
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values=0, strategy='mean', axis=0)
else:
    imp = SimpleImputer(missing_values=0, strategy='mean')
imp.fit_transform(X_train)
Out[11]:
array([[ 0.78877465,  0.90173673,  0.77158937,  0.79670065,  0.82517572],
       [ 0.78877465,  0.90173673,  0.7551219 ,  0.78177544,  0.79135819],
       [ 0.78877465,  0.90173673,  0.78805685,  0.90289882,  0.82517572],
       [ 0.78877465,  0.90173673,  0.77158937,  0.79670065,  0.82517572],
       [ 0.78877465,  0.90173673,  0.77158937,  0.70542769,  0.82517572],
       [ 0.78877465,  0.90173673,  0.77158937,  0.79670065,  0.85899326],
       [ 0.78877465,  0.90173673,  0.77158937,  0.79670065,  0.82517572]])

生成多项式特征

In [12]:
from sklearn.preprocessing import PolynomialFeatures

# Generate polynomial feature terms of X with degree 5.
poly = PolynomialFeatures(degree=5)
poly.fit_transform(X)
Out[12]:
array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.46767997],
       ..., 
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.90173673, ...,  0.30288889,
         0.30660161,  0.31035983],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

创建模型

监督学习估计器

In [15]:
# Linear regression.
# The `normalize` constructor argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, where passing it raises a TypeError. Build the estimator
# without it; if feature scaling is wanted, apply a preprocessing scaler
# (e.g. StandardScaler) to the data before fitting.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

# Support vector machine (SVM) classifier with a linear kernel.
from sklearn.svm import SVC
svc = SVC(kernel='linear')

# Gaussian naive Bayes classifier.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

# K-nearest-neighbours classifier using 5 neighbours.
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

非监督学习估计器

In [16]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# K-means clustering with 3 clusters and a fixed random_state.
k_means = KMeans(n_clusters=3, random_state=0)

# Principal component analysis (PCA), configured with n_components=0.95.
pca = PCA(n_components=0.95)

训练模型

监督学习

In [17]:
# Train the two classifiers on the training split.
knn.fit(X_train, y_train)
# The linear regression is trained on the full X and the encoded y.
lr.fit(X, y)
# Kept last so that the fitted SVC is the value displayed for this cell.
svc.fit(X_train, y_train)
Out[17]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

非监督学习

In [18]:
# Fit the k-means model on the training features.
k_means.fit(X_train)
# Fit PCA on the training features and keep the transformed data
# (despite its name, `pca_model` holds the result of fit_transform).
pca_model = pca.fit_transform(X_train)

预测数据

监督估计器

In [20]:
# Each assignment below overwrites y_pred; only the last value is kept.

# Class predictions from the SVC for two random samples with 5 features each.
random_samples = np.random.random((2, 5))
y_pred = svc.predict(random_samples)

# Predictions from the linear regression for the test features.
y_pred = lr.predict(X_test)

# Probability estimates from the KNN classifier for the test features.
y_pred = knn.predict_proba(X_test)

非监督估计器

In [21]:
# Predict a k-means cluster label for each test sample.
y_pred = k_means.predict(X_test)

评估模型的性能

分类器度量

In [24]:
# Classification metrics compare the true labels with a classifier's
# predicted labels. The previous step left k-means cluster ids in y_pred,
# so take fresh label predictions from the trained KNN classifier first.
y_pred = knn.predict(X_test)

# Accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

# Classification report
from sklearn.metrics import classification_report
classification_report(y_test, y_pred)

# Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

回归度量

In [25]:
# Regression metrics need numeric targets and numeric predictions.
# `y_true` was never defined in this guide, so derive it from the test labels
# with the fitted label encoder, and predict with the linear regression.
# NOTE(review): assumes y_test still holds the original string labels
# (the split was made before the labels were encoded) - confirm.
y_true = enc.transform(y_test)
y_pred = lr.predict(X_test)

# Mean absolute error
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true, y_pred)

# Mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true, y_pred)

# R² score
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)

聚类度量

In [26]:
# Clustering metrics compare ground-truth labels with predicted cluster ids.
# `y_true` was never defined in this guide; use the test labels as the ground
# truth and the k-means assignments for the same samples as the prediction.
y_true = y_test
y_pred = k_means.predict(X_test)

# Adjusted Rand index
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_true, y_pred)

# Homogeneity
from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_pred)

# V-measure
# Call the imported function directly: no `metrics` module is imported here.
from sklearn.metrics import v_measure_score
v_measure_score(y_true, y_pred)

交叉验证

In [27]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `cross_val_score` is provided by `sklearn.model_selection`, the module
# already used for `train_test_split` earlier in this guide.
from sklearn.model_selection import cross_val_score

# Score the linear regression with 2-fold cross validation on the full data.
cross_val_score(lr, X, y, cv=2)

文档下载:Sklearn快速指南.pdf