﻿ scikit-learn的主要模块和基本使用-一起大数据

# scikit-learn的主要模块和基本使用

## 引言

scikit-learn的实现使用了NumPy中的arrays，所以，我们要使用NumPy来载入csv文件。

``````import numpy as np
import urllib
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
raw_data = urllib.urlopen(url)
# load the CSV file as a numpy matrix
# separate the data from the target attributes
X = dataset[:,0:7]
y = dataset[:,8]``````

## 数据归一化(Data Normalization)

``````from sklearn import preprocessing
# normalize the data attributes
normalized_X = preprocessing.normalize(X)
# standardize the data attributes
standardized_X = preprocessing.scale(X)``````

## 特征选择(Feature Selection)

``````from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# display the relative importance of each attribute
print(model.feature_importances_)``````

## 算法的使用

scikit-learn实现了机器学习的大部分基础算法，让我们快速了解一下。

### 逻辑回归

``````from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))``````

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, penalty=l2, random_state=None, tol=0.0001)
precision recall f1-score support

``````   0.0       0.79      0.89      0.84       500
1.0       0.74      0.55      0.63       268
``````

avg / total 0.77 0.77 0.77 768

[[447 53]
[120 148]]

### 朴素贝叶斯

``````from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))``````

GaussianNB()
precision recall f1-score support

``````   0.0       0.80      0.86      0.83       500
1.0       0.69      0.60      0.64       268
``````

avg / total 0.76 0.77 0.76 768

[[429 71]
[108 160]]

### k近邻

k近邻算法常常被用作是分类算法一部分，比如可以用它来评估特征，在特征选择上我们可以用到它。

``````from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))``````

KNeighborsClassifier(algorithm=auto, leaf_size=30, metric=minkowski,
n_neighbors=5, p=2, weights=uniform)
precision recall f1-score support

``````   0.0       0.82      0.90      0.86       500
1.0       0.77      0.63      0.69       268
``````

avg / total 0.80 0.80 0.80 768

[[448 52]
[ 98 170]]

### 决策树

``````from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))``````

DecisionTreeClassifier(compute_importances=None, criterion=gini,
max_depth=None, max_features=None, min_density=None,
min_samples_leaf=1, min_samples_split=2, random_state=None,
splitter=best)
precision recall f1-score support

``````   0.0       1.00      1.00      1.00       500
1.0       1.00      1.00      1.00       268
``````

avg / total 1.00 1.00 1.00 768

[[500 0]
[ 0 268]]

### 支持向量机

SVM是非常流行的机器学习算法，主要用于分类问题，如同逻辑回归问题，它可以使用一对多的方法进行多类别的分类。

``````from sklearn import metrics
from sklearn.svm import SVC
# fit a SVM model to the data
model = SVC()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))``````

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
kernel=rbf, max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False)
precision recall f1-score support

``````   0.0       1.00      1.00      1.00       500
1.0       1.00      1.00      1.00       268
``````

avg / total 1.00 1.00 1.00 768

[[500 0]
[ 0 268]]

## 如何优化算法参数

``````import numpy as np
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)``````

GridSearchCV(cv=None,
estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, solver=auto, tol=0.001),
estimator__alpha=1.0, estimator__copy_X=True,
estimator__fit_intercept=True, estimator__max_iter=None,
estimator__normalize=False, estimator__solver=auto,
estimator__tol=0.001, fit_params={}, iid=True, loss_func=None,
n_jobs=1,
param_grid={‘alpha’: array([ 1.00000e+00, 1.00000e-01, 1.00000e-02, 1.00000e-03,
1.00000e-04, 0.00000e+00])},
pre_dispatch=2*n_jobs, refit=True, score_func=None, scoring=None,
verbose=0)
0.282118955686
1.0

``````import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.grid_search import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)``````

RandomizedSearchCV(cv=None,
estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, solver=auto, tol=0.001),
estimator__alpha=1.0, estimator__copy_X=True,
estimator__fit_intercept=True, estimator__max_iter=None,
estimator__normalize=False, estimator__solver=auto,
estimator__tol=0.001, fit_params={}, iid=True, n_iter=100,
n_jobs=1,
param_distributions={‘alpha’: