Let's see how to build the learning curve (by increasing the size of the training set) and the validation curve (by varying the values of a free parameter).
Both can be used to analyze the bias-variance trade-off.
import matplotlib.pyplot as plt
# Datasets
from sklearn.datasets import load_digits, load_iris
# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve, validation_curve
import numpy as np
# Select 2 classes from the digits dataset
X, y = load_digits(n_class=2, return_X_y=True)
# Create a list of 10 equally spaced points from 0.1 to 1
sizes = np.linspace(0.1, 1, 10)
algs = {'Logistic Regression': LogisticRegression(solver='lbfgs'),
        'SVM - Gaussian': SVC()}
for name, alg in algs.items():
    train_sizes, train_scores, test_scores = learning_curve(
        alg, X, y, cv=3, train_sizes=sizes)
    plot_learning_curve(train_sizes, train_scores, test_scores, name)
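The loop above assumes a helper plot_learning_curve defined elsewhere in the material. A minimal sketch of such a helper, reusing the numpy and matplotlib imports above and plotting the mean train and cross-validation scores against the training-set sizes, could look like this:

def plot_learning_curve(train_sizes, train_scores, test_scores, title):
    # Average the scores over the cross-validation folds
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, 'o-', label='Training score')
    plt.plot(train_sizes, test_mean, 'o-', label='Cross-validation score')
    plt.title(title)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.legend(loc='best')
    plt.show()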
X, y = load_digits(return_X_y=True)
# Create exponentially spaced points from 10^-6 to 10^-1
param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name="gamma", param_range=param_range,
    cv=5, scoring="accuracy", n_jobs=1)
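To actually draw the validation curve, the mean scores can be plotted against the parameter range on a logarithmic axis; a minimal sketch, reusing the arrays returned above:

plt.semilogx(param_range, np.mean(train_scores, axis=1), 'o-', label='Training score')
plt.semilogx(param_range, np.mean(test_scores, axis=1), 'o-', label='Cross-validation score')
plt.xlabel('gamma')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.show()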
In some situations we want to vary more than one free parameter of the algorithm.
GridSearch performs a search combining all possible values of these parameters.
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

X, y = load_iris(return_X_y=True)
# Range of values for C
C_range = np.logspace(-2, 10, 13)
# Range of values for gamma
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X, y)
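The result reported below is presumably produced by printing the attributes that GridSearchCV exposes after fitting, for example:

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))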
The best parameters are {'C': 1.0, 'gamma': 0.1} with a score of 0.97
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
alg = LogisticRegression(solver='lbfgs', multi_class='ovr')
accuracies = []
kf = KFold(n_splits=5)
for train, test in kf.split(X):
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]
    alg.fit(X_train, y_train)
    pred = alg.predict(X_test)
    accuracies.append(accuracy_score(y_test, pred))
print(accuracies)
[1.0, 0.9, 0.5, 0.9333333333333333, 0.6333333333333333]
print(np.mean(accuracies), np.std(accuracies))
0.7933333333333332 0.19252705437591536
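The same loop can be written in a single call with cross_val_score; a minimal sketch, reusing the alg, X, and y defined above and passing the same unshuffled KFold:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(alg, X, y, cv=KFold(n_splits=5))
print(scores.mean(), scores.std())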
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
alg = LogisticRegression(solver='lbfgs', multi_class='ovr')
accuracies = []
kf = StratifiedKFold(n_splits=5)
for train, test in kf.split(X, y):
    X_train = X[train]
    X_test = X[test]
    y_train = y[train]
    y_test = y[test]
    alg.fit(X_train, y_train)
    pred = alg.predict(X_test)
    accuracies.append(accuracy_score(y_test, pred))
print(accuracies)
[0.8666666666666667, 0.9666666666666667, 0.9333333333333333, 0.9333333333333333, 1.0]
print(np.mean(accuracies), np.std(accuracies))
0.9400000000000001 0.044221663871405324
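Note that, for classifiers, passing an integer cv to cross_val_score already uses stratified folds under the hood, so this stratified evaluation can also be written with the cross_val_score imported above:

scores = cross_val_score(alg, X, y, cv=5)  # StratifiedKFold by default for classifiers
print(scores.mean(), scores.std())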
from sklearn.model_selection import GroupKFold

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
    print("%s %s" % (train, test))
[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]
Evaluates each class individually.
We will call our class of interest the positive class.
$\text{precision} = \frac{TP}{TP + FP}$
Evaluates each class individually.
We will call our class of interest the positive class.
$\text{recall} = \frac{TP}{TP + FN}$
$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$
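All three metrics are available directly in scikit-learn. A minimal sketch on a toy set of labels (the label vectors here are hypothetical, just for illustration):

from sklearn.metrics import precision_score, recall_score, f1_score

y_true = [1, 0, 1, 1, 0, 1, 0, 0]  # hypothetical ground truth
y_pred = [1, 0, 1, 0, 0, 1, 1, 0]  # hypothetical predictions
print(precision_score(y_true, y_pred))  # TP / (TP + FP)
print(recall_score(y_true, y_pred))     # TP / (TP + FN)
print(f1_score(y_true, y_pred))         # harmonic mean of precision and recall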
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn import svm

X, y = make_classification(class_sep=0.5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5)
alg = svm.LinearSVC()
alg.fit(X_train, y_train)
y_score = alg.decision_function(X_test)
precision, recall, t = precision_recall_curve(y_test, y_score)
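To visualize the curve, the precision values can be plotted against the recall values returned above; a minimal sketch:

import matplotlib.pyplot as plt

plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()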