Cross-Validation - Machine Learning

Cross-Validation

Preliminaries

# Load libraries
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Load Digits Dataset

# Fetch the digits dataset that ships with scikit-learn
digits = datasets.load_digits()

# Pull out the features matrix (X) and the target vector (y) in one step
X, y = digits.data, digits.target

Create Pipeline

# Build the two pipeline steps: a feature standardizer and a
# logistic regression classifier (both with default settings)
standardizer, logit = StandardScaler(), LogisticRegression()

# Chain the steps so the standardizer runs before the classifier
pipeline = make_pipeline(standardizer, logit)

Create k-Fold Cross-Validation

# Configure the k-fold cross-validation splitter
kf = KFold(
    n_splits=10,     # number of folds
    shuffle=True,    # shuffle the observations before splitting
    random_state=1,  # fixed seed so the shuffle is reproducible
)

Conduct k-Fold Cross-Validation

# Score the pipeline on every fold of the k-fold split
cv_results = cross_val_score(
    estimator=pipeline,  # standardizer + logistic regression
    X=X,                 # feature matrix
    y=y,                 # target vector
    cv=kf,               # cross-validation splitter
    scoring="accuracy",  # performance metric computed on each fold
    n_jobs=-1,           # use all CPU cores
)

Calculate Mean Performance Score

# Average the per-fold scores into a single performance figure
np.mean(cv_results)
0.96493171942892597