scikit-learn
Machine learning in practice follows a consistent workflow: prepare features and labels, split the data, train a model, evaluate performance, and iterate. scikit-learn provides a unified API for all these steps.
import numpy as np import pandas as pd from sklearn.model_selection import train_test_split, cross_val_score from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score, classification_report, roc_auc_score) from sklearn.pipeline import Pipeline
Most ML algorithms work better when numeric features are on similar scales, and categorical features must be converted to numbers. Failing to scale can cause models like logistic regression and SVM to give disproportionate weight to large-magnitude features. Understanding why we scale matters: gradient-based algorithms converge faster when features are normalized, and distance-based algorithms (KNN, SVM) treat all features equally only when they are on the same scale.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# --- StandardScaler: center each column to zero mean, rescale to unit variance.
# The safest default for gradient- and distance-based algorithms.
scaler = StandardScaler()
data = np.array([[1000, 2], [2000, 4], [1500, 3], [3000, 5]])
scaled = scaler.fit_transform(data)
print("StandardScaler:")
print(f" Original: mean={data.mean(axis=0)}, std={data.std(axis=0).round(1)}")
print(f" Scaled: mean={scaled.mean(axis=0).round(2)}, std={scaled.std(axis=0).round(2)}")

# --- MinMaxScaler: squash every column into the [0, 1] interval.
mm_scaler = MinMaxScaler()
mm_scaled = mm_scaler.fit_transform(data)
print(f"\nMinMaxScaler range: [{mm_scaled.min()}, {mm_scaled.max()}]")

# --- OneHotEncoder: one binary indicator column per category.
# sparse_output=False returns a dense ndarray instead of a sparse matrix.
categories = np.array([["Red"], ["Blue"], ["Green"], ["Red"]])
ohe = OneHotEncoder(sparse_output=False)
encoded = ohe.fit_transform(categories)
print(f"\nOneHotEncoder categories: {ohe.categories_[0]}")
print(encoded)

# --- LabelEncoder: integer codes. Fine for tree models; for linear models
# the implied ordering (Large < Medium < Small, alphabetical) is misleading.
le = LabelEncoder()
labels = le.fit_transform(["Small", "Medium", "Large", "Small"])
print(f"\nLabelEncoder: {labels}")
Always hold out data that the model has never seen for evaluation. A typical split is 80% train / 20% test.
# Synthetic housing data with a known linear ground truth plus Gaussian noise,
# so the fitted model's coefficients can be checked against the true ones.
np.random.seed(42)  # fixed seed -> the dataset is reproducible
n = 300

# Draw feature columns one at a time; the ORDER of these random calls
# matters for reproducibility, so it mirrors the original construction.
sq_ft = np.random.uniform(800, 3000, n)
bedrooms = np.random.randint(1, 6, n)
age = np.random.uniform(0, 50, n)
df = pd.DataFrame({"sq_ft": sq_ft, "bedrooms": bedrooms, "age": age})

# price = 50 + 0.12*sq_ft + 15*bedrooms - 0.8*age + N(0, 20) noise
df["price"] = 50 + 0.12 * df["sq_ft"] + 15 * df["bedrooms"] - 0.8 * df["age"] + np.random.normal(0, 20, n)

X = df[["sq_ft", "bedrooms", "age"]]
y = df["price"]

# 80/20 hold-out split; random_state pins the shuffle so results repeat.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train: {X_train.shape[0]} rows, Test: {X_test.shape[0]} rows")
# Fit ordinary least squares on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions and evaluation on the held-out test set.
y_pred = lr.predict(X_test)
# FIX: mean_squared_error's `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the original call crashes on
# current versions. Taking the square root explicitly works everywhere.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.3f}")

# Coefficients — should land near the generating values used to build
# the synthetic data (0.12 per sq_ft, +15 per bedroom, -0.8 per year).
for name, coef in zip(X.columns, lr.coef_):
    print(f" {name:10}: {coef:+.4f}")
# Binary classification: will the customer churn?
# Label construction: 1 when price is above the median, 0 otherwise —
# splitting at the median guarantees a balanced 50/50 target.
median_price = df["price"].median()
df["churn"] = (df["price"] > median_price).astype(int)

X = df[["sq_ft", "bedrooms", "age"]]
y = df["churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# max_iter raised from the default (100) so the solver fully converges
# on unscaled features.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred))
# A shallow decision tree on the same churn split; max_depth=4 caps
# complexity so the tree cannot memorize the training data.
tree = DecisionTreeClassifier(max_depth=4, random_state=42)
tree.fit(X_train, y_train)

y_pred_tree = tree.predict(X_test)
print(f"Tree Accuracy: {accuracy_score(y_test, y_pred_tree):.3f}")

# Feature importance: Gini-based importances, normalized to sum to 1.
for feature, importance in zip(X.columns, tree.feature_importances_):
    print(f" {feature:10}: {importance:.3f}")
Deeper trees fit the training data more closely but generalize worse. To control this, limit max_depth or use cross-validation to find the right complexity.
Cross-validation gives a more robust performance estimate by averaging over multiple train/test splits.
# 5-fold cross-validation: the logistic model is refit 5 times, each time
# scored on the held-out fold, giving a spread instead of a single number.
cv_scores = cross_val_score(log_reg, X, y, cv=5, scoring="accuracy")
print(f"CV Accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")
print(f"Fold scores: {cv_scores}")
Pipelines chain preprocessing steps and models together, preventing data leakage and simplifying deployment.
# Chain scaling and the classifier into one estimator. Inside CV the scaler
# is fit on training folds only, so test-fold statistics never leak into
# preprocessing.
steps = [
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000)),
]
pipe = Pipeline(steps)

# The pipeline behaves exactly like a single estimator.
pipe.fit(X_train, y_train)
y_pred_pipe = pipe.predict(X_test)
print(f"Pipeline Accuracy: {accuracy_score(y_test, y_pred_pipe):.3f}")

# Cross-validating the WHOLE pipeline re-fits the scaler inside each fold.
cv_pipe = cross_val_score(pipe, X, y, cv=5, scoring="accuracy")
print(f"Pipeline CV: {cv_pipe.mean():.3f}")
Tree-based ensemble methods are among the most powerful out-of-the-box algorithms. They work by combining many decision trees, each trained slightly differently, so that the ensemble's errors cancel out. The two main approaches differ in how they combine trees.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Random Forest: many independent trees, each fit in parallel on a random
# subset of the data; predictions are majority votes. Averaging across
# trees reduces variance (overfitting).
rf = RandomForestClassifier(
    n_estimators=200,    # number of trees in the forest
    max_depth=6,         # cap depth so individual trees stay simple
    min_samples_leaf=5,  # every leaf must cover at least 5 samples
    random_state=42,
)
rf.fit(X_train, y_train)
print(f"Random Forest Accuracy: {rf.score(X_test, y_test):.3f}")

# Gradient Boosting: trees are added one at a time, each fit to the
# residual errors of the ensemble so far. Sequential correction reduces
# bias (underfitting).
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,   # shrinkage: smaller steps are more robust but need more trees
    max_depth=4,
    random_state=42,
)
gb.fit(X_train, y_train)
print(f"Gradient Boosting Accuracy: {gb.score(X_test, y_test):.3f}")
Gradient boosting is more sensitive to its hyperparameters than random forests — especially learning_rate, n_estimators, and max_depth. In practice, try both and compare with cross-validation. For very large datasets, consider HistGradientBoostingClassifier (scikit-learn) or XGBoost/LightGBM libraries.
Model performance depends heavily on hyperparameters (settings you choose before training). GridSearchCV automates the search by trying every combination of parameters and selecting the one with the best cross-validation score.
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter; every combination will be tried.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 8],
    "min_samples_leaf": [1, 5, 10],
}

# 3 x 3 x 3 = 27 combinations, each scored with 5-fold CV -> 135 model fits.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,  # fan the fits out across all CPU cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV accuracy: {grid_search.best_score_:.3f}")
print(f"Test accuracy: {grid_search.score(X_test, y_test):.3f}")

# GridSearchCV refits the winning configuration on the full training set,
# so best_estimator_ is ready to use directly.
best_model = grid_search.best_estimator_
Accuracy alone can be misleading, especially with imbalanced classes. The confusion matrix shows exactly where your model makes mistakes, and the ROC curve visualizes the tradeoff between true positive rate and false positive rate at every classification threshold.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt

# Confusion matrix: rows = true class, columns = predicted class, so the
# diagonal holds the correct predictions.
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Side-by-side figure: confusion matrix on the left, ROC curve on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left: Confusion matrix heatmap
disp = ConfusionMatrixDisplay(cm, display_labels=["No Churn", "Churn"])
disp.plot(ax=axes[0], cmap="Blues")
axes[0].set_title("Confusion Matrix")

# Right: ROC curve — needs scores, not hard labels, so use predict_proba.
y_prob = best_model.predict_proba(X_test)[:, 1]  # probability of positive class
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)  # area under the curve; 1.0 = perfect, 0.5 = random
axes[1].plot(fpr, tpr, color="#3776AB", linewidth=2, label=f"ROC curve (AUC = {roc_auc:.3f})")
# Diagonal = a classifier that guesses at random, for visual reference.
axes[1].plot([0, 1], [0, 1], "k--", alpha=0.5, label="Random classifier")
axes[1].set_xlabel("False Positive Rate")
axes[1].set_ylabel("True Positive Rate")
axes[1].set_title("ROC Curve")
axes[1].legend(loc="lower right")
plt.tight_layout()
plt.show()
Tree-based models can tell you which features contribute most to predictions. This is valuable for understanding what drives the outcome and for simplifying the model by removing unimportant features.
# FIX: removed `import seaborn as sns` — it was never used in this snippet
# (or anywhere in the chapter); everything below is pandas + matplotlib.

# Extract Gini-based feature importances from the tuned Random Forest,
# sorted ascending so the largest bar ends up on top of the barh chart.
importances = pd.Series(
    best_model.feature_importances_,
    index=X.columns
).sort_values(ascending=True)

# Horizontal bar chart: longer bar = more important feature.
fig, ax = plt.subplots(figsize=(8, 5))
importances.plot(kind="barh", ax=ax, color="#3776AB")
ax.set_xlabel("Feature Importance (Gini)")
ax.set_title("Which Features Drive the Prediction?")
plt.tight_layout()
plt.show()

# Permutation importance (model-agnostic and generally more reliable):
# shuffle one feature at a time and measure the drop in test-set score.
from sklearn.inspection import permutation_importance
perm_imp = permutation_importance(best_model, X_test, y_test,
                                  n_repeats=10, random_state=42)
perm_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": perm_imp.importances_mean,
    "Std": perm_imp.importances_std
}).sort_values("Importance", ascending=False)
print(perm_df)
feature_importances_ (Gini/impurity-based) can be biased toward high-cardinality features and features with many possible split points. Permutation importance measures the actual drop in model performance when each feature is randomly shuffled, which gives a more honest assessment of each feature's contribution. Use permutation importance when you need to draw conclusions about feature relevance.
from sklearn.ensemble import RandomForestClassifier

# Head-to-head comparison: three classifiers scored on identical 5-fold
# splits, so differences reflect the models rather than the data shuffle.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=4),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=5),
}

for label, clf in models.items():
    fold_scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
    print(f"{label:25} CV Accuracy: {fold_scores.mean():.3f} +/- {fold_scores.std():.3f}")
Load the California housing dataset (use sklearn.datasets.fetch_california_housing(); the classic Boston housing dataset has been removed from scikit-learn). Split 80/20, fit a LinearRegression and a DecisionTreeRegressor (max_depth=5). Compare their RMSE and R2 on the test set. Which model performs better and why?
Build a Pipeline that applies StandardScaler followed by a RandomForestClassifier. Use 5-fold cross-validation to evaluate accuracy on any classification dataset. Then try changing n_estimators and max_depth to see how performance changes.
Using the churn classification dataset from Section 8.5, apply GridSearchCV to tune a GradientBoostingClassifier over learning_rate (0.01, 0.1, 0.2), n_estimators (50, 100, 200), and max_depth (3, 5, 7). Print the best parameters and plot the confusion matrix and ROC curve for the best model on the test set.
Load the California housing dataset (sklearn.datasets.fetch_california_housing()). Use StandardScaler in a pipeline with a RandomForestRegressor. After fitting, plot both Gini-based and permutation-based feature importance charts side by side. Do the rankings agree? Which features are most important for predicting house prices?