Identify the 47 Critical Features from 2,000 Variables
Critical Mission: Save the $30M Contract
MedTech Analytics' flagship model is overfitting catastrophically. With 2,000 features
and only 5,000 patient records, you must use advanced regularization techniques to
identify the truly important predictors and restore model generalization.
Success Criteria: Reduce features to under 50, achieve 85%+ cross-validated
accuracy, and prove the model generalizes to new hospitals.
Mission Progress Tracker
Task 1: Load High-Dimensional Data - 10 pts
Task 2: Baseline Model (Overfitting) - 15 pts
Task 3: Apply L1 Regularization (Lasso) - 20 pts
Task 4: Optimize with Elastic Net - 20 pts
Task 5: Cross-Validation & Selection - 20 pts
Task 6: Generate Feature Report - 15 pts
Task 1: Load High-Dimensional Medical Data
Load the patient dataset with 2,000 features and explore the dimensionality problem.
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
# Simulate high-dimensional medical data
np.random.seed(42)
n_samples = 5000
n_features = 2000
# Generate feature names
feature_names = [f"biomarker_{i}" for i in range(n_features)]
important_features = ['glucose', 'blood_pressure', 'age', 'bmi', 'cholesterol']
feature_names[:5] = important_features
# Create dataset with only ~50 truly relevant features
X = np.random.randn(n_samples, n_features)
# Make first 50 features informative
y = (X[:, :50] @ np.random.randn(50) + np.random.randn(n_samples) * 0.1) > 0
y = y.astype(int)
# TODO: Print data dimensions and class balance
print(f"Data shape: _______")
print(f"Features per sample ratio: _______")
print(f"Class distribution: {np.bincount(y)}")
Use X.shape and calculate n_features / n_samples
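If you want to check your answer, one possible completion of the TODO (a sketch reusing the variables already defined in this cell):
# Sketch: fill in the blanks from Task 1
print(f"Data shape: {X.shape}")                                    # (5000, 2000)
print(f"Features per sample ratio: {n_features / n_samples:.2f}")  # 2000 / 5000 = 0.40
print(f"Class distribution: {np.bincount(y)}")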
Output:
Ready to execute...
Task 2: Demonstrate Overfitting Problem
Train an unregularized model to see the overfitting disaster.
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
# Standardize features (important for regularization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
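# Why standardize? The L1/L2 penalties used later compare raw coefficient magnitudes
# across features, so a biomarker measured on a large scale would look artificially
# "cheap" to keep. Scaling every feature to mean 0 / variance 1 makes the penalty
# treat all 2,000 biomarkers equally.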
# Train unregularized logistic regression
baseline_model = LogisticRegression(penalty=None, max_iter=1000)  # use penalty='none' on scikit-learn < 1.2
baseline_model.fit(X_train_scaled, y_train)
# Evaluate on train and test
train_score = baseline_model.score(X_train_scaled, y_train)
test_score = baseline_model.score(X_test_scaled, y_test)
print("=" * 50)
print("BASELINE MODEL (No Regularization)")
print("=" * 50)
print(f"Training Accuracy: {train_score:.1%}")
print(f"Test Accuracy: {test_score:.1%}")
# TODO: Calculate the overfitting gap
overfitting_gap = _______
print(f"Overfitting Gap: {overfitting_gap:.1%}")
if overfitting_gap > 0.4:
print("\nšØ SEVERE OVERFITTING DETECTED!")
print("Model memorizing training data, not learning patterns!")
Overfitting gap = train_score - test_score
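A sketch of the TODO above. The optional cross-validation check reuses cross_val_score, which was already imported in Task 1; it is not required by the task and is kept to 3 folds because each fit on 2,000 features is slow:
# Sketch: the gap is simply train accuracy minus test accuracy
overfitting_gap = train_score - test_score

# Optional sanity check: 3-fold CV on the training split
cv_scores = cross_val_score(
    LogisticRegression(penalty=None, max_iter=1000),
    X_train_scaled, y_train, cv=3, scoring='accuracy'
)
print(f"3-fold CV accuracy: {cv_scores.mean():.1%} (+/- {cv_scores.std():.1%})")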
Output:
Ready to execute...
Task 3: Apply L1 Regularization (Lasso)
Use Lasso to automatically select important features.
# Test different L1 regularization strengths
alphas = [0.001, 0.01, 0.1, 0.5, 1.0]
lasso_results = []
for alpha in alphas:
# Train Lasso model
lasso = LogisticRegression(
penalty='l1',
C=1/alpha, # C is inverse of regularization strength
solver='saga',
max_iter=2000
)
lasso.fit(X_train_scaled, y_train)
# Count non-zero coefficients (selected features)
n_selected = np.sum(lasso.coef_[0] != 0)
test_accuracy = lasso.score(X_test_scaled, y_test)
lasso_results.append({
'alpha': alpha,
'features_selected': n_selected,
'test_accuracy': test_accuracy
})
print(f"Alpha: {alpha:6.3f} | Features: {n_selected:4d} | Test Acc: {test_accuracy:.1%}")
# TODO: Find the best alpha (best test accuracy)
best_result = _______
print(f"\nā Best L1: Alpha={best_result['alpha']}, Features={best_result['features_selected']}")
Use max(lasso_results, key=lambda x: x['test_accuracy'])
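Beyond the accuracy table, it helps to see which biomarkers actually survive. A small sketch, assuming lasso still holds the last model fitted in the loop above and feature_names comes from Task 1:
# Sketch: map non-zero coefficients back to feature names (uses the last `lasso` fit)
kept_idx = np.flatnonzero(lasso.coef_[0])
kept_names = [feature_names[i] for i in kept_idx]
print(f"{len(kept_names)} features kept, e.g.: {kept_names[:10]}")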
Output:
Ready to execute...
L1 Features: --
L1 Accuracy: --
Task 4: Optimize with Elastic Net
Combine L1 and L2 regularization for better stability.
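Before running the sweep, here is a rough sketch of what l1_ratio controls. Up to scikit-learn's scaling convention (the log-loss term is multiplied by C), the penalty added for coefficient vector w is l1_ratio * ||w||_1 + (1 - l1_ratio)/2 * ||w||_2^2, so l1_ratio=1.0 behaves like pure Lasso (sparse) and l1_ratio=0.0 like pure Ridge (shrinkage only). This helper is purely illustrative, not part of the task:
# Rough sketch of the elastic net penalty term
def elasticnet_penalty(w, l1_ratio):
    l1 = np.sum(np.abs(w))      # drives small coefficients to exactly zero (selection)
    l2 = 0.5 * np.sum(w ** 2)   # shrinks correlated features together (stability)
    return l1_ratio * l1 + (1 - l1_ratio) * l2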
# Sweep the L1/L2 mix (l1_ratio) for elastic net logistic regression
# Test different L1 ratios
l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]
elastic_results = []
for l1_ratio in l1_ratios:
# Train Elastic Net
elastic = LogisticRegression(
penalty='elasticnet',
solver='saga',
l1_ratio=l1_ratio,
C=0.1, # fixed regularization strength for this sweep; tuned properly in Task 5
max_iter=2000
)
elastic.fit(X_train_scaled, y_train)
# Evaluate
n_selected = np.sum(elastic.coef_[0] != 0)
test_acc = elastic.score(X_test_scaled, y_test)
train_acc = elastic.score(X_train_scaled, y_train)
# TODO: Calculate overfitting gap for Elastic Net
gap = _______
elastic_results.append({
'l1_ratio': l1_ratio,
'features': n_selected,
'test_acc': test_acc,
'gap': gap
})
print(f"L1 Ratio: {l1_ratio:.1f} | Features: {n_selected:3d} | "f"Test: {test_acc:.1%} | Gap: {gap:.1%}")
# Find optimal configuration
optimal = min(elastic_results, key=lambda x: x['gap'])
print(f"\nā Optimal Elastic Net: L1={optimal['l1_ratio']}, "f"Features={optimal['features']}, Gap={optimal['gap']:.1%}")
Gap = train_acc - test_acc
Output:
Ready to execute...
Task 5: Rigorous Cross-Validation
Use K-fold CV to find the truly optimal regularization.
# Comprehensive cross-validation
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {
'C': [0.001, 0.01, 0.1, 1.0, 10.0],
'l1_ratio': [0.3, 0.5, 0.7, 0.9]
}
# Grid search with cross-validation
model = LogisticRegression(
penalty='elasticnet',
solver='saga',
max_iter=2000,
random_state=42
)
# TODO: Set up GridSearchCV with 5-fold CV
grid_search = GridSearchCV(
model,
param_grid,
cv=_______, # Number of folds
scoring='accuracy',
n_jobs=-1,
verbose=1
)
print("Running cross-validation (this may take a moment)...")
grid_search.fit(X_train_scaled, y_train)
# Best parameters and score
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.3f}")
# Final evaluation on test set
best_model = grid_search.best_estimator_
final_test_score = best_model.score(X_test_scaled, y_test)
n_features_final = np.sum(best_model.coef_[0] != 0)
print(f"Final Test Accuracy: {final_test_score:.1%}")
print(f"Features Selected: {n_features_final} / {n_features}")
Use cv=5 for 5-fold cross-validation
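After the search finishes, you can also rank all 20 parameter combinations instead of looking only at the single best one. A sketch using cv_results_, a standard GridSearchCV attribute, and the pandas import from Task 1:
# Sketch: top parameter combinations by mean cross-validated accuracy
cv_table = pd.DataFrame(grid_search.cv_results_)
cols = ['param_C', 'param_l1_ratio', 'mean_test_score', 'std_test_score']
print(cv_table[cols].sort_values('mean_test_score', ascending=False).head())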
Output:
Ready to execute...
Task 6: Generate Feature Importance Report
Identify and report the most critical features for medical predictions.
# Extract feature importance from best model
coefficients = best_model.coef_[0]
feature_importance = np.abs(coefficients)
# Get top features
top_indices = np.argsort(feature_importance)[::-1][:20]
top_features = [(feature_names[i], feature_importance[i])
for i in top_indices
if feature_importance[i] > 0]
print("=" * 60)
print("FEATURE SELECTION REPORT - MEDTECH ANALYTICS")
print("=" * 60)
# Problem summary
print("\nPROBLEM SUMMARY:")
print(f"Original Features: {n_features}")
print(f"Training Samples: {len(X_train)}")
print(f"Features/Sample Ratio: {n_features/len(X_train):.2f}")
# Solution implemented
print("\nSOLUTION:")
print("Regularization Method: Elastic Net")
print(f"Optimal C: {best_model.C}")
print(f"Optimal L1 Ratio: {best_model.l1_ratio}")
# Results
print("\nRESULTS:")
print(f"Features Selected: {n_features_final}")
print(f"Dimension Reduction: {(1 - n_features_final/n_features)*100:.1f}%")
print(f"Test Accuracy: {final_test_score:.1%}")
# Top features
print("\nTOP 10 CRITICAL FEATURES:")
for i, (name, importance) in enumerate(top_features[:10]):
print(f"{i+1:2d}. {name:20s} | Importance: {importance:.3f}")
# TODO: Calculate cost savings
# Each unnecessary feature costs $1000/year in data collection
features_eliminated = n_features - n_features_final
annual_savings = _______
print("\nš° BUSINESS IMPACT:")
print(f"Features Eliminated: {features_eliminated}")
print(f"Annual Data Collection Savings: ${annual_savings:,.0f}")
print(f"Contract Value Retained: $30,000,000")
print(f"Model Inference Speed Improvement: {n_features/n_features_final:.0f}x faster")
print("=" * 60)
print("ā MODEL READY FOR PRODUCTION DEPLOYMENT")
Annual savings = features_eliminated * 1000
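A sketch of the TODO above, plus an optional export of the selected features; the CSV filename is just an example:
# Sketch: each eliminated feature saves $1000/year in data collection
annual_savings = features_eliminated * 1000

# Optional: save the ranked features for the hospital integration team
report_df = pd.DataFrame(top_features, columns=['feature', 'abs_coefficient'])
report_df.to_csv('medtech_selected_features.csv', index=False)  # hypothetical filename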
Output:
Ready to execute...
Mission Accomplished!
You've successfully tamed the 2,000-feature beast and saved MedTech Analytics!
47 Features Selected
88% Test Accuracy
$30M Contract Saved
43x Speed Increase
The CEO is ecstatic! Your regularized model is being deployed across all 500 hospitals.