scipy and statsmodels
scipy.stats provides hypothesis tests and probability distributions. statsmodels provides regression models, diagnostics, and econometric tools. Together they cover the statistical analysis workflow from testing to modeling.
# Core scientific stack for the statistical analysis workflow:
# numpy/pandas for data handling, scipy.stats for tests and distributions,
# statsmodels for regression modeling (sm = array API, smf = R-style formulas).
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
Many statistical tests assume that data follows a normal distribution. Before running a t-test or regression, you should check this assumption. The Shapiro-Wilk test is the most commonly used formal normality test, and Q-Q plots provide a visual diagnostic.
import matplotlib.pyplot as plt

# Simulate one approximately normal sample and one right-skewed sample so the
# tests have a clear positive and negative example to work with.
np.random.seed(42)
normal_data = np.random.normal(loc=50, scale=10, size=100)
skewed_data = np.random.exponential(scale=10, size=100)

# Shapiro-Wilk test: H0 = data is normally distributed
stat1, p1 = stats.shapiro(normal_data)
stat2, p2 = stats.shapiro(skewed_data)
print(f"Normal data - W={stat1:.4f}, p={p1:.4f}")  # p > 0.05: fail to reject
print(f"Skewed data - W={stat2:.4f}, p={p2:.4f}")  # p < 0.05: reject normality

# Q-Q plot: points should fall on the diagonal if data is normal
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
stats.probplot(normal_data, dist="norm", plot=axes[0])
axes[0].set_title("Q-Q Plot: Normal Data")
stats.probplot(skewed_data, dist="norm", plot=axes[1])
axes[1].set_title("Q-Q Plot: Skewed Data")
plt.tight_layout()
plt.show()
Test whether two groups have different means. This is one of the most commonly used tests in business analytics.
# Delivery times for two carriers
# Seed here so the snippet is reproducible on its own (the other simulation
# snippets in this chapter all seed; this one previously relied on earlier state).
np.random.seed(42)
carrier_a = np.random.normal(loc=4.8, scale=0.9, size=50)
carrier_b = np.random.normal(loc=5.3, scale=1.1, size=50)

# Welch's t-test (equal_var=False): the two groups are simulated with
# different standard deviations (0.9 vs 1.1), so the pooled-variance
# Student's t-test assumption of equal variances is violated.
t_stat, p_value = stats.ttest_ind(carrier_a, carrier_b, equal_var=False)
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.4f}")

if p_value < 0.05:
    print("Reject H0: The carriers have significantly different delivery times.")
else:
    print("Fail to reject H0: No significant difference detected.")
Use ttest_ind() for independent samples (different groups), and ttest_rel() for paired samples (same subjects measured twice, e.g., before/after a policy change).
Test for independence between two categorical variables.
# Contingency table of counts: one row per shift, columns = [good, defective].
day_shift = [80, 20]
night_shift = [65, 35]
observed = np.array([day_shift, night_shift])

# chi2_contingency tests H0: shift and defect status are independent.
# It also returns the degrees of freedom and the expected counts under H0.
chi2, p_val, dof, expected = stats.chi2_contingency(observed)
print(f"Chi-square: {chi2:.2f}, p-value: {p_val:.4f}, df: {dof}")
print(f"Expected frequencies:\n{expected}")
Compare means across three or more groups.
# Customer satisfaction scores by region, grouped in one mapping so the
# regions can be passed to f_oneway in a single unpacking step.
region_scores = {
    "east": [7.2, 6.8, 7.5, 7.0, 7.3, 6.9, 7.1],
    "central": [6.5, 6.2, 6.8, 6.1, 6.7, 6.4, 6.3],
    "west": [7.8, 8.1, 7.5, 7.9, 8.0, 7.6, 7.7],
}

# One-way ANOVA: H0 = all region means are equal.
f_stat, p_val = stats.f_oneway(*region_scores.values())
print(f"F-statistic: {f_stat:.2f}, p-value: {p_val:.4f}")
Correlation measures the strength and direction of a relationship between two variables. The right type of correlation depends on your data.
np.random.seed(42)
n = 100
base = np.random.uniform(0, 100, n)
linear_y = 2 * base + np.random.normal(0, 15, n)            # straight-line relationship
mono_y = np.log(base + 1) * 20 + np.random.normal(0, 5, n)  # increasing but curved

# Pearson: measures LINEAR correlation (-1 to 1)
r_pearson, p_pearson = stats.pearsonr(base, linear_y)
print(f"Pearson r = {r_pearson:.3f}, p = {p_pearson:.4f}")

# Spearman: measures MONOTONIC correlation (rank-based).
# Better for non-linear but consistently increasing/decreasing relationships.
r_spearman, p_spearman = stats.spearmanr(base, mono_y)
print(f"Spearman r = {r_spearman:.3f}, p = {p_spearman:.4f}")

# Kendall: also rank-based, more robust with small samples or many ties
r_kendall, p_kendall = stats.kendalltau(base, mono_y)
print(f"Kendall tau = {r_kendall:.3f}, p = {p_kendall:.4f}")

# Compute a full correlation matrix with pandas
df_corr = pd.DataFrame({"x": base, "y_lin": linear_y, "y_mono": mono_y})
print("\nPearson correlation matrix:")
print(df_corr.corr(method="pearson").round(3))
print("\nSpearman correlation matrix:")
print(df_corr.corr(method="spearman").round(3))
Ordinary Least Squares (OLS) regression is the starting point for predictive modeling and causal inference in business research.
# Simulate a sales dataset: sales truly depend on ad_spend and price, while
# competitors is included as a predictor with no real effect.
np.random.seed(42)
n = 200
df = pd.DataFrame({
    "ad_spend": np.random.uniform(10, 100, n),
    "price": np.random.uniform(15, 45, n),
    "competitors": np.random.randint(1, 6, n),
})
noise = np.random.normal(0, 15, n)
df["sales"] = 50 + 2.5 * df["ad_spend"] - 1.8 * df["price"] + noise

# Fit OLS with the R-style formula interface; the intercept is added
# automatically by the formula parser.
model = smf.ols("sales ~ ad_spend + price + competitors", data=df).fit()
print(model.summary())
The OLS summary table reports the coefficient estimates, their standard errors, t-statistics, p-values, and confidence intervals, along with goodness-of-fit measures such as R-squared and adjusted R-squared. You can also access these quantities programmatically from the fitted results object:
# Access specific results print(f"R-squared: {model.rsquared:.3f}") print(f"Adj R-sq: {model.rsquared_adj:.3f}") print(f"Coefficients:\n{model.params}") print(f"P-values:\n{model.pvalues}")
When you include multiple predictors, you need to check whether they are too highly correlated with each other. This condition, called multicollinearity, inflates the standard errors of your coefficients and makes individual predictors appear insignificant even when they are collectively important. The Variance Inflation Factor (VIF) quantifies the severity of multicollinearity for each predictor.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Simulate predictors where x2 is nearly a linear copy of x1 (collinear),
# x3 is independent, and y truly depends only on x1 and x3.
np.random.seed(42)
n = 200
x1 = np.random.uniform(10, 100, n)
x2 = x1 * 0.8 + np.random.normal(0, 10, n)  # highly correlated with x1
x3 = np.random.uniform(1, 50, n)  # independent
y = 20 + 3 * x1 - 1.5 * x3 + np.random.normal(0, 15, n)
df_vif = pd.DataFrame({"x1": x1, "x2": x2, "x3": x3, "y": y})

model = smf.ols("y ~ x1 + x2 + x3", data=df_vif).fit()
print(model.summary().tables[1])  # coefficient table only

# VIF is computed from the design matrix including the constant term,
# matching the model actually fitted above.
design = sm.add_constant(df_vif[["x1", "x2", "x3"]])
vif_data = pd.DataFrame({
    "Variable": design.columns,
    "VIF": [variance_inflation_factor(design.values, i)
            for i in range(design.shape[1])],
})
print("\nVariance Inflation Factors:")
print(vif_data)
# Residuals vs. fitted values (check for heteroskedasticity)
# A healthy plot is an unstructured band around zero; a funnel shape
# suggests non-constant error variance.
import matplotlib.pyplot as plt

resid_vals = model.resid
fitted_vals = model.fittedvalues

plt.figure(figsize=(7, 4))
plt.scatter(fitted_vals, resid_vals, alpha=0.5, s=20)
plt.axhline(y=0, color="red", linestyle="--")  # reference line at zero residual
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Residuals vs. Fitted")
plt.tight_layout()
plt.show()
# 95% confidence intervals for coefficients
coef_ci = model.conf_int(alpha=0.05)
coef_ci.columns = ["Lower 95%", "Upper 95%"]
print(coef_ci)

# Confidence interval for the mean of a variable, using the t-distribution
# with n-1 degrees of freedom and the standard error of the mean.
sales = df["sales"]
mean = sales.mean()
ci_low, ci_high = stats.t.interval(
    0.95, df=len(sales) - 1, loc=mean, scale=stats.sem(sales)
)
print(f"Sales mean: {mean:.1f} [{ci_low:.1f}, {ci_high:.1f}]")
When your dependent variable is binary (yes/no, churn/retain, pass/fail), OLS regression is inappropriate because it can produce predicted probabilities outside [0, 1]. Logistic regression models the log-odds of the outcome as a linear function of the predictors.
# Binary outcome: will the student pass?
np.random.seed(42)
n = 300
df_logit = pd.DataFrame({
    "study_hours": np.random.uniform(0, 20, n),
    "gpa": np.random.uniform(2.0, 4.0, n),
    "attendance": np.random.uniform(50, 100, n),
})

# True data-generating process: the log-odds of passing are linear in all
# three predictors; the logistic function maps them to a probability.
log_odds = (
    -5
    + 0.3 * df_logit["study_hours"]
    + 1.5 * df_logit["gpa"]
    + 0.02 * df_logit["attendance"]
)
df_logit["passed"] = np.random.binomial(1, 1 / (1 + np.exp(-log_odds)))

# Fit logistic regression using the formula API
logit_model = smf.logit("passed ~ study_hours + gpa + attendance", data=df_logit).fit()
print(logit_model.summary())

# exp(coefficient) converts a log-odds coefficient into an odds ratio.
odds_ratios = np.exp(logit_model.params)
print("\nOdds Ratios:")
print(odds_ratios.round(3))

# Predict pass probabilities for two hypothetical students
new_student = pd.DataFrame({
    "study_hours": [5, 15],
    "gpa": [2.5, 3.8],
    "attendance": [60, 95],
})
print(f"\nPredicted pass probabilities: {logit_model.predict(new_student).values}")
Survival analysis studies the time until an event occurs: customer churn, machine failure, patient recovery. The key concept is the survival function S(t), which gives the probability that the event has not yet occurred by time t. Python's lifelines library provides tools for survival analysis.
# pip install lifelines
from lifelines import KaplanMeierFitter, CoxPHFitter

# Simulate customer churn data: tenure in months, a churn indicator
# (0 = censored, i.e. still active at the end of observation), and the plan.
np.random.seed(42)
n = 200
df_surv = pd.DataFrame({
    "tenure_months": np.random.exponential(scale=18, size=n).round(1),
    "churned": np.random.binomial(1, 0.6, n),  # 1 = churned, 0 = still active
    "plan": np.random.choice(["Basic", "Premium"], n),
})

# Kaplan-Meier survival curve (non-parametric): one curve per plan type,
# drawn on a shared axis for comparison.
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(8, 5))
for plan_name in ("Basic", "Premium"):
    subset = df_surv[df_surv["plan"] == plan_name]
    kmf.fit(subset["tenure_months"],
            event_observed=subset["churned"],
            label=plan_name)
    kmf.plot_survival_function(ax=ax)
ax.set_xlabel("Months")
ax.set_ylabel("Survival Probability")
ax.set_title("Customer Retention by Plan Type")
plt.tight_layout()
plt.show()
Generate two samples of 100 observations each from normal distributions with means of 50 and 53 and a standard deviation of 10. Run a two-sample t-test. Increase the sample size to 500 and observe how the p-value changes. Discuss the relationship between sample size and statistical power.
Create a DataFrame with 300 rows and columns for square_footage, bedrooms, and age_years. Generate a house_price column as a linear function of those predictors plus noise. Fit an OLS model, print the summary, and plot the residuals vs. fitted values. Discuss whether the assumptions appear met.
Create two variables where one is the log of the other (with noise). Compute both Pearson and Spearman correlations. Which one is higher, and why? Create a scatter plot to visualize the non-linear relationship. Then compute VIF for a regression model that includes both x and x_squared as predictors. What do you observe?
Generate a binary outcome (e.g., customer churn) as a function of three predictors using the logistic function. Fit a logistic regression with smf.logit(), print the odds ratios, and predict the churn probability for two hypothetical customers: one with high engagement and one with low engagement. Interpret the odds ratios in plain language.