Working with Web APIs and Web Scraping: requests, JSON, and BeautifulSoup
An API (Application Programming Interface) lets your Python code communicate with external services over the internet. Most modern APIs follow the REST architecture: you send an HTTP request (usually GET or POST) and receive structured data back, typically in JSON format.
import requests

# GET request to a public API: fetch the latest USD exchange rates.
url = "https://api.exchangerate-api.com/v4/latest/USD"
# Always pass a timeout: without one, requests.get() can block forever
# if the server accepts the connection but never responds.
response = requests.get(url, timeout=10)
print(f"Status code: {response.status_code}")  # 200 = success
print(f"Content type: {response.headers['Content-Type']}")
import pandas as pd

# Decode the JSON response body into a plain Python dictionary.
data = response.json()
print(f"Base currency: {data['base']}")
print(f"EUR rate: {data['rates']['EUR']}")
print(f"GBP rate: {data['rates']['GBP']}")

# Flatten the currency -> rate mapping into a two-column DataFrame.
rate_pairs = list(data["rates"].items())
rates_df = pd.DataFrame(rate_pairs, columns=["Currency", "Rate"])
print(rates_df.head(10))
Most real-world APIs require authentication to track usage, enforce rate limits, and protect data. The two most common approaches are API keys (simple) and OAuth tokens (more secure).
import os

# --- Method 1: API key as a query parameter --------------------------
# Common for free-tier APIs (e.g., OpenWeatherMap, Alpha Vantage).
api_key = os.environ.get("WEATHER_API_KEY", "your_key_here")
url = "https://api.openweathermap.org/data/2.5/weather"
params = dict(q="New York", appid=api_key, units="imperial")
# response = requests.get(url, params=params)

# --- Method 2: API key in request headers ----------------------------
# More secure: keeps the key out of URLs (and therefore server logs).
headers = {
    "Authorization": "Bearer YOUR_API_KEY",
    "Content-Type": "application/json",
}
# response = requests.get(url, headers=headers)

# --- Method 3: OAuth 2.0 (used by Google, Twitter, GitHub, etc.) -----
# The client first obtains a temporary access token, which is then
# sent in headers. Libraries like 'requests-oauthlib' simplify this.
# pip install requests-oauthlib
Never hardcode API keys in your source code. Store them in environment variables (read with os.environ["KEY"]) or in a .env file loaded with the python-dotenv package. If you commit an API key to GitHub, it can be stolen by automated bots within minutes. Add .env to your .gitignore file.
# Many APIs accept query parameters; requests URL-encodes the dict
# and appends it to the request for us.
url = "https://jsonplaceholder.typicode.com/posts"
response = requests.get(url, params={"userId": 1})
posts = response.json()
print(f"Found {len(posts)} posts by user 1")

# Show a small sample of the results.
first_three = posts[:3]
for post in first_three:
    print(f" - {post['title']}")
Many APIs return data in pages (e.g., 100 records at a time). You need to loop through pages to get the full dataset. Similarly, rate limiting means you must add delays between requests to avoid being blocked.
import time

def fetch_all_pages(base_url, params=None, max_pages=10):
    """Fetch paginated API data with rate limiting.

    Args:
        base_url: Endpoint supporting `_page` / `_limit` query params.
        params: Optional extra query parameters. Not modified — the
            original version wrote `_page`/`_limit` into the caller's
            dict; we copy it instead.
        max_pages: Safety cap on the number of pages requested.

    Returns:
        A list of all records gathered from non-empty pages.

    Raises:
        requests.HTTPError: On a 4xx/5xx response.
    """
    all_data = []
    # Copy so we never mutate the dict the caller passed in.
    params = dict(params) if params else {}
    for page in range(1, max_pages + 1):
        params["_page"] = page
        params["_limit"] = 10  # records per page
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        if not data:  # empty page = we've reached the end
            break
        all_data.extend(data)
        print(f" Page {page}: fetched {len(data)} records")
        # Respect rate limits: wait between requests
        time.sleep(0.5)  # 500ms delay
    return all_data

# Example: fetch all posts from JSONPlaceholder
all_posts = fetch_all_pages("https://jsonplaceholder.typicode.com/posts")
print(f"Total records: {len(all_posts)}")
Many APIs report your remaining request quota in response headers (e.g., X-RateLimit-Remaining). Read these headers to dynamically adjust your request speed. For bulk data collection, consider running your script overnight with generous delays.
import json

# Sending data to an API: `json=` serializes the dict to a JSON body
# and sets the Content-Type header automatically.
url = "https://jsonplaceholder.typicode.com/posts"
payload = {
    "title": "Supply Chain Optimization",
    "body": "Using Python for demand forecasting...",
    "userId": 1,
}
response = requests.post(url, json=payload)
print(f"Status: {response.status_code}")  # 201 = created
print(response.json())
Once you have fetched data from an API, you almost always want to save it locally so you do not need to re-fetch it every time you run your analysis. Saving to CSV or JSON provides a persistent, portable copy of the data.
import json

# Fetch a nested JSON payload and persist it locally in two formats.
url = "https://jsonplaceholder.typicode.com/users"
users = requests.get(url).json()

# CSV is a flat format, so nested dicts must be flattened first.
users_df = pd.json_normalize(users)
print(users_df.columns.tolist())  # shows flattened column names

# Persist the flat table as CSV.
users_df.to_csv("api_users.csv", index=False)
print(f"Saved {len(users_df)} users to api_users.csv")

# Raw JSON keeps the nested structure intact.
with open("api_users.json", "w") as f:
    json.dump(users, f, indent=2)

# Later: reload from file instead of re-fetching
cached_df = pd.read_csv("api_users.csv")
print(f"Loaded {len(cached_df)} rows from cache")
The Federal Reserve Economic Data (FRED) API provides access to hundreds of thousands of economic time series. Here is how to fetch GDP data programmatically.
# You need a free API key from https://fred.stlouisfed.org/docs/api/api_key.html
FRED_API_KEY = os.environ.get("FRED_API_KEY", "your_key_here")

def get_fred_series(series_id, api_key, observation_start="2015-01-01"):
    """Fetch a time series from the FRED API.

    Args:
        series_id: FRED series identifier (e.g. "GDP", "UNRATE").
        api_key: Your FRED API key.
        observation_start: Earliest observation date, ISO format.
            Defaults to "2015-01-01" (previously hard-coded).

    Returns:
        DataFrame with 'date' (datetime) and 'value' (numeric) columns,
        with missing observations dropped.

    Raises:
        requests.HTTPError: On a 4xx/5xx response.
    """
    url = "https://api.stlouisfed.org/fred/series/observations"
    params = {
        "series_id": series_id,
        "api_key": api_key,
        "file_type": "json",
        "observation_start": observation_start,
    }
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()["observations"]
    df = pd.DataFrame(data)
    df["date"] = pd.to_datetime(df["date"])
    # FRED encodes missing values as "."; coerce maps those to NaN
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    return df[["date", "value"]].dropna()

# Fetch US GDP (quarterly, billions of dollars)
# gdp = get_fred_series("GDP", FRED_API_KEY)
# gdp.plot(x="date", y="value", title="US GDP")
# pandas scans the page's HTML and returns one DataFrame per <table>.
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
html_tables = pd.read_html(url)
print(f"Found {len(html_tables)} tables on the page")

# The first table is usually the main one on Wikipedia list pages.
df = html_tables[0]
print(df.head())
When data is embedded in HTML (not in a table or API), BeautifulSoup lets you parse and extract it.
from bs4 import BeautifulSoup

# Download the page and build a parse tree from its HTML.
url = "https://quotes.toscrape.com/"
html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")

# Select the quote text and author elements by tag + CSS class.
quotes = soup.find_all("span", class_="text")
authors = soup.find_all("small", class_="author")

# Pair them up and show the first five.
for q, a in zip(quotes[:5], authors[:5]):
    print(f"{a.get_text()}: {q.get_text()[:60]}...")
Network requests can fail for many reasons: timeouts, server errors, rate limits, dropped connections. Production-grade code needs to handle these gracefully, with retries for transient failures and clear error messages for permanent ones.
def safe_api_call(url, params=None, max_retries=3):
    """Make a GET request with error handling and retries.

    Retries on timeouts, connection errors, rate limiting (429) and
    server errors (5xx); gives up immediately on other 4xx errors.

    Args:
        url: Endpoint to call.
        params: Optional query parameters.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The decoded JSON body on success, otherwise None.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()  # Raises exception for 4xx/5xx
            return response.json()
        except requests.exceptions.Timeout:
            print(f" Attempt {attempt}: timed out, retrying...")
        except requests.exceptions.HTTPError as e:
            # Read the status from the exception's response rather than
            # relying on the outer `response` variable.
            status = e.response.status_code
            if status == 429:  # rate limited
                wait = 2 ** attempt  # exponential backoff: 2s, 4s, 8s
                print(f" Rate limited. Waiting {wait}s...")
                time.sleep(wait)
                continue  # backoff already waited; skip the fixed delay below
            elif status >= 500:  # server error, retry
                print(f" Server error {status}, retrying...")
            else:  # 4xx client error, don't retry
                print(f" Client error: {e}")
                return None
        except requests.exceptions.ConnectionError:
            print(" Connection failed, retrying...")
        except requests.exceptions.RequestException as e:
            print(f" Request failed: {e}")
            return None
        # Only pause if another attempt is coming; the original also
        # slept for 1s after the final failed attempt.
        if attempt < max_retries:
            time.sleep(1)  # wait before retry
    print(" All retries exhausted.")
    return None

data = safe_api_call("https://api.exchangerate-api.com/v4/latest/USD")
if data:
    print(f"Success: {len(data['rates'])} currencies")
Exponential backoff increases the wait time after each failed attempt: wait = base ** attempt, where base is typically 2.
Before scraping any website, you should check its robots.txt file, which specifies which paths are allowed or disallowed for automated access. Respecting robots.txt is both an ethical obligation and often a legal requirement.
from urllib.robotparser import RobotFileParser

# Download and parse the site's robots.txt before scraping anything.
rp = RobotFileParser()
rp.set_url("https://quotes.toscrape.com/robots.txt")
rp.read()

# Ask whether a generic crawler ("*") may fetch a given path.
test_url = "https://quotes.toscrape.com/page/1/"
allowed = rp.can_fetch("*", test_url)
print(f"Allowed to scrape {test_url}: {allowed}")

# Best practices for ethical scraping:
#   1. Always check robots.txt first
#   2. Add delays between requests (1-3 seconds minimum)
#   3. Identify yourself with a User-Agent header
#   4. Don't scrape behind login pages without permission
#   5. Cache results locally to avoid repeated requests
#   6. Prefer APIs over scraping when available
headers = {
    "User-Agent": "MyResearchBot/1.0 (contact: your@email.com)"
}
# response = requests.get(url, headers=headers)
Use the JSONPlaceholder API (https://jsonplaceholder.typicode.com/users) to fetch all users. Convert the response to a pandas DataFrame and extract the columns: name, email, and company name. Sort by company name.
Use pd.read_html() to scrape a table from a Wikipedia page of your choice. Clean the column names (strip whitespace, lowercase), drop any rows with all NaN values, and export the result to a CSV file.
Write a function that fetches data from a paginated API (use JSONPlaceholder's /comments endpoint with _page and _limit parameters). Implement pagination to collect all comments, save the result to a CSV file, and print summary statistics (total comments, average comment length). Add a 0.5-second delay between requests.
Build a robust API client: write a function that takes a URL, makes a GET request with retry logic (up to 3 attempts with exponential backoff), and handles timeouts, rate limits (429), and server errors (500+) gracefully. Test it by calling a valid API endpoint and then an intentionally invalid URL. Log each attempt's status.
Key takeaways: .json() converts API responses to Python dictionaries. Never hardcode credentials; use environment variables or .env files. Add delays between requests (time.sleep()) for production data collection. Check robots.txt and respect rate limits before scraping any website. pd.read_html() is the fastest way to grab tables from web pages; BeautifulSoup handles unstructured HTML.