Complete Pandas Tutorial

Master all Pandas essentials with short, practical examples!

๐Ÿผ Welcome to Pandas!

Pandas is like Excel for Python! It makes working with data super easy and powerful.


import pandas as pd
data = {'Name': ['Alice', 'Bob'], 'Age': [25, 30]}
df = pd.DataFrame(data)
print(df)
                                    
Easy
Data Handling
Powerful
Analysis
Popular
Library

Getting Started

Install and Import

# Install: pip install pandas
import pandas as pd
import numpy as np
print("Pandas version:", pd.__version__)

Pandas Data Structures

Series (1D)

# Series - like a column in Excel
ages = pd.Series([25, 30, 35], index=['Alice', 'Bob', 'Charlie'])
print(ages)
print("Alice's age:", ages['Alice'])
print("Average age:", ages.mean())
DataFrame (2D)

# DataFrame - like a spreadsheet
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['NYC', 'LA', 'Chicago']
}
df = pd.DataFrame(data)
print(df)

Creating DataFrames

Different Creation Methods

# From dictionary
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})

# From list of lists
df2 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])

# From NumPy array
df3 = pd.DataFrame(np.random.randn(3, 2), columns=['X', 'Y'])
print("Random data:\n", df3)

Exploring Your Data

Basic Information

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 30, 35, 28],
    'Salary': [50000, 60000, 70000, 55000]
})

print(df.head(2))      # First 2 rows
print(df.tail(2))      # Last 2 rows
print(df.info())       # Data types and info
print(df.describe())   # Statistical summary
Quick Stats

print("Shape:", df.shape)           # (4, 3)
print("Columns:", df.columns.tolist())
print("Data types:\n", df.dtypes)
print("Memory usage:", df.memory_usage().sum())

Selecting Data

Column Selection

# Single column
names = df['Name']
print(names)

# Multiple columns
subset = df[['Name', 'Age']]
print(subset)

# Column operations
print("Average salary:", df['Salary'].mean())
Row Selection

# By position
print(df.iloc[0])      # First row
print(df.iloc[0:2])    # First 2 rows

# By label/condition
print(df.loc[df['Age'] > 30])  # Rows where Age > 30
print(df.loc[0, 'Name'])       # Specific cell

Filtering Data

Boolean Filtering

# Single condition
young = df[df['Age'] < 30]
print("Young employees:\n", young)

# Multiple conditions
high_earners = df[(df['Age'] > 25) & (df['Salary'] > 55000)]
print("High earners:\n", high_earners)

# Using isin()
selected = df[df['Name'].isin(['Alice', 'Bob'])]
print("Selected people:\n", selected)
String Filtering

# String operations
df['Email'] = ['[email protected]', '[email protected]', 
               '[email protected]', '[email protected]']

gmail_users = df[df['Email'].str.contains('gmail')]
print("Gmail users:\n", gmail_users)

Modifying Data

Adding and Modifying Columns

# Add new column
df['Bonus'] = df['Salary'] * 0.1
df['Full_Name'] = df['Name'] + ' Smith'

# Modify existing column
df['Age'] = df['Age'] + 1  # Everyone gets older

# Conditional assignment
df['Category'] = df['Age'].apply(lambda x: 'Senior' if x > 30 else 'Junior')
print(df[['Name', 'Age', 'Category']])
Dropping Data

# Drop columns
df_clean = df.drop(['Bonus', 'Full_Name'], axis=1)

# Drop rows
df_filtered = df.drop(0)  # Drop first row

# Drop by condition
df_no_seniors = df[df['Category'] != 'Senior']
print("No seniors:\n", df_no_seniors)

Missing Data

Detecting Missing Values

# Create data with missing values
df_missing = pd.DataFrame({
    'A': [1, 2, None, 4],
    'B': [5, None, 7, 8],
    'C': [9, 10, 11, None]
})

print("Missing values:\n", df_missing.isnull())
print("Count missing:\n", df_missing.isnull().sum())
Handling Missing Values

# Drop missing values
df_dropped = df_missing.dropna()

# Fill missing values
df_filled = df_missing.fillna(0)  # Fill with 0
df_mean_filled = df_missing.fillna(df_missing.mean())  # Fill with mean

print("Filled with mean:\n", df_mean_filled)

Grouping Data

Basic Grouping

# Sample data
sales = pd.DataFrame({
    'Region': ['North', 'South', 'North', 'South', 'North'],
    'Product': ['A', 'A', 'B', 'B', 'A'],
    'Sales': [100, 150, 200, 120, 180]
})

# Group by region
region_sales = sales.groupby('Region')['Sales'].sum()
print("Sales by region:\n", region_sales)

# Multiple aggregations
stats = sales.groupby('Region')['Sales'].agg(['sum', 'mean', 'count'])
print("Region stats:\n", stats)
Multiple Grouping

# Group by multiple columns
multi_group = sales.groupby(['Region', 'Product'])['Sales'].sum()
print("Sales by region and product:\n", multi_group)

# Pivot table
pivot = sales.pivot_table(values='Sales', 
                         index='Region', 
                         columns='Product', 
                         aggfunc='sum')
print("Pivot table:\n", pivot)

Combining DataFrames

Concatenating

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

# Vertical concatenation (stack rows)
vertical = pd.concat([df1, df2], ignore_index=True)
print("Vertical:\n", vertical)

# Horizontal concatenation (side by side)
horizontal = pd.concat([df1, df2], axis=1)
print("Horizontal:\n", horizontal)
Merging/Joining

employees = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})
salaries = pd.DataFrame({'ID': [1, 2, 4], 'Salary': [50000, 60000, 70000]})

# Inner join (only matching records)
merged = pd.merge(employees, salaries, on='ID', how='inner')
print("Inner join:\n", merged)

# Left join (all from left table)
left_merged = pd.merge(employees, salaries, on='ID', how='left')
print("Left join:\n", left_merged)

Reading and Writing Files

CSV Files

# Save to CSV
df.to_csv('data.csv', index=False)

# Read from CSV
df_loaded = pd.read_csv('data.csv')
print("Loaded data:\n", df_loaded.head())

# Read with options
df_custom = pd.read_csv('data.csv', usecols=['Name', 'Age'])
print("Selected columns:\n", df_custom)
Excel Files

# Save to Excel
df.to_excel('data.xlsx', sheet_name='Employees', index=False)

# Read from Excel
df_excel = pd.read_excel('data.xlsx', sheet_name='Employees')

# Multiple sheets
with pd.ExcelWriter('multi_sheet.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    df.to_excel(writer, sheet_name='Sheet2', index=False)

Working with Dates

Date Operations

# Create date range
dates = pd.date_range('2024-01-01', periods=5, freq='D')
df_time = pd.DataFrame({'Date': dates, 'Value': [1, 2, 3, 4, 5]})

# Set date as index
df_time.set_index('Date', inplace=True)
print("Time series:\n", df_time)

# Date filtering
recent = df_time['2024-01-03':]
print("Recent data:\n", recent)
Date Parsing

# Parse dates from strings
df_dates = pd.DataFrame({
    'Date_String': ['2024-01-01', '2024-01-02', '2024-01-03'],
    'Value': [10, 20, 30]
})

df_dates['Date'] = pd.to_datetime(df_dates['Date_String'])
df_dates['Year'] = df_dates['Date'].dt.year
df_dates['Month'] = df_dates['Date'].dt.month
print("Parsed dates:\n", df_dates)

Basic Plotting

Simple Plots

import matplotlib.pyplot as plt

# Sample data
data = pd.DataFrame({
    'Month': ['Jan', 'Feb', 'Mar', 'Apr'],
    'Sales': [100, 120, 140, 110]
})

# Line plot
data.plot(x='Month', y='Sales', kind='line')
plt.title('Monthly Sales')
plt.show()

# Bar plot
data.plot(x='Month', y='Sales', kind='bar')
plt.show()

๐ŸŽฏ Real-World Examples

Sales Analysis

# Sales data analysis
sales_data = pd.DataFrame({
    'Date': pd.date_range('2024-01-01', periods=10),
    'Product': ['A', 'B'] * 5,
    'Quantity': [10, 15, 12, 18, 14, 16, 11, 19, 13, 17],
    'Price': [100, 150] * 5
})

sales_data['Revenue'] = sales_data['Quantity'] * sales_data['Price']
summary = sales_data.groupby('Product').agg({
    'Quantity': 'sum',
    'Revenue': 'sum'
})
print("Product summary:\n", summary)
Customer Analysis

# Customer segmentation
customers = pd.DataFrame({
    'Customer': ['A', 'B', 'C', 'D', 'E'],
    'Age': [25, 35, 45, 30, 40],
    'Spending': [1000, 2000, 1500, 3000, 2500]
})

# Create age groups
customers['Age_Group'] = pd.cut(customers['Age'], 
                               bins=[0, 30, 40, 100], 
                               labels=['Young', 'Middle', 'Senior'])

segment_analysis = customers.groupby('Age_Group')['Spending'].mean()
print("Spending by age group:\n", segment_analysis)

Best Practices

๐Ÿ“Š Explore First

Always use .head(), .info(), .describe()

๐Ÿงน Clean Data

Handle missing values and duplicates

โšก Use Vectorization

Avoid loops, use pandas operations

๐Ÿ’พ Save Progress

Save intermediate results regularly

๐Ÿง  Quick Quiz

What does df.head() show?

How do you select rows where Age > 30?