Here’s a list of 28 tips for using pandas in Python, along with examples. These tips cover a wide range of functionality to help you manipulate, analyze, and clean data more efficiently.
import pandas as pd
df = pd.read_csv('data.csv')
df.head(10) # Display first 10 rows
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
0 | 100 | 8.71 | 1.052604 | 0.389265 |
1 | 99 | 6.12 | 1.052604 | 0.000000 |
2 | 240 | 6.62 | 1.052604 | 0.144570 |
3 | 144 | 3.92 | 1.052604 | 0.000000 |
4 | 100 | 9.04 | 1.052604 | 4.314023 |
5 | 155 | 16.58 | 1.052604 | 0.360805 |
6 | 671 | 21.50 | 1.052604 | 0.759203 |
7 | 137 | 10.22 | 1.052604 | 0.000000 |
8 | 104 | 11.90 | 1.052604 | 11.865809 |
9 | 852 | 159.01 | 1.052604 | 63.898479 |
df.info() # Overview of the DataFrame
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   y       294 non-null    int64
 1   x1      294 non-null    float64
 2   x2      294 non-null    float64
 3   x3      294 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.3 KB
df.columns = ['col1', 'col2', 'col3', 'col4'] # Rename all columns
print(df.head())
df.rename(columns={'col1': 'new_col1'}, inplace=True) # Rename specific column
print(df.head())
   col1  col2      col3      col4
0   100  8.71  1.052604  0.389265
1    99  6.12  1.052604  0.000000
2   240  6.62  1.052604  0.144570
3   144  3.92  1.052604  0.000000
4   100  9.04  1.052604  4.314023
   new_col1  col2      col3      col4
0       100  8.71  1.052604  0.389265
1        99  6.12  1.052604  0.000000
2       240  6.62  1.052604  0.144570
3       144  3.92  1.052604  0.000000
4       100  9.04  1.052604  4.314023
df_filtered = df[df['col2'] > 20] # Filter rows where column > 20
print(df_filtered.head())
print('The shape of df: ',df.shape)
print('The shape of filtered df: ',df_filtered.shape)
    new_col1    col2      col3       col4
6        671   21.50  1.052604   0.759203
9        852  159.01  1.052604  63.898479
10       520   40.31  1.052604   8.828501
11       520   40.31  1.052604   8.828501
12       777   34.50  1.052604   3.495477
The shape of df:  (294, 4)
The shape of filtered df:  (157, 4)
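Multiple conditions can be combined as well; here is a small sketch on the same renamed columns, assuming you want both col2 and col4 constrained (each condition needs its own parentheses when using & or |):
df_both = df[(df['col2'] > 20) & (df['col4'] > 0)] # rows satisfying both conditions
print(df_both.shape)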
selected_df = df[['col2', 'col3']] # Select multiple columns
selected_df.head()
| | col2 | col3 |
|---|---|---|
0 | 8.71 | 1.052604 |
1 | 6.12 | 1.052604 |
2 | 6.62 | 1.052604 |
3 | 3.92 | 1.052604 |
4 | 9.04 | 1.052604 |
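As a quick sketch of the label- and position-based alternatives, loc and iloc select the same slice of this DataFrame:
df.loc[0:4, ['col2', 'col3']] # by label: rows 0-4 (inclusive) and the named columns
df.iloc[0:5, 1:3]             # by position: first five rows, columns 1 and 2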
check_df = df.isnull().sum() # Count missing values in each column.
check_df
| | 0 |
|---|---|
new_col1 | 0 |
col2 | 0 |
col3 | 0 |
col4 | 0 |
import numpy as np
# Create a sample DataFrame with missing values
data = {
'Name': ['Alice', 'Bob', 'Charlie'],
'Age': [25, np.nan, 30],
'Score': [85.5, 90.0, np.nan]
} # data is a dictionary.
df = pd.DataFrame(data)
print(df.head())
print("=========================")
df.fillna(0, inplace=True) # Fill NaN with 0; inplace=True modifies df directly instead of returning a new DataFrame
print(df.head())
      Name   Age  Score
0    Alice  25.0   85.5
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
=========================
      Name   Age  Score
0    Alice  25.0   85.5
1      Bob   0.0   90.0
2  Charlie  30.0    0.0
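Zeros are not always the right placeholder. fillna also accepts a dict or Series of per-column values, so a quick sketch using the same sample data fills each numeric column with its own mean:
df = pd.DataFrame(data)                    # rebuild the sample with NaNs
fill_values = df[['Age', 'Score']].mean()  # per-column means: Age 27.5, Score 87.75
df.fillna(fill_values, inplace=True)       # each column gets its own fill value
print(df)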
data = {
'Name': ['Alice', 'Bob', 'Charlie'],
'Age': [25, np.nan, 30],
'Score': [85.5, 90.0, np.nan]
} # data is a dictionary.
df = pd.DataFrame(data)
print(df.head())
print("=========================")
df.dropna(inplace=True) # Remove rows with any missing data
print(df.head())
      Name   Age  Score
0    Alice  25.0   85.5
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
=========================
    Name   Age  Score
0  Alice  25.0   85.5
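dropna can also be limited to particular columns with subset, so rows are only removed when those columns are missing; a small sketch on the same sample:
df = pd.DataFrame(data)                  # rebuild the sample with NaNs
df_age_known = df.dropna(subset=['Age']) # drop a row only if 'Age' is missing
print(df_age_known)                      # keeps Alice and Charlie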
df = pd.read_csv('data.csv')
df.describe() # Summary statistics for numerical columns
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
count | 294.000000 | 294.000000 | 294.000000 | 294.000000 |
mean | 252.408163 | 28.940299 | 0.910343 | 7.712961 |
std | 224.354539 | 30.033040 | 0.382094 | 28.505172 |
min | 9.000000 | 3.480000 | 0.587929 | 0.000000 |
25% | 116.000000 | 12.998750 | 0.634467 | 0.000000 |
50% | 179.500000 | 20.913000 | 0.877854 | 0.108644 |
75% | 334.750000 | 31.735000 | 0.937007 | 5.051179 |
max | 1778.000000 | 291.650000 | 1.954588 | 419.122550 |
df_count = df['x3'].value_counts() # Count occurrences of unique values in a column
df_count.head()
| x3 | count |
|---|---|
0.000000 | 141 |
8.828501 | 2 |
7.476962 | 1 |
20.092479 | 1 |
2.200531 | 1 |
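Passing normalize=True turns the counts into relative frequencies, which is often easier to read; a quick sketch on the same column:
df['x3'].value_counts(normalize=True).head() # e.g. 0.0 makes up about 141/294 ≈ 0.48 of the rows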
df['new_x'] = df['x1'] + df['x2'] # Create a new column as the sum of two columns
df.head()
| | y | x1 | x2 | x3 | new_x |
|---|---|---|---|---|---|
0 | 100 | 8.71 | 1.052604 | 0.389265 | 9.762604 |
1 | 99 | 6.12 | 1.052604 | 0.000000 | 7.172604 |
2 | 240 | 6.62 | 1.052604 | 0.144570 | 7.672604 |
3 | 144 | 3.92 | 1.052604 | 0.000000 | 4.972604 |
4 | 100 | 9.04 | 1.052604 | 4.314023 | 10.092604 |
df = pd.read_csv('data.csv')
print(df.info())
df.drop_duplicates(inplace=True) # Remove duplicate rows
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   y       294 non-null    int64
 1   x1      294 non-null    float64
 2   x2      294 non-null    float64
 3   x3      294 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 293 entries, 0 to 293
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   y       293 non-null    int64
 1   x1      293 non-null    float64
 2   x2      293 non-null    float64
 3   x3      293 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 11.4 KB
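By default every column must match for a row to count as a duplicate. A short sketch of the subset and keep parameters, which compare only the chosen columns and keep the last copy instead of the first:
df_dedup = df.drop_duplicates(subset=['x2', 'x3'], keep='last') # duplicates judged on x2 and x3 only
print(df_dedup.shape)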
df_sorted = df.sort_values('x1', ascending=False) # Sort by column in descending order
df_sorted.head()
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
16 | 1440 | 291.650 | 1.052604 | 66.268072 |
135 | 1778 | 224.800 | 0.634467 | 25.885085 |
9 | 852 | 159.010 | 1.052604 | 63.898479 |
247 | 403 | 149.229 | 0.761814 | 2.219219 |
246 | 601 | 124.946 | 0.761814 | 3.257444 |
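Sorting by several columns works the same way, with one ascending flag per column; a short sketch:
df_multi_sorted = df.sort_values(['x2', 'x1'], ascending=[True, False]) # x2 ascending, ties broken by x1 descending
df_multi_sorted.head()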
data = {
'Category': ['A', 'A', 'A', 'B', 'B', 'B'],
'Values': [10, 20, 30, 40, 50, 60]
}
df = pd.DataFrame(data)
# Group by 'Category' and calculate the sum of 'Values'
print(df)
grouped_df = df.groupby('Category')['Values'].sum().reset_index()
grouped_df
  Category  Values
0        A      10
1        A      20
2        A      30
3        B      40
4        B      50
5        B      60
| | Category | Values |
|---|---|---|
0 | A | 60 |
1 | B | 150 |
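Several aggregations can be computed per group in one call with agg; a sketch on the same grouped data:
df.groupby('Category')['Values'].agg(['sum', 'mean', 'max']).reset_index() # one column per aggregation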
# Convert a column to integer. astype(int) simply drops the fractional part
# (it truncates toward zero), so 8.71 becomes 8.
df = pd.read_csv('data.csv')
print(df.head())
df['x1'] = df['x1'].astype(int)
df.head()
     y     x1        x2        x3
0  100   8.71  1.052604  0.389265
1   99   6.12  1.052604  0.000000
2  240   6.62  1.052604  0.144570
3  144   3.92  1.052604  0.000000
4  100   9.04  1.052604  4.314023
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
0 | 100 | 8 | 1.052604 | 0.389265 |
1 | 99 | 6 | 1.052604 | 0.000000 |
2 | 240 | 6 | 1.052604 | 0.144570 |
3 | 144 | 3 | 1.052604 | 0.000000 |
4 | 100 | 9 | 1.052604 | 4.314023 |
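If you want conventional rounding rather than truncation, round before converting; a sketch that re-reads the same file:
df_rounded = pd.read_csv('data.csv')
df_rounded['x1'] = df_rounded['x1'].round().astype(int) # 8.71 -> 9 instead of 8
df_rounded.head()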
df = pd.read_csv('data.csv')
print(df.head())
df.drop('x1', axis=1, inplace=True) # Drop a column
df.head()
     y     x1        x2        x3
0  100   8.71  1.052604  0.389265
1   99   6.12  1.052604  0.000000
2  240   6.62  1.052604  0.144570
3  144   3.92  1.052604  0.000000
4  100   9.04  1.052604  4.314023
| | y | x2 | x3 |
|---|---|---|---|
0 | 100 | 1.052604 | 0.389265 |
1 | 99 | 1.052604 | 0.000000 |
2 | 240 | 1.052604 | 0.144570 |
3 | 144 | 1.052604 | 0.000000 |
4 | 100 | 1.052604 | 4.314023 |
df = pd.read_csv('data.csv')
print(df.head())
df.set_index('y', inplace=True) # Set the 'y' column as the index
df.head()
     y     x1        x2        x3
0  100   8.71  1.052604  0.389265
1   99   6.12  1.052604  0.000000
2  240   6.62  1.052604  0.144570
3  144   3.92  1.052604  0.000000
4  100   9.04  1.052604  4.314023
| y | x1 | x2 | x3 |
|---|---|---|---|
100 | 8.71 | 1.052604 | 0.389265 |
99 | 6.12 | 1.052604 | 0.000000 |
240 | 6.62 | 1.052604 | 0.144570 |
144 | 3.92 | 1.052604 | 0.000000 |
100 | 9.04 | 1.052604 | 4.314023 |
print(df.head())
df.reset_index(drop=False, inplace=True) # Reset the index; drop=False keeps the old index ('y') as a regular column
df.head()
       x1        x2        x3
y
100  8.71  1.052604  0.389265
99   6.12  1.052604  0.000000
240  6.62  1.052604  0.144570
144  3.92  1.052604  0.000000
100  9.04  1.052604  4.314023
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
0 | 100 | 8.71 | 1.052604 | 0.389265 |
1 | 99 | 6.12 | 1.052604 | 0.000000 |
2 | 240 | 6.62 | 1.052604 | 0.144570 |
3 | 144 | 3.92 | 1.052604 | 0.000000 |
4 | 100 | 9.04 | 1.052604 | 4.314023 |
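If the old index isn't worth keeping, drop=True discards it instead of turning it back into a column; a small non-destructive sketch:
df_no_y = df.set_index('y').reset_index(drop=True) # 'y' disappears; only x1, x2, x3 remain with a fresh RangeIndex
df_no_y.head()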
data = {
'Date': ['2024-01-01', '2024-02-15', '2024-03-30', '2024-04-10'],
'Sales': [10, 20, 30, 40]
}
df = pd.DataFrame(data)
print(df)
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])
# Extract the year from 'Date' and store it in a new column 'Year'
df['Year'] = df['Date'].dt.year
print(df)
         Date  Sales
0  2024-01-01     10
1  2024-02-15     20
2  2024-03-30     30
3  2024-04-10     40
        Date  Sales  Year
0 2024-01-01     10   2024
1 2024-02-15     20   2024
2 2024-03-30     30   2024
3 2024-04-10     40   2024
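The same .dt accessor exposes many other components; a quick sketch:
print(df['Date'].dt.month)      # 1, 2, 3, 4
print(df['Date'].dt.day_name()) # e.g. Monday, Thursday, ...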
df.to_csv('output.csv', index=False) # Save DataFrame to CSV without index
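A quick sanity check is to read the file straight back and compare shapes (assuming 'output.csv' was just written to the working directory):
df_check = pd.read_csv('output.csv')
print(df_check.shape) # should match df.shape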
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv')
fig, ax = plt.subplots(figsize=(4, 2))
df['x1'].plot(kind='line', ax=ax) # Line plot of a column
plt.show()
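Other plot kinds work the same way; here is a sketch of a histogram of the same column:
fig, ax = plt.subplots(figsize=(4, 2))
df['x1'].plot(kind='hist', bins=30, ax=ax) # distribution of x1
plt.show()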
# Create a sample DataFrame
data = {
'Date': ['2024-01-01', '2024-01-01', '2024-01-02', '2024-01-02', '2024-01-01'],
'Category': ['A', 'B', 'A', 'B', 'A'],
'Sales': [100, 150, 200, 250, 50]
}
df = pd.DataFrame(data)
print(df)
print("==================================")
# Create a pivot table
pivot_df = df.pivot_table(values='Sales', index='Date', columns='Category', aggfunc='sum', fill_value=0)
print(pivot_df)
         Date Category  Sales
0  2024-01-01        A    100
1  2024-01-01        B    150
2  2024-01-02        A    200
3  2024-01-02        B    250
4  2024-01-01        A     50
==================================
Category      A    B
Date
2024-01-01  150  150
2024-01-02  200  250
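pivot_table can also append grand totals with margins=True; a short sketch on the same data:
pivot_totals = df.pivot_table(values='Sales', index='Date', columns='Category',
                              aggfunc='sum', fill_value=0, margins=True) # adds an 'All' row and column
print(pivot_totals)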
df = pd.read_csv('data.csv')
print(df.head())
print("===============================================")
df['new_column'] = df['x1'].apply(lambda x: x * 2) # Apply a function to each element
df.head()
     y     x1        x2        x3
0  100   8.71  1.052604  0.389265
1   99   6.12  1.052604  0.000000
2  240   6.62  1.052604  0.144570
3  144   3.92  1.052604  0.000000
4  100   9.04  1.052604  4.314023
===============================================
| | y | x1 | x2 | x3 | new_column |
|---|---|---|---|---|---|
0 | 100 | 8.71 | 1.052604 | 0.389265 | 17.42 |
1 | 99 | 6.12 | 1.052604 | 0.000000 | 12.24 |
2 | 240 | 6.62 | 1.052604 | 0.144570 | 13.24 |
3 | 144 | 3.92 | 1.052604 | 0.000000 | 7.84 |
4 | 100 | 9.04 | 1.052604 | 4.314023 | 18.08 |
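For simple element-wise arithmetic like this, a vectorized expression is shorter and faster than apply; a sketch that produces the same column:
df['new_column'] = df['x1'] * 2 # vectorized equivalent of the lambda above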
# Inner merge: only the rows with keys common to both DataFrames remain
data1 = {
'common_column': [1, 2, 3, 4],
'value1': ['A', 'B', 'C', 'D']
}
data2 = {
'common_column': [3, 4, 5, 6],
'value2': ['W', 'X', 'Y', 'Z']
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
# Merge the two DataFrames on 'common_column'
df_merged = pd.merge(df1, df2, on='common_column')
print(df_merged)
   common_column value1 value2
0              3      C      W
1              4      D      X
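Other join types are selected with how; a sketch of an outer merge on the same frames, which keeps every key from both sides and fills the gaps with NaN:
df_outer = pd.merge(df1, df2, on='common_column', how='outer') # keys 1-6, NaN where a side has no match
print(df_outer)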
# Left join: rows of df2 whose index matches df1's 'key' column are added to df1; keys with no match get NaN
data1 = {
'key': [1, 2, 3, 4],
'value1': ['A', 'B', 'C', 'D']
}
data2 = {
'common_column': [3, 4, 5, 6],
'value2': ['W', 'X', 'Y', 'Z']
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
# Set 'common_column' as the index for df2
df2.set_index('common_column', inplace=True)
# Join df1 and df2 on 'key'
df_joined = df1.join(df2, on='key')
print(df_joined)
   key value1 value2
0    1      A    NaN
1    2      B    NaN
2    3      C      W
3    4      D      X
# Create a sample DataFrame in long format
data = {
'Category': ['Fruit', 'Fruit', 'Vegetable', 'Vegetable', 'Grain', 'Grain'],
'Subcategory': ['Apple', 'Banana', 'Carrot', 'Broccoli', 'Rice', 'Wheat'],
'Values': [10, 20, 30, 25, 15, 35]
}
df = pd.DataFrame(data)
print(df)
print("==============================================================")
# Pivot the DataFrame to reshape it from long to wide format
df_pivot = df.pivot(index='Category', columns='Subcategory', values='Values')
print(df_pivot)
    Category Subcategory  Values
0      Fruit       Apple      10
1      Fruit      Banana      20
2  Vegetable      Carrot      30
3  Vegetable    Broccoli      25
4      Grain        Rice      15
5      Grain       Wheat      35
==============================================================
Subcategory  Apple  Banana  Broccoli  Carrot  Rice  Wheat
Category
Fruit         10.0    20.0       NaN     NaN   NaN    NaN
Grain          NaN     NaN       NaN     NaN  15.0   35.0
Vegetable      NaN     NaN      25.0    30.0   NaN    NaN
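Note that pivot raises an error if any index/column pair occurs more than once; when duplicates are possible, a pivot_table with an aggregation function is the safer sketch:
df_safe = df.pivot_table(index='Category', columns='Subcategory', values='Values', aggfunc='sum') # aggregates duplicates instead of failing
print(df_safe)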