Here’s a list of 28 tips for using pandas in Python, along with examples. These tips cover a wide range of functionality to help you manipulate, analyze, and clean data more efficiently.
import pandas as pd
df = pd.read_csv('data.csv')
df.head(10) # Display first 10 rows
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
0 | 100 | 8.71 | 1.052604 | 0.389265 |
1 | 99 | 6.12 | 1.052604 | 0.000000 |
2 | 240 | 6.62 | 1.052604 | 0.144570 |
3 | 144 | 3.92 | 1.052604 | 0.000000 |
4 | 100 | 9.04 | 1.052604 | 4.314023 |
5 | 155 | 16.58 | 1.052604 | 0.360805 |
6 | 671 | 21.50 | 1.052604 | 0.759203 |
7 | 137 | 10.22 | 1.052604 | 0.000000 |
8 | 104 | 11.90 | 1.052604 | 11.865809 |
9 | 852 | 159.01 | 1.052604 | 63.898479 |
df.info() # Overview of the DataFrame
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   y       294 non-null    int64
 1   x1      294 non-null    float64
 2   x2      294 non-null    float64
 3   x3      294 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.3 KB
df.columns = ['col1', 'col2', 'col3', 'col4'] # Rename all columns
print(df.head())
df.rename(columns={'col1': 'new_col1'}, inplace=True) # Rename specific column
print(df.head())
   col1  col2      col3      col4
0   100  8.71  1.052604  0.389265
1    99  6.12  1.052604  0.000000
2   240  6.62  1.052604  0.144570
3   144  3.92  1.052604  0.000000
4   100  9.04  1.052604  4.314023
   new_col1  col2      col3      col4
0       100  8.71  1.052604  0.389265
1        99  6.12  1.052604  0.000000
2       240  6.62  1.052604  0.144570
3       144  3.92  1.052604  0.000000
4       100  9.04  1.052604  4.314023
df_filtered = df[df['col2'] > 20] # Filter rows where column > 20
print(df_filtered.head())
print('The shape of df: ',df.shape)
print('The shape of filtered df: ',df_filtered.shape)
    new_col1    col2      col3       col4
6        671   21.50  1.052604   0.759203
9        852  159.01  1.052604  63.898479
10       520   40.31  1.052604   8.828501
11       520   40.31  1.052604   8.828501
12       777   34.50  1.052604   3.495477
The shape of df:  (294, 4)
The shape of filtered df:  (157, 4)
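Multiple conditions can be combined as well; here is a small sketch on the same renamed columns, assuming you want both col2 and col4 constrained (each condition needs its own parentheses when using & or |):
df_both = df[(df['col2'] > 20) & (df['col4'] > 0)] # rows satisfying both conditions
print(df_both.shape)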
selected_df = df[['col2', 'col3']] # Select multiple columns
selected_df.head()
| | col2 | col3 |
|---|---|---|
0 | 8.71 | 1.052604 |
1 | 6.12 | 1.052604 |
2 | 6.62 | 1.052604 |
3 | 3.92 | 1.052604 |
4 | 9.04 | 1.052604 |
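As a quick sketch of the label- and position-based alternatives, loc and iloc select the same slice of this DataFrame:
df.loc[0:4, ['col2', 'col3']] # by label: rows 0-4 (inclusive) and the named columns
df.iloc[0:5, 1:3]             # by position: first five rows, columns 1 and 2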
check_df = df.isnull().sum() # Count missing values in each column.
check_df
| | 0 |
|---|---|
new_col1 | 0 |
col2 | 0 |
col3 | 0 |
col4 | 0 |
import numpy as np
# Create a sample DataFrame with missing values
data = {
'Name': ['Alice', 'Bob', 'Charlie'],
'Age': [25, np.nan, 30],
'Score': [85.5, 90.0, np.nan]
} # data is a dictionary.
df = pd.DataFrame(data)
print(df.head())
print("=========================")
df.fillna(0, inplace=True) # Fill NaN with 0; inplace=True modifies df directly instead of returning a new DataFrame
print(df.head())
      Name   Age  Score
0    Alice  25.0   85.5
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
=========================
      Name   Age  Score
0    Alice  25.0   85.5
1      Bob   0.0   90.0
2  Charlie  30.0    0.0
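Zeros are not always the right placeholder. fillna also accepts a dict or Series of per-column values, so a quick sketch using the same sample data fills each numeric column with its own mean:
df = pd.DataFrame(data)                    # rebuild the sample with NaNs
fill_values = df[['Age', 'Score']].mean()  # per-column means: Age 27.5, Score 87.75
df.fillna(fill_values, inplace=True)       # each column gets its own fill value
print(df)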
data = {
'Name': ['Alice', 'Bob', 'Charlie'],
'Age': [25, np.nan, 30],
'Score': [85.5, 90.0, np.nan]
} # data is a dictionary.
df = pd.DataFrame(data)
print(df.head())
print("=========================")
df.dropna(inplace=True) # Remove rows with any missing data
print(df.head())
      Name   Age  Score
0    Alice  25.0   85.5
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
=========================
    Name   Age  Score
0  Alice  25.0   85.5
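dropna can also be limited to particular columns with subset, so rows are only removed when those columns are missing; a small sketch on the same sample:
df = pd.DataFrame(data)                  # rebuild the sample with NaNs
df_age_known = df.dropna(subset=['Age']) # drop a row only if 'Age' is missing
print(df_age_known)                      # keeps Alice and Charlie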
df = pd.read_csv('data.csv')
df.describe() # Summary statistics for numerical columns
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
count | 294.000000 | 294.000000 | 294.000000 | 294.000000 |
mean | 252.408163 | 28.940299 | 0.910343 | 7.712961 |
std | 224.354539 | 30.033040 | 0.382094 | 28.505172 |
min | 9.000000 | 3.480000 | 0.587929 | 0.000000 |
25% | 116.000000 | 12.998750 | 0.634467 | 0.000000 |
50% | 179.500000 | 20.913000 | 0.877854 | 0.108644 |
75% | 334.750000 | 31.735000 | 0.937007 | 5.051179 |
max | 1778.000000 | 291.650000 | 1.954588 | 419.122550 |
df_count = df['x3'].value_counts() # Count occurrences of unique values in a column
df_count.head()
| x3 | count |
|---|---|
0.000000 | 141 |
8.828501 | 2 |
7.476962 | 1 |
20.092479 | 1 |
2.200531 | 1 |
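Passing normalize=True turns the counts into relative frequencies, which is often easier to read; a quick sketch on the same column:
df['x3'].value_counts(normalize=True).head() # e.g. 0.0 makes up about 141/294 ≈ 0.48 of the rows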
df['new_x'] = df['x1'] + df['x2'] # Create a new column as the sum of two columns
df.head()
| | y | x1 | x2 | x3 | new_x |
|---|---|---|---|---|---|
0 | 100 | 8.71 | 1.052604 | 0.389265 | 9.762604 |
1 | 99 | 6.12 | 1.052604 | 0.000000 | 7.172604 |
2 | 240 | 6.62 | 1.052604 | 0.144570 | 7.672604 |
3 | 144 | 3.92 | 1.052604 | 0.000000 | 4.972604 |
4 | 100 | 9.04 | 1.052604 | 4.314023 | 10.092604 |
df = pd.read_csv('data.csv')
print(df.info())
df.drop_duplicates(inplace=True) # Remove duplicate rows
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   y       294 non-null    int64
 1   x1      294 non-null    float64
 2   x2      294 non-null    float64
 3   x3      294 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 9.3 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 293 entries, 0 to 293
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   y       293 non-null    int64
 1   x1      293 non-null    float64
 2   x2      293 non-null    float64
 3   x3      293 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 11.4 KB
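By default every column must match for a row to count as a duplicate. A short sketch of the subset and keep parameters, which compare only the chosen columns and keep the last copy instead of the first:
df_dedup = df.drop_duplicates(subset=['x2', 'x3'], keep='last') # duplicates judged on x2 and x3 only
print(df_dedup.shape)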
df_sorted = df.sort_values('x1', ascending=False) # Sort by column in descending order
df_sorted.head()
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
16 | 1440 | 291.650 | 1.052604 | 66.268072 |
135 | 1778 | 224.800 | 0.634467 | 25.885085 |
9 | 852 | 159.010 | 1.052604 | 63.898479 |
247 | 403 | 149.229 | 0.761814 | 2.219219 |
246 | 601 | 124.946 | 0.761814 | 3.257444 |
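Sorting by several columns works the same way, with one ascending flag per column; a short sketch:
df_multi_sorted = df.sort_values(['x2', 'x1'], ascending=[True, False]) # x2 ascending, ties broken by x1 descending
df_multi_sorted.head()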
data = {
'Category': ['A', 'A', 'A', 'B', 'B', 'B'],
'Values': [10, 20, 30, 40, 50, 60]
}
df = pd.DataFrame(data)
# Group by 'Category' and calculate the sum of 'Values'
print(df)
grouped_df = df.groupby('Category')['Values'].sum().reset_index()
grouped_df
  Category  Values
0        A      10
1        A      20
2        A      30
3        B      40
4        B      50
5        B      60
| | Category | Values |
|---|---|---|
0 | A | 60 |
1 | B | 150 |
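Several aggregations can be computed per group in one call with agg; a sketch on the same grouped data:
df.groupby('Category')['Values'].agg(['sum', 'mean', 'max']).reset_index() # one column per aggregation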
# Convert a column to integer. astype(int) simply drops the fractional part
# (it truncates toward zero), so 8.71 becomes 8.
df = pd.read_csv('data.csv')
print(df.head())
df['x1'] = df['x1'].astype(int)
df.head()
     y     x1        x2        x3
0  100   8.71  1.052604  0.389265
1   99   6.12  1.052604  0.000000
2  240   6.62  1.052604  0.144570
3  144   3.92  1.052604  0.000000
4  100   9.04  1.052604  4.314023
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
0 | 100 | 8 | 1.052604 | 0.389265 |
1 | 99 | 6 | 1.052604 | 0.000000 |
2 | 240 | 6 | 1.052604 | 0.144570 |
3 | 144 | 3 | 1.052604 | 0.000000 |
4 | 100 | 9 | 1.052604 | 4.314023 |
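If you want conventional rounding rather than truncation, round before converting; a sketch that re-reads the same file:
df_rounded = pd.read_csv('data.csv')
df_rounded['x1'] = df_rounded['x1'].round().astype(int) # 8.71 -> 9 instead of 8
df_rounded.head()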
df = pd.read_csv('data.csv')
print(df.head())
df.drop('x1', axis=1, inplace=True) # Drop a column
df.head()
     y     x1        x2        x3
0  100   8.71  1.052604  0.389265
1   99   6.12  1.052604  0.000000
2  240   6.62  1.052604  0.144570
3  144   3.92  1.052604  0.000000
4  100   9.04  1.052604  4.314023
| | y | x2 | x3 |
|---|---|---|---|
0 | 100 | 1.052604 | 0.389265 |
1 | 99 | 1.052604 | 0.000000 |
2 | 240 | 1.052604 | 0.144570 |
3 | 144 | 1.052604 | 0.000000 |
4 | 100 | 1.052604 | 4.314023 |
df = pd.read_csv('data.csv')
print(df.head())
df.set_index('y', inplace=True) # Set the 'y' column as the index
df.head()
     y     x1        x2        x3
0  100   8.71  1.052604  0.389265
1   99   6.12  1.052604  0.000000
2  240   6.62  1.052604  0.144570
3  144   3.92  1.052604  0.000000
4  100   9.04  1.052604  4.314023
| y | x1 | x2 | x3 |
|---|---|---|---|
100 | 8.71 | 1.052604 | 0.389265 |
99 | 6.12 | 1.052604 | 0.000000 |
240 | 6.62 | 1.052604 | 0.144570 |
144 | 3.92 | 1.052604 | 0.000000 |
100 | 9.04 | 1.052604 | 4.314023 |
print(df.head())
df.reset_index(drop=False, inplace=True) # Reset the index; drop=False keeps the old index ('y') as a regular column
df.head()
       x1        x2        x3
y
100  8.71  1.052604  0.389265
99   6.12  1.052604  0.000000
240  6.62  1.052604  0.144570
144  3.92  1.052604  0.000000
100  9.04  1.052604  4.314023
| | y | x1 | x2 | x3 |
|---|---|---|---|---|
0 | 100 | 8.71 | 1.052604 | 0.389265 |
1 | 99 | 6.12 | 1.052604 | 0.000000 |
2 | 240 | 6.62 | 1.052604 | 0.144570 |
3 | 144 | 3.92 | 1.052604 | 0.000000 |
4 | 100 | 9.04 | 1.052604 | 4.314023 |
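If the old index isn't worth keeping, drop=True discards it instead of turning it back into a column; a small non-destructive sketch:
df_no_y = df.set_index('y').reset_index(drop=True) # 'y' disappears; only x1, x2, x3 remain with a fresh RangeIndex
df_no_y.head()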
data = {
'Date': ['2024-01-01', '2024-02-15', '2024-03-30', '2024-04-10'],
'Sales': [10, 20, 30, 40]
}
df = pd.DataFrame(data)
print(df)
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])
# Extract the year from 'Date' and store it in a new column 'Year'
df['Year'] = df['Date'].dt.year
print(df)
         Date  Sales
0  2024-01-01     10
1  2024-02-15     20
2  2024-03-30     30
3  2024-04-10     40
        Date  Sales  Year
0 2024-01-01     10   2024
1 2024-02-15     20   2024
2 2024-03-30     30   2024
3 2024-04-10     40   2024
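The same .dt accessor exposes many other components; a quick sketch:
print(df['Date'].dt.month)      # 1, 2, 3, 4
print(df['Date'].dt.day_name()) # e.g. Monday, Thursday, ...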
df.to_csv('output.csv', index=False) # Save DataFrame to CSV without index
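A quick sanity check is to read the file straight back and compare shapes (assuming 'output.csv' was just written to the working directory):
df_check = pd.read_csv('output.csv')
print(df_check.shape) # should match df.shape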
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv')
fig, ax = plt.subplots(figsize=(4, 2))
df['x1'].plot(kind='line', ax=ax) # Line plot of a column
plt.show()
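Other plot kinds work the same way; here is a sketch of a histogram of the same column:
fig, ax = plt.subplots(figsize=(4, 2))
df['x1'].plot(kind='hist', bins=30, ax=ax) # distribution of x1
plt.show()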
# Create a sample DataFrame
data = {
'Date': ['2024-01-01', '2024-01-01', '2024-01-02', '2024-01-02', '2024-01-01'],
'Category': ['A', 'B', 'A', 'B', 'A'],
'Sales': [100, 150, 200, 250, 50]
}
df = pd.DataFrame(data)
print(df)
print("==================================")
# Create a pivot table
pivot_df = df.pivot_table(values='Sales', index='Date', columns='Category', aggfunc='sum', fill_value=0)
print(pivot_df)
         Date Category  Sales
0  2024-01-01        A    100
1  2024-01-01        B    150
2  2024-01-02        A    200
3  2024-01-02        B    250
4  2024-01-01        A     50
==================================
Category      A    B
Date
2024-01-01  150  150
2024-01-02  200  250
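pivot_table can also append grand totals with margins=True; a short sketch on the same data:
pivot_totals = df.pivot_table(values='Sales', index='Date', columns='Category',
                              aggfunc='sum', fill_value=0, margins=True) # adds an 'All' row and column
print(pivot_totals)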
df = pd.read_csv('data.csv')
print(df.head())
print("===============================================")
df['new_column'] = df['x1'].apply(lambda x: x * 2) # Apply a function to each element
df.head()
     y     x1        x2        x3
0  100   8.71  1.052604  0.389265
1   99   6.12  1.052604  0.000000
2  240   6.62  1.052604  0.144570
3  144   3.92  1.052604  0.000000
4  100   9.04  1.052604  4.314023
===============================================
| | y | x1 | x2 | x3 | new_column |
|---|---|---|---|---|---|
0 | 100 | 8.71 | 1.052604 | 0.389265 | 17.42 |
1 | 99 | 6.12 | 1.052604 | 0.000000 | 12.24 |
2 | 240 | 6.62 | 1.052604 | 0.144570 | 13.24 |
3 | 144 | 3.92 | 1.052604 | 0.000000 | 7.84 |
4 | 100 | 9.04 | 1.052604 | 4.314023 | 18.08 |
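For simple element-wise arithmetic like this, a vectorized expression is shorter and faster than apply; a sketch that produces the same column:
df['new_column'] = df['x1'] * 2 # vectorized equivalent of the lambda above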
# Inner merge: only the rows with keys common to both DataFrames remain
data1 = {
'common_column': [1, 2, 3, 4],
'value1': ['A', 'B', 'C', 'D']
}
data2 = {
'common_column': [3, 4, 5, 6],
'value2': ['W', 'X', 'Y', 'Z']
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
# Merge the two DataFrames on 'common_column'
df_merged = pd.merge(df1, df2, on='common_column')
print(df_merged)
   common_column value1 value2
0              3      C      W
1              4      D      X
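Other join types are selected with how; a sketch of an outer merge on the same frames, which keeps every key from both sides and fills the gaps with NaN:
df_outer = pd.merge(df1, df2, on='common_column', how='outer') # keys 1-6, NaN where a side has no match
print(df_outer)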
# Left join: rows of df2 whose index matches df1's 'key' column are added to df1; keys with no match get NaN
data1 = {
'key': [1, 2, 3, 4],
'value1': ['A', 'B', 'C', 'D']
}
data2 = {
'common_column': [3, 4, 5, 6],
'value2': ['W', 'X', 'Y', 'Z']
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
# Set 'common_column' as the index for df2
df2.set_index('common_column', inplace=True)
# Join df1 and df2 on 'key'
df_joined = df1.join(df2, on='key')
print(df_joined)
   key value1 value2
0    1      A    NaN
1    2      B    NaN
2    3      C      W
3    4      D      X
# Create a sample DataFrame in long format
data = {
'Category': ['Fruit', 'Fruit', 'Vegetable', 'Vegetable', 'Grain', 'Grain'],
'Subcategory': ['Apple', 'Banana', 'Carrot', 'Broccoli', 'Rice', 'Wheat'],
'Values': [10, 20, 30, 25, 15, 35]
}
df = pd.DataFrame(data)
print(df)
print("==============================================================")
# Pivot the DataFrame to reshape it from long to wide format
df_pivot = df.pivot(index='Category', columns='Subcategory', values='Values')
print(df_pivot)
    Category Subcategory  Values
0      Fruit       Apple      10
1      Fruit      Banana      20
2  Vegetable      Carrot      30
3  Vegetable    Broccoli      25
4      Grain        Rice      15
5      Grain       Wheat      35
==============================================================
Subcategory  Apple  Banana  Broccoli  Carrot  Rice  Wheat
Category
Fruit         10.0    20.0       NaN     NaN   NaN    NaN
Grain          NaN     NaN       NaN     NaN  15.0   35.0
Vegetable      NaN     NaN      25.0    30.0   NaN    NaN
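Note that pivot raises an error if any index/column pair occurs more than once; when duplicates are possible, a pivot_table with an aggregation function is the safer sketch:
df_safe = df.pivot_table(index='Category', columns='Subcategory', values='Values', aggfunc='sum') # aggregates duplicates instead of failing
print(df_safe)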