Pandas example

Pandas example

basic information

import pandas as pd

df.head(5)
df.tail(5)
df.sample(5)
df.dtypes
df.info()
df.shape
df.columns
df.values

df.describe()

import matplotlib.pyplot as plt

df['var_1'].hist(bins=20)
plt.show()

df.plot(x='var_1', y='var_2', kind='scatter')

series.plot(kind='bar')
plt.show()

select columns, filter rows

import pandas as pd

df['var_1']
df[['var_1', 'var_2']]

df[df['var_1'] > 10]
df[(df['var_1'] > 10) & (df['var_2'] > '2021-01-01')]

df.iloc[0:50, 0:4]
  • bracket + str = select column
  • bracket + list(str) = select columns
  • bracket + boolean mask (Series or list of bool) = filter rows (!)
  • iloc + bracket + row positions, column positions (slice or list(int)) = select rows and columns by position

add/drop column

import pandas as pd

df['var_3'] = df['var_1'] / df['var_2']

df.drop(columns=['var_1', 'var_2'])

sorting

import pandas as pd

df.sort_values('var_1')
df.sort_values('var_1', ascending=True)
df.sort_values(['var_1', 'var_2'], ascending=[True, False])

aggregating

import pandas as pd
import numpy as np

df['categorial_var_1'].value_counts(sort=True, normalize=True)

df.groupby('categorial_var_1')['var_1'].mean()
df.groupby('categorial_var_1')['var_1'].agg([np.mean, np.max])

df.groupby('categorial_var_1')[['var_1', 'var_2']].mean()
df.groupby('categorial_var_1')[['var_1', 'var_2']].agg([np.mean, np.max])

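# cf. pivot_table: the same kind of aggregation expressed as a pivot table
# values = column to aggregate
# index = first group-by column (becomes the row labels)
# columns = second group-by column (becomes the column labels)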
df.pivot_table(values='var_1', index='cate_var_1', aggfunc=[np.mean, np.max])
df.pivot_table(values='var_1', index='cate_var_1', columns='cate_var_2', aggfunc=[np.mean, np.max])

# cf. counting the occurrences of each category
df['category_col_1'].value_counts(sort=True)

missing values and duplicates

import pandas as pd

df.isna()
df.isna().sum()

df.fillna(0)
df.dropna()

df.drop_duplicates(subset=['col_1', 'col_2'])

I/O and type casting

import pandas as pd

df = pd.read_csv('input_file.csv')

df = df.astype({'var_1': 'int64', 'var_2': 'str'})

df.to_csv('output_file.csv')

create from dictionary or list

import pandas as pd

d = {'var_1': [1, 2], 'var_2': [3, 4]}
df = pd.DataFrame(data=d)

d = [{'var_1': 1, 'var_2': 3}, {'var_1': 2, 'var_2': 4} ]
df = pd.DataFrame(data=d)