Pandas example
basic information
import pandas as pd
# Quick inspection of an existing DataFrame `df`.
df.head(5)     # first 5 rows
df.tail(5)     # last 5 rows
df.sample(5)   # 5 randomly chosen rows
df.dtypes      # dtype of every column
df.info()      # prints a summary: index, columns, non-null counts, memory usage
df.shape       # (number of rows, number of columns)
df.columns     # column labels
df.to_numpy()  # data as a NumPy array; pandas recommends this over `df.values`
df.describe()  # summary statistics (numeric columns by default)
import matplotlib.pyplot as plt
# Histogram of a single column, split into 20 bins.
df['var_1'].hist(bins=20)
plt.show()  # display the open figure(s)
# Scatter plot with var_1 on the x-axis and var_2 on the y-axis.
df.plot(x='var_1', y='var_2', kind='scatter')
# Bar chart of a Series (`series` is assumed to be defined elsewhere).
series.plot(kind='bar')
plt.show()  # display the open figure(s)
select columns, filter rows
import pandas as pd
# Select one column by its label.
df['var_1']
# Select several columns by passing a list of labels.
df[['var_1', 'var_2']]
# Keep only the rows for which the boolean condition is True.
df[df['var_1'] > 10]
# Combine conditions element-wise with `&`; each condition needs its own parentheses.
df[(df['var_1'] > 10) & (df['var_2'] > '2021-01-01')]
# Select by integer position: rows 0-49 and columns 0-3 (slice end is excluded).
df.iloc[0:50, 0:4]
- bracket + str = select column
- bracket + list(str) = select columns
- bracket + list(bool) = select rows (!)
- iloc + bracket + slice/list(int), slice/list(int) = select rows and columns by integer position
add/drop column
import pandas as pd
# Add a column computed element-wise from two existing columns.
df['var_3'] = df['var_1'] / df['var_2']
# Drop columns by label; this returns a new DataFrame and leaves `df` unchanged.
df.drop(['var_1', 'var_2'], axis='columns')
sorting
import pandas as pd
# Sort rows by one column (ascending is the default direction).
df.sort_values(by='var_1')
# Same ordering with the direction spelled out.
df.sort_values(by='var_1', ascending=True)
# Sort by several columns, giving one direction per column.
df.sort_values(by=['var_1', 'var_2'], ascending=[True, False])
aggregating
import pandas as pd
import numpy as np
# Relative frequency of each category, most frequent first.
df['categorial_var_1'].value_counts(sort=True, normalize=True)
# Mean of one column per group.
df.groupby('categorial_var_1')['var_1'].mean()
# Several aggregations at once. String names are used because passing NumPy
# callables such as np.mean / np.max to agg() is deprecated in pandas >= 2.1.
df.groupby('categorial_var_1')['var_1'].agg(['mean', 'max'])
# Mean of several columns per group.
df.groupby('categorial_var_1')[['var_1', 'var_2']].mean()
# Several aggregations on several columns per group.
df.groupby('categorial_var_1')[['var_1', 'var_2']].agg(['mean', 'max'])
# cf. pivot_table
# values = agg-column
# index = groupby-column 1
# columns = groupby-column 2
df.pivot_table(values='var_1', index='cate_var_1', aggfunc=['mean', 'max'])
df.pivot_table(values='var_1', index='cate_var_1', columns='cate_var_2', aggfunc=['mean', 'max'])
# cf. counting the values of a categorical variable
df['category_col_1'].value_counts(sort=True)
missing values and duplicates
import pandas as pd
# Boolean mask: True wherever a value is missing.
df.isna()
# Number of missing values in each column.
df.isna().sum()
# Replace every missing value with 0.
df.fillna(value=0)
# Drop each row that contains at least one missing value.
df.dropna(axis=0, how='any')
# Drop rows that repeat the same (col_1, col_2) pair, keeping the first occurrence.
df.drop_duplicates(subset=['col_1', 'col_2'], keep='first')
io and type cast
import pandas as pd
# Load a CSV file into a DataFrame.
df = pd.read_csv('input_file.csv')
# Cast individual columns by mapping column name -> target dtype.
df = df.astype({'var_1': 'int64', 'var_2': 'str'})
# index=False keeps the row index out of the file; otherwise it is written as
# an extra unnamed first column that shows up as 'Unnamed: 0' when re-read.
df.to_csv('output_file.csv', index=False)
create from dictionary or list
import pandas as pd
# Column-oriented input: each key is a column label, each list holds that column's values.
d = {
    'var_1': [1, 2],
    'var_2': [3, 4],
}
df = pd.DataFrame(d)
# Row-oriented input: one dict per row, keyed by column label.
d = [
    {'var_1': 1, 'var_2': 3},
    {'var_1': 2, 'var_2': 4},
]
df = pd.DataFrame(d)