Some notes to remind me how to do things in Pandas.
Version
import pandas as pd
pd.__version__
Creating Dataframes
df = pd.DataFrame()
topics = ["school", "school", "business"]
words = ["book", "study", "dollar"]
weights = [4,3,7]
df_topics = pd.DataFrame({
    "topic": topics,
    "word": words,
    "weight": weights
})

topics = [["school", "book", 4],
          ["school", "study", 3],
          ["business", "dollar", 7]]
pd.DataFrame(topics, columns=["topic", "word", "significance"])
load only part of data: nrows
pd.read_csv("timeseries.csv", nrows=1000)
load only part of data: columns
pd.read_csv("timeseries.csv", usecols=["c1", "c2"]])
get column names from a large file:
df = pd.read_csv("timeseries.csv", nrows=5)
cols = df.columns
type_counts = df.dtypes.value_counts()
set datatypes on load for all columns:
pd.read_csv("timeseries.csv", dtype="float32")
set datatypes on load for specific columns:
pd.read_csv("timeseries.csv", dtype={"c1":str, "c2":int}])
set specific value as NaN ("?" here):
pd.read_csv("timeseries.csv", na_values=["?"], dtype={"c1":str, "c2":int}])
Selecting
select multiple columns as a dataframe from a bigger dataframe:
df2 = df[['Id', 'team', 'winPlacePerc']]
select a single column as a dataframe:
df2 = df[['name']]
in the above, double square brackets make the result a dataframe, single brackets make it a series
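a quick sketch of the difference, assuming df has a "name" column:
type(df['name'])    # pandas.core.series.Series
type(df[['name']])  # pandas.core.frame.DataFrame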
Selecting a single column by column names:
columnsData = df['Age']
columnsData = df.loc[ : , 'Age' ]
Select all but single column:
df_train = df_train.drop("Survived", axis=1)
df_train.loc[:, df_train.columns != 'Survived']
Select all but multiple columns:
df_train = df_train.drop(["Survived", "Pclass"], axis=1)
Select rows by index in list:
top10 = df_top_misses.iloc[0:10].index
df_train.iloc[top10]
select by datatype:
df.select_dtypes(include=['float64', 'int64'])
pandas axis:
axis 1 = columns, axis 0 = rows
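a minimal sketch of what the axis means for aggregations, using an assumed toy dataframe:
df_ax = pd.DataFrame({"a": [1, 2], "b": [10, 20]})
df_ax.sum(axis=0)  # collapse the rows -> one sum per column: a=3, b=30
df_ax.sum(axis=1)  # collapse the columns -> one sum per row: 11, 22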
get a series from a dataframe column filtered by value:
min_weight = df_topics["weight"].min()
df_topics[df_topics["weight"] == min_weight]
Show all rows with duplicates:
df = df_topics
#this requires both topic and word to be duplicate
df[df.duplicated(['topic', "word"],keep=False)]
Show all duplicates except the first or last occurrence of each (keep=False above shows all duplicate rows):
df[df.duplicated(['key1','key2'],keep='first')]
df[df.duplicated(['key1','key2'],keep='last')]
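a small sketch of the difference, using the df_topics example from above with one row duplicated on purpose (the appended row is an assumption for illustration):
df_dup = pd.concat([df_topics, df_topics.iloc[[0]]])
df_dup[df_dup.duplicated(['topic', 'word'], keep=False)]    # both copies of the ("school", "book") row
df_dup[df_dup.duplicated(['topic', 'word'], keep='first')]  # only the second copy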
Count duplicates:
df.duplicated().sum()
df.duplicated(['topic', "word"],keep=False).sum()
Drop all rows where the line_race column is zero (keep only non-zero rows):
df = df[df.line_race != 0]
Drop all rows where the line_race column is not zero (keep only zero rows):
df = df[df.line_race == 0]
Accessing Values
set/reset how many rows pandas prints with df.head(), iloc[], etc. before clipping the display.
pd.set_option('display.max_rows', 5) #show specific count
pd.set_option('display.max_rows', None) #show all
pd.reset_option('display.max_rows') #to reset one
pd.reset_option('all') #to reset all
Make it local only:
with pd.option_context('display.max_rows', 500):
    print(df.head(100))
find the row at position 1111 in the dataframe (iloc is positional):
df.iloc[1111]
get value in specific row and cell:
df["topic"].values[0]
find the row with index label 1111 in the dataframe:
df.loc[1111]
select multiple ranges of rows:
def ranges(*args):
    indices = []
    for rt in args:
        indices += list(range(rt[0], rt[1]))
    return indices
df.iloc[ranges((0,10), (90,110))]
or just
df.iloc[[1,3,0]]
set value in cell (stackoverflow):
df.at[555, "visited"] = True
#555 here refers to row index
modify row while iterating over it:
for index, row in df.iterrows():
    df.at[index, 'topic'] = "new topic"
select 55th column:
df[df.columns[55]]
drop last 5 columns
df.drop(df.columns[-5:], axis=1)
Aggregates
calculate the mean of the 'team' and 'winPlacePerc' columns, after grouping by group id and match id:
agg_cols = ['groupId', 'matchId', 'team', 'winPlacePerc']
df_mean = df_test[agg_cols].groupby(["groupId", "matchId"],
as_index=False).mean().add_suffix("_mean")
run multiple such aggregations at once:
agg = df_train.groupby(['matchId']).agg(
    {'players_in_team': ['min', 'max', 'mean', 'median']})
specify the name suffixes of the generated aggregation column names:
agg_train = df_train[agg_cols].groupby(
    ["groupId", "matchId"], as_index=False).agg(
    [('_min', 'min'), ('_max', 'max'), ('_mean', 'mean')])
multiple aggregations will create a 2-level column header (see df.head()). to change it into one level (for merge etc):
mi = agg_train.columns #mi for multi-index
ind = pd.Index([e[0] + e[1] for e in mi.tolist()])
agg_train.columns = ind
custom aggregation function:
def q90(x):
    return x.quantile(0.9)

agg = df_train.groupby(['matchId']).agg(
    {'players_in_team': ['min', 'max', 'mean', 'median', q90]})
create new column as number of rows in a group:
agg = df_train.groupby(['matchId']).size().to_frame('players_in_match')
group and sum column for groups:
revenues = (df_train.groupby("geoNetwork_subContinent")['totals_transactionRevenue']
            .sum().sort_values(ascending=False))
Merge
pandas concatenate dataframes (axis=1 is columns, so concat sideways):
pd.concat([df_a,df_b], axis=1)
merge two dataframes (here df_train and agg) by a single column:
df_train = df_train.merge(agg, how='left', on=['groupId'])
merge on multiple columns:
dfg_train = dfg_train.merge(agg_train, how='left',
on=["groupId", "matchId"])
set merge suffixes = ["", "_rank"] <- left and right side on overlapping columns
dfg_test = dfg_test.merge(agg_test_rank, suffixes=["", "_rank"],
                          how='left', on=['groupId', "matchId"])
above sets columns from dfg_test to have no suffix ("") and columns from agg_test_rank to have suffix "_rank"
combine two columns based on a condition (element-wise, using numpy):
df['weight'] = np.where(df['weight'] < 5, df["topic"], df["weight"])
above sets the value to topic where weight is below five, and keeps weight otherwise.
merge by defining the column names to match on left and right:
pd.merge(left, right, left_on='key1', right_on='key2')
merge types (the how= parameter in pd.merge):
- inner: keep rows that match in both left and right
- outer: keep all rows in both left and right
- left: keep all rows from left and matching ones from right
- right: keep all rows from right and matching ones from left
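a small sketch of how these differ, with two assumed toy dataframes:
left = pd.DataFrame({"key": ["a", "b"], "x": [1, 2]})
right = pd.DataFrame({"key": ["b", "c"], "y": [3, 4]})
pd.merge(left, right, how="inner", on="key")  # only key "b"
pd.merge(left, right, how="outer", on="key")  # keys "a", "b", "c"; missing values become NaN
pd.merge(left, right, how="left", on="key")   # keys "a", "b"; "a" gets NaN for y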
Rename, Delete, Compare Dataframes
rename columns to remove a suffix (here remove _mean):
df_mean = df_mean.rename(columns={'groupId_mean': 'groupId',
'matchId_mean': 'matchId'})
delete dataframes to save memory:
del df_agg
find all columns in one dataframe but not in another
diff_set = set(train_df.columns).difference(set(test_df.columns))
print(diff_set)
find all columns in both dataframes and drop them
common_set = set(train_df.columns).intersection(set(df2.columns))
train_df = train_df.drop(common_set, axis=1)
find all columns not in both dataframes and drop them
common_set = set(df_train.columns).intersection(set(df_test.columns))
df_train = df_train[list(common_set)]
drop rows with index in list (stackoverflow):
df.drop(df.index[[1,3]])
replace nulls/nans with 0:
X = X.fillna(0)
X_test = X_test.fillna(0)
only for specific columns:
df[['a', 'b']] = df[['a','b']].fillna(value=0)
drop nan, rows with null values for specific column:
df_train.dropna(subset=['winPlacePerc'], inplace=True)
drop columns B and C:
df.drop(['B', 'C'], axis=1)
df.drop(['B', 'C'], axis=1, inplace = True)
Dataframe Statistics
this df has 800k rows (values) and 999 columns (features):
df_train_subset.shape
(800000, 999)
data types:
df_train.dtypes
number of rows, columns, memory size (light, fast):
df.info()
statistics per feature/column (cpu/mem heavy):
df.describe()
number of unique values:
df.nunique()
bounds:
df.min()
df.max()
replace infinity with 0. esp scalers can have issues with inf:
df[col] = df[col].replace(np.inf, 0)
replace positive and negative inf with nan:
df_pct.replace([np.inf, -np.inf], np.nan)
number of non-nulls per row in dataframe:
df_non_null = df.notnull().sum(axis=1)
find the actual rows with null values:
df_train[df_train.isnull().any(axis=1)]
print columns with nulls in them
nan_cols = df.columns[df.isna().any()].tolist()
nan_cols
number of times different values appear in column:
df_train["device_operatingSystem"].value_counts()
sorted by index (e.g., 0,1,2,… or "school", "business")
phase_counts = df["phase"].value_counts().sort_index()
sorted by their value counts:
phase_counts = df["phase"].value_counts().sort_values()
number of nans per column in dataframe:
df.isnull().sum()
find columns with only one unique value
const_cols = [c for c in train_df.columns
if train_df[c].nunique(dropna=False)==1 ]
drop them
train_df = train_df.drop(const_cols, axis=1)
replace outlier values over 3std with 3std:
outlier_thresholds = {}

def remove_outliers(df, col):
    upper = df[col].mean() + 3*df[col].std()
    if col in outlier_thresholds:
        upper = outlier_thresholds[col]
    else:
        outlier_thresholds[col] = upper
    mask = np.abs(df[col] - df[col].mean()) > (3*df[col].std())
    df.loc[mask, col] = upper
upper = (df_train['totals_transactionRevenue'].mean()
         + 3*df_train['totals_transactionRevenue'].std())
mask = (np.abs(df_train['totals_transactionRevenue']
               - df_train['totals_transactionRevenue'].mean())
        > (3*df_train['totals_transactionRevenue'].std()))
df_train.loc[mask, 'totals_transactionRevenue'] = upper
or use zscore:
df['zscore'] = (df.a - df.a.mean())/df.a.std(ddof=0)
from scipy (stackoverflow)
from scipy import stats
import numpy as np
z = np.abs(stats.zscore(boston_df))
print(z)
show groupby object data statistics for each column by grouped element:
grouped.describe()
create a dataframe of feature names and classifier feature importances (where supported), sorted by weight:
df_feats = pd.DataFrame()
df_feats["names"] = X.columns
df_feats["weights"] = clf.feature_importances_
df_feats.sort_values(by="weights")
create diff columns showing the absolute change from the previous row:
for col in input_col_names:
    df_diff[col] = df_sig[col].diff().abs()
unique datatypes in dataframe:
train_df.dtypes.unique()
Numpy
Get numpy matrix from dataframe:
data_diffs = df_diff.values
get a column from a numpy matrix as a 1-D array:
for sig in range(0, 3):
    data_sig = data_measure[:, sig]
slice numpy array:
bin_data_raw = data_sig[i:i + bin_size]
calculate statistics over numpy array:
bin_avg_raw = bin_data_raw.mean()
bin_sum_raw = bin_data_raw.sum()
bin_std_raw = bin_data_raw.std()
bin_percentiles = np.percentile(bin_data_raw, [0, 1, 25, 50, 75, 99, 100])
bin_range = bin_percentiles[-1] - bin_percentiles[0]
bin_rel_perc = bin_percentiles - bin_avg_raw
count outliers at different scales:
outliers = []
for r in range(1, 7):
    t = bin_std_diff * r
    o_count = np.sum(np.abs(bin_data_diff - bin_avg_diff) >= t)
    outliers.append(o_count)
concatenate multiple numpy arrays into one (rows=axis 0, cols=axis 1):
bin_row = np.concatenate([raw_features, diff_features, bin_percentiles, bin_rel_perc, outliers])
limit values between min/max:
df_sig.clip(upper=127, lower=-127)
value range in column (ptp = peak to peak):
df.groupby('GROUP')['VALUE'].agg(np.ptp)
replace nan:
my_arr[np.isnan(my_arr)] = 0
disable e-notation (scientific notation, decimals, fractions):
np.set_printoptions(suppress=True)
#above only works for numpy arrays, not specific numpy values, so this should work:
print(np.array([0.111111111]))
#but not this
print(np.float32(0.111111111))
disable e-notation and set precision locally:
with np.printoptions(precision=20, suppress=True):
    print(np.array([d_min, d_max, d_mean, d_med]))
shuffle two+ arrays: #https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=0)
reshape numpy array:
my_arr.reshape((2096, 160, 22))  # reshape(2096, 160, 22) also works; numpy accepts the shape as a tuple or as separate arguments
Time-Series
create values for percentage changes over time (row to row):
df_pct = pd.DataFrame()
for col in df_train_subset.columns[:30]:
    df_pct['pct_chg_' + col] = df_train_subset[col].pct_change().abs()
pct_change() gives the fractional change from the previous row (0.1 = 10%); abs() makes it absolute, useful when only the size of the change matters, not whether it was negative or positive.
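a tiny sketch of what pct_change() returns, on an assumed example series:
s = pd.Series([100, 110, 99])
s.pct_change()        # NaN, 0.10, -0.10  (change relative to the previous row)
s.pct_change().abs()  # NaN, 0.10, 0.10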
also possible to set number of rows to count pct_change over:
df_pct['pct_chg_' + col] = df_train_subset[col].pct_change(periods=10)
pct_change on a set of items with specific values:
df['pct_chg_open1'] = (df.groupby('assetCode')['open']
                       .apply(lambda x: x.pct_change()))
TODO: try different ways to calculate ewm to see how this all works ([stackoverflow](https://stackoverflow.com/questions/37924377/does-pandas-calculate-ewm-wrong)):
df["close_ewma_10"] = df.groupby('assetName')['pct_chg_close1']
.apply(lambda x: x.ewm(span=10).mean())
shift by 10 backwards
y = mt_df["mket_close_10"].shift(-10)
Date Types
convert seconds into datetime
start_col = pd.to_datetime(df_train.visitStartTime, unit='s')
parse specific columns as specific date types:
df_train = pd.read_csv("../input/train-flat.csv.gz",
parse_dates=["date"],
dtype={'fullVisitorId': 'str',
'date': 'str'
},
)
or if need to parse specific format:
pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
access elements of datetime objects:
data_df["weekday"] = data_df['date'].dt.weekday
data_df["hour"] = data_df['date'].dt.hour
Data Type Manipulations
one-hot (one hot, onehot) encode / convert categorical:
dfg_train = pd.get_dummies( dfg_train, columns = ['matchType'] )
dfg_test = pd.get_dummies( dfg_test, columns = ['matchType'] )
multiple columns:
pd.get_dummies(data=df, columns=['A', 'B'])
set category value by range:
#here 1 = solo game, 2 = duo game, 3 = squad, 4 = custom
df_train['team'] = [1 if i == 1 else 2 if i == 2 else 4 if i > 4 else 3
                    for i in df_train["players_in_team_q90"]]
calculate value over two columns and make it a new column:
dfg_train['killsNorm'] = (dfg_train['kills_mean']
                          * ((100 - dfg_train['players_in_match'])/100 + 1))
data_df['hit_view_ratio'] = data_df['totals_pageviews'] / data_df['totals_hits']
mark a set of columns as category type:
for col in X_cat_cols:
    df_train[col] = df_train[col].astype('category')
set category value based on set of values in column:
X_test['matchType'] = X_test['matchType'].astype('str')
X_test.loc[X_test['matchType'] == "squad-fpp", 'matchType'] = "squad"
X_test['matchType'] = X_test['matchType'].astype('category')
how to get the indices from a dataframe as a list (e.g., collect a subset):
list(outlier_collection.index.values)
to see it all on one line in Jupyter (easier copying, e.g. to drop all in list):
print(list(outlier_collection.index.values))
drop all rows where a field is empty:
mail_df.drop(mail_df.query(
    "body == '' | to == '' | from_ == ''"
).index, inplace=True)
Multiprocessing
https://stackoverflow.com/questions/36794433/python-using-multiprocessing-on-a-pandas-dataframe
http://www.racketracer.com/2016/07/06/pandas-in-parallel/
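a minimal sketch of the pattern described in those links, assuming a dataframe with a "value" column; process_chunk and parallelize_dataframe are illustrative names, not a pandas API:
import numpy as np
import pandas as pd
from multiprocessing import Pool

def process_chunk(chunk):
    # placeholder per-chunk work; replace with the real logic
    chunk["value_doubled"] = chunk["value"] * 2
    return chunk

def parallelize_dataframe(df, func, n_cores=4):
    # split the dataframe row-wise into n_cores chunks, map them over a pool,
    # and glue the processed chunks back together
    chunks = np.array_split(df, n_cores)
    with Pool(n_cores) as pool:
        result = pd.concat(pool.map(func, chunks))
    return result

# df = parallelize_dataframe(df, process_chunk)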
Label transform Pandas column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["topic"] = le.fit_transform(df["topic"])
#if needed in two parts:
#df_test["Sex"] = le.transform(df_test["Sex"])
get the mappings generated by the labelencoder:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)
le_value_mapping = dict(zip(le.transform(le.classes_), le.classes_))
Memory Reducer (From Kaggler):
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object and col_type.name != 'category':
            #print(col_type)
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
print a full dataframe without truncation:
#https://stackoverflow.com/questions/25351968/how-to-display-full-non-truncated-dataframe-information-in-html-when-convertin
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', None)  # None instead of -1 for newer pandas versions
    print(x)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')