Pandas

Some notes to remind me how to do things in Pandas.

Version

  import pandas as pd

  pd.__version__

Creating Dataframes

  # empty dataframe
  df = pd.DataFrame()

  # from column lists
  topics = ["school", "school", "business"]
  words = ["book", "study", "dollar"]
  weights = [4, 3, 7]

  df_topics = pd.DataFrame({
      "topic": topics,
      "word": words,
      "weight": weights
  })

  # from rows, with explicit column names
  topics = [["school", "book", 4],
            ["school", "study", 3],
            ["business", "dollar", 7]]
  pd.DataFrame(topics, columns=["topic", "word", "significance"])

load only part of data: nrows

  pd.read_csv("timeseries.csv", nrows=1000)

load only part of data: columns

  pd.read_csv("timeseries.csv", usecols=["c1", "c2"])

get column names from a large file:

  df = pd.read_csv("timeseries.csv", nrows=5)
  cols = df.columns
  type_counts = df.dtypes.value_counts()

set datatypes on load for all columns:

  pd.read_csv("timeseries.csv", dtype="float32")

set datatypes on load for specific columns:

  pd.read_csv("timeseries.csv", dtype={"c1": str, "c2": int})

set specific value as NaN ("?" here):

  pd.read_csv("timeseries.csv", na_values=["?"], dtype={"c1": str, "c2": int})

Selecting

select multiple columns as a dataframe from a bigger dataframe:

  df2 = df[['Id', 'team', 'winPlacePerc']]

select a single column as a dataframe:

  df2 = df[['name']] 

in the above, double square brackets return a dataframe; single brackets return a series
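
A quick way to verify the difference, using a small made-up dataframe:

  tmp = pd.DataFrame({"name": ["ann", "bob"], "age": [30, 40]})
  type(tmp["name"])    # pandas.core.series.Series
  type(tmp[["name"]])  # pandas.core.frame.DataFrame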

Selecting a single column by column name:

  columnsData = df['Age']
  columnsData = df.loc[ : , 'Age' ]

Select all but single column:

  df_train = df_train.drop("Survived", axis=1)
  df_train.loc[:, df_train.columns != 'Survived']

Select all but multiple columns:

  df_train = df_train.drop(["Survived", "Pclass"], axis=1)

Select rows by index in list:

  top10 = df_top_misses.iloc[0:10].index
  df_train.iloc[top10]

select by datatype:

  df.select_dtypes(include=['float64', 'int64'])

pandas axis:

  axis 1 = columns, axis 0 = rows
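
A small sketch on a made-up dataframe to see which way each axis goes:

  tmp = pd.DataFrame({"a": [1, 2], "b": [10, 20]})
  tmp.sum(axis=0)        # aggregates down the rows: one value per column (a=3, b=30)
  tmp.sum(axis=1)        # aggregates across the columns: one value per row (11, 22)
  tmp.drop("b", axis=1)  # drops the column "b"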

get the rows of a dataframe where a column equals a given value (here the minimum weight):

min_weight = df_topics["weight"].min()
df_topics[df_topics["weight"] == min_weight]

Show all rows with duplicates:

df = df_topics
#this requires both topic and word to be duplicate
df[df.duplicated(['topic', "word"],keep=False)]

Show the duplicate rows excluding their first (or last) occurrence (keep=False above shows all duplicate rows):

df[df.duplicated(['key1','key2'],keep='first')]
df[df.duplicated(['key1','key2'],keep='last')]

Count duplicates:

df.duplicated().sum()
df.duplicated(['topic', "word"],keep=False).sum()

Drop all rows where the line_race column is zero (keep the non-zero rows):

df = df[df.line_race != 0]

Keep only the rows where the line_race column is zero:

df = df[df.line_race == 0]

Accessing Values

set/reset how many rows pandas prints with df.head(), iloc[], etc. before clipping the display:

  pd.set_option('display.max_rows', 5) #show specific count
  pd.set_option('display.max_rows', None) #show all
  pd.reset_option('display.max_rows') #to reset one
  pd.reset_option('all') #to reset all

Make it local only:

  with pd.option_context('display.max_rows', 500, 'display.max_columns', 500):
      print(df.head(100))

find the row at position 1111 in the dataframe (positional, .iloc):

  df.iloc[1111]

get the value of a specific cell (first row of the "topic" column here):

  df["topic"].values[0]

find the row whose index label is 1111 (.loc):

  df.loc[1111]

select multiple ranges of rows:

def ranges(*args):
    indices = []
    for rt in args:
        indices += list(range(rt[0], rt[1]))
    return indices

df.iloc[ranges((0,10), (90,110))]

or just

df.iloc[[1,3,0]]

set value in cell (stackoverflow):

  df.at[555, "visited"] = True 
    #555 here refers to row index

modify row while iterating over it:

  for index, row in df.iterrows():
    df.at[index,'topic'] = "new topic"

select 55th column:

  df[df.columns[55]]

drop last 5 columns

  df.drop(df.columns[-5:], axis=1)

Aggregates

calculate mean for 'team' and 'winPlacePerc' columns, after grouping them by match id and group id:

agg_cols = ['groupId', 'matchId', 'team', 'winPlacePerc']
df_mean = df_test[agg_cols].groupby(["groupId", "matchId"],
  as_index=False).mean().add_suffix("_mean")

run multiple such aggregations at once:

agg = df_train.groupby(['matchId']).agg(
    {'players_in_team': ['min', 'max', 'mean', 'median']})

specify the name suffixes of the generated aggregation column names:

agg_train = (df_train[agg_cols]
             .groupby(["groupId", "matchId"], as_index=False)
             .agg([('_min', 'min'), ('_max', 'max'), ('_mean', 'mean')]))

multiple aggregations create a 2-level column header (see df.head()). To flatten it into one level (for merge etc.):

  mi = agg_train.columns #mi for multi-index
  ind = pd.Index([e[0] + e[1] for e in mi.tolist()])
  agg_train.columns = ind

custom aggregation function:

def q90(x):
    return x.quantile(0.9)

agg = df_train.groupby(['matchId']).agg(
    {'players_in_team': ['min', 'max', 'mean', 'median', q90]})

create new column as number of rows in a group:

  agg = df_train.groupby(['matchId']).size().to_frame('players_in_match')

group and sum column for groups:

revenues = (df_train.groupby("geoNetwork_subContinent")
            ['totals_transactionRevenue'].sum()
            .sort_values(ascending=False))

Merge

pandas concatenate dataframes (axis=1 is columns, so concat sideways):

  pd.concat([df_a,df_b], axis=1)

merge two dataframes (here df_train and agg) by a single column:

  df_train = df_train.merge(agg, how='left', on=['groupId'])

merge on multiple columns:

  dfg_train = dfg_train.merge(agg_train, how='left',
     on=["groupId", "matchId"])

set merge suffixes for overlapping columns (left side first, right side second):

  dfg_test = dfg_test.merge(agg_test_rank, suffixes=["", "_rank"],
        how='left', on=['groupId', "matchId"])

above sets columns from dfg_test to have no suffix ("") and columns from agg_test_rank to have suffix "_rank"

merge columns by value:

df['weight'] = np.where(df['weight'] < 5, df["topic"], df["weight"])

above sets the value to topic if weight is below five, else keeps the weight.

merge by defining the column names to match on left and right:

  pd.merge(left, right, left_on='key1', right_on='key2')

merge types (the how= parameter in pd.merge); a minimal example follows the list:

  • inner: keep rows that match in both left and right
  • outer: keep all rows in both left and right
  • left: keep all rows from left and matching ones from right
  • right: keep all rows from right and matching ones from left
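
A minimal sketch with made-up frames to see how the merge types differ:

  left = pd.DataFrame({"key": ["a", "b", "c"], "l": [1, 2, 3]})
  right = pd.DataFrame({"key": ["b", "c", "d"], "r": [20, 30, 40]})
  pd.merge(left, right, on="key", how="inner")  # keys b, c
  pd.merge(left, right, on="key", how="outer")  # keys a, b, c, d (NaN where missing)
  pd.merge(left, right, on="key", how="left")   # keys a, b, c
  pd.merge(left, right, on="key", how="right")  # keys b, c, d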

Rename, Delete, Compare Dataframes

rename columns to remove a suffix (here remove _mean):

  df_mean = df_mean.rename(columns={'groupId_mean': 'groupId',
               'matchId_mean': 'matchId'}) 

delete dataframes to save memory:

  del df_agg

find all columns in one dataframe but not in another

  diff_set = set(train_df.columns).difference(set(test_df.columns))
  print(diff_set)

find all columns in both dataframes and drop them

  common_set = set(train_df.columns).intersection(set(df2.columns))
  train_df = train_df.drop(common_set, axis=1)

find all columns not in both dataframes and drop them

  common_set = set(df_train.columns).intersection(set(df_test.columns))
  df_train = df_train[list(common_set)]

drop rows with index in list (stackoverflow):

  df.drop(df.index[[1,3]])

replace nulls/nans with 0:

  X.fillna(0)
  X_test.fillna(0)

only for specific columns:

  df[['a', 'b']] = df[['a','b']].fillna(value=0)

drop nan, rows with null values for specific column:

  df_train.dropna(subset=['winPlacePerc'], inplace=True)

drop columns B and C:

  df.drop(['B', 'C'], axis=1)
  df.drop(['B', 'C'], axis=1, inplace = True)

Dataframe Statistics

this df has 800k rows (values) and 999 columns (features):

  df_train_subset.shape
  (800000, 999)

data types:

  df_train.dtypes

number of rows, columns, memory size (light, fast):

  df.info()

statistics per feature/column (cpu/mem heavy):

  df.describe()

number of unique values:

  df.nunique()

bounds:

  df.min()
  df.max()

replace infinity with 0 (scalers especially can have issues with inf):

    df[col] = df[col].replace(np.inf, 0)

replace positive and negative inf with nan:

  df_pct.replace([np.inf, -np.inf], np.nan)

number of non-nulls per row in dataframe:

  df_non_null = df.notnull().sum(axis=1)

find the actual rows with null values:

  df_train[df_train.isnull().any(axis=1)]

print columns with nulls in them

  nan_cols = df.columns[df.isna().any()].tolist()
  nan_cols

number of times different values appear in column:

  df_train["device_operatingSystem"].value_counts()

sorted by index (e.g., 0,1,2,… or "school", "business")

  phase_counts = df["phase"].value_counts().sort_index()

sorted by their value counts:

  phase_counts = df["phase"].value_counts().sort_values()

number of nans per column in dataframe:

  df.isnull().sum()

find columns with only one unique value

  const_cols = [c for c in train_df.columns 
                if train_df[c].nunique(dropna=False)==1 ]

drop them

  train_df = train_df.drop(const_cols, axis=1)

replace outlier values over 3std with 3std:

outlier_thresholds = {}
def remove_outliers(df, col):
    upper = df[col].mean()+3*df[col].std()
    if col in outlier_thresholds:
        upper = outlier_thresholds[col]
    else:
        outlier_thresholds[col] = upper
    mask = np.abs(df[col] - df[col].mean()) > (3*df[col].std())
    df.loc[mask, col] = upper

  upper = (df_train['totals_transactionRevenue'].mean()
           + 3*df_train['totals_transactionRevenue'].std())
  mask = (np.abs(df_train['totals_transactionRevenue']
                 - df_train['totals_transactionRevenue'].mean())
          > (3*df_train['totals_transactionRevenue'].std()))
  df_train.loc[mask, 'totals_transactionRevenue'] = upper

or use zscore:

  df['zscore'] = (df.a - df.a.mean())/df.a.std(ddof=0)

from scipy (stackoverflow)

  from scipy import stats
  import numpy as np
  z = np.abs(stats.zscore(boston_df))
  print(z)

show groupby object data statistics for each column by grouped element:

  grouped.describe()

create dataframe from classifier column names and importance (where supported), sort by weight:

  df_feats = pd.DataFrame()
  df_feats["names"] = X.columns
  df_feats["weights"] = clf.feature_importances_
  df_feats.sort_values(by="weights")

diff columns to show the absolute value they changed from row to row:

  for col in input_col_names:
    df_diff[col] = df_sig[col].diff().abs()

unique datatypes in dataframe:

  train_df.dtypes.unique()

Numpy

Get numpy matrix from dataframe:

  data_diffs = df_diff.values

get a column from a numpy matrix as a 1-D array:

  for sig in range(0, 3):
      data_sig = data_measure[:,sig]

slice numpy array:

  bin_data_raw = data_sig[i:i + bin_size]

calculate statistics over numpy array:

bin_avg_raw = bin_data_raw.mean()
bin_sum_raw = bin_data_raw.sum()
bin_std_raw = bin_data_raw.std()
bin_percentiles = np.percentile(bin_data_raw, [0, 1, 25, 50, 75, 99, 100])
bin_range = bin_percentiles[-1] - bin_percentiles[0]
bin_rel_perc = bin_percentiles - bin_avg_raw

count outliers at different scales:

  outliers = []
  for r in range(1, 7):
      t = bin_std_diff * r
      o_count = np.sum(np.abs(bin_data_diff-bin_avg_diff) >= t)
      outliers.append(o_count)

concatenate multiple numpy arrays into one (rows=axis 0, cols=axis 1):

  bin_row = np.concatenate([raw_features, diff_features, bin_percentiles, bin_rel_perc, outliers])

limit values between min/max:

  df_sig.clip(upper=127, lower=-127)

value range in column (ptp = peak to peak):

  df.groupby('GROUP')['VALUE'].agg(np.ptp)

replace nan:

  my_arr[np.isnan(my_arr)] = 0

disable e-notation (scientific notation, decimals, fractions):

  np.set_printoptions(suppress=True)
  #above only works for numpy arrays, not specific numpy values, so this should work:
  print(np.array([0.111111111]))  
  #but not this
  print(np.float32(0.111111111))  

disable e-notation and set precision locally:

with np.printoptions(precision=20, suppress=True):
    print(np.array([d_min, d_max, d_mean, d_med]))

shuffle two or more arrays in unison (stackoverflow: https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison):

  from sklearn.utils import shuffle
  X, y = shuffle(X, y, random_state=0)

reshape numpy array:

  my_arr.reshape((2096, 160, 22))  # reshape accepts a shape tuple or separate ints: reshape(2096, 160, 22)

Time-Series

create values for percentage changes over time (row to row):

df_pct = pd.DataFrame()
for col in df_train_subset.columns[:30]:
    df_pct['pct_chg_'+col] = df_train_subset[col].pct_change().abs()

pct_change() gives the percentage change from the previous row; abs() makes it absolute when only the magnitude of the change matters, not its direction.
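
A tiny sketch of what pct_change() returns, on a made-up series:

  s = pd.Series([100, 110, 99])
  s.pct_change()        # NaN, 0.10, -0.10
  s.pct_change().abs()  # NaN, 0.10, 0.10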

also possible to set number of rows to count pct_change over:

  df_pct['pct_chg_'+col] = df_train_subset[col].pct_change(periods=10)

pct_change on a set of items with specific values:

  df['pct_chg_open1'] = (df.groupby('assetCode')['open']
                         .apply(lambda x: x.pct_change()))

TODO: try different ways to calculate ewm to see how this all works (EWMA, stackoverflow: https://stackoverflow.com/questions/37924377/does-pandas-calculate-ewm-wrong):

  df["close_ewma_10"] = df.groupby('assetName')['pct_chg_close1']
            .apply(lambda x: x.ewm(span=10).mean())

shift by 10 backwards

  y = mt_df["mket_close_10"].shift(-10)

Date Types

convert seconds into datetime

  start_col = pd.to_datetime(df_train.visitStartTime, unit='s')

parse specific columns as specific date types:

df_train = pd.read_csv("../input/train-flat.csv.gz",
                        parse_dates=["date"], 
                        dtype={'fullVisitorId': 'str',
                            'date': 'str'
                            },
                      )

or if need to parse specific format:

  pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')

access elements of datetime objects:

  data_df["weekday"] = data_df['date'].dt.weekday
  data_df["hour"] = data_df['date'].dt.hour

Data Type Manipulations

one-hot (one hot, onehot) encode / convert categorical:

  dfg_train = pd.get_dummies( dfg_train, columns = ['matchType'] )
  dfg_test = pd.get_dummies( dfg_test, columns = ['matchType'] )

multiple columns:

  pd.get_dummies(data=df, columns=['A', 'B'])

set category value by range:

  #here 1 = solo game, 2 = duo game, 3 = squad, 4 = custom
  df_train['team'] = [1 if i == 1 else 2 if i == 2 else 4 if i > 4 else 3
                      for i in df_train["players_in_team_q90"]]

calculate value over two columns and make it a new column:

dfg_train['killsNorm'] = (dfg_train['kills_mean']
                          * ((100-dfg_train['players_in_match'])/100 + 1))

data_df['hit_view_ratio'] = (data_df['totals_pageviews']
                             / data_df['totals_hits'])

mark a set of columns as category type:

for col in X_cat_cols:
    df_train[col] = df_train[col].astype('category')

set category value based on set of values in column:

X_test['matchType'] = X_test['matchType'].astype('str')
X_test.loc[X_test['matchType'] == "squad-fpp", 'matchType'] = "squad"
X_test['matchType'] = X_test['matchType'].astype('category')

how to get the indices from a dataframe as a list (e.g., collect a subset):

  list(outlier_collection.index.values)

to see it all on one line in Jupyter (easier copying, e.g. to drop all in list):

print(list(outlier_collection.index.values))

drop all rows where a field is empty:

  mail_df.drop(mail_df.query(
      "body == '' | to == '' | from_ == ''"
  ).index, inplace=True)

Multiprocessing

https://stackoverflow.com/questions/36794433/python-using-multiprocessing-on-a-pandas-dataframe

http://www.racketracer.com/2016/07/06/pandas-in-parallel/
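
A rough sketch based on the linked answers, assuming a chunkable dataframe and a made-up per-chunk function (process_chunk and the column "c1" are hypothetical): split the dataframe and process the pieces in a Pool.

  import numpy as np
  import pandas as pd
  from multiprocessing import Pool

  def process_chunk(chunk):
      # hypothetical per-chunk work on a column "c1"
      chunk["c1_squared"] = chunk["c1"] ** 2
      return chunk

  def parallelize(df, func, n_partitions=4):
      # split into n_partitions pieces, map them over worker processes,
      # then glue the results back together
      chunks = np.array_split(df, n_partitions)
      with Pool(n_partitions) as pool:
          result = pd.concat(pool.map(func, chunks))
      return result

  # df = parallelize(df, process_chunk)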

Label transform Pandas column

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["topic"] = le.fit_transform(df["topic"])
#if needed in two parts (fit on train, transform on test)
#df_test["Sex"] = le.transform(df_test["Sex"])

get the mappings generated by the labelencoder:

le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)
le_value_mapping = dict(zip(le.transform(le.classes_), le.classes_))

Memory Reducer (From Kaggler):

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object and col_type.name != 'category':
            #print(col_type)
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

#https://stackoverflow.com/questions/25351968/how-to-display-full-non-truncated-dataframe-information-in-html-when-convertin

def print_full(x):
    pd.set_option('display.max_rows', len(x))
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', None)
    print(x)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')
