Pandas vs Tidyverse

statistics
python
r
notation
Author

Aj Averett

Published

January 20, 2023

Python’s pandas and R’s tidyverse are incredible tools to manipulate tabular data. Below is a data science “Rosetta Stone” to compare the different but similar commands in both languages.

Create a dataframe

tidyverse

df <- data.frame(
      col_one = c('A','B','C','D'),
      col_two = c(1,2,3,4)
)

pandas

df = pd.DataFrame(
      {'col_one': ['A', 'B', 'C','D'],
       'col_two': [1, 2, 3, 4]}
)

Inspect a dataframe

tidyverse


    # Output column names
    df %>% colnames

    # Output column dimensions (row and column length)
    df %>% dim
    df %>% nrow
    df %>% ncol

    # Output first/last n rows
    df %>% head(n)
    df %>% tail(n)

    #Get information about a df
    df %>% summary
    df %>% glimpse

    #Clean column names
    df %>% janitor::clean_names()

pandas


    # Output column names
    df.columns

    # Output column dimensions (row and column length)
    df.shape
    df.shape[0]
    df.shape[1]

    # Output first/last n rows
    df.head(n)
    df.tail(n)

    #Get information about a df
    df.describe()
    df.info()

    #Clean column names
    from skimpy import clean_columns
    clean_columns(df)

Import data

tidyverse

df <- read_csv('data.csv')
df <- read_csv('data.csv', col_names = F)

pandas

df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', header=None)

Count frequency of vaules in a column

tidyverse

df %>% count(col_one)
df %>% count(col_one, sort = T)
df$col_one %>% janitor::tabyl()

pandas

df['col_one'].value_counts()
df['col_one'].value_counts(ascending = False)
df['col_one'].value_counts(dropna=False)
df['col_one'].value_counts(normalize=True)
df['col_one'].value_counts(bin = n)

Calculate summary statistics

tidyverse

df$col_one %>% mean
df$col_one %>% median
df$col_one %>% sd
df$col_one %>% min
df$col_one %>% max

pandas

df['col_one'].mean()
df['col_one'].median()
df['col_one'].std()
df['col_one'].min()
df['col_one'].max()

Keep columns

tidyverse

df %>% select(col_one)
df %>% select(col_one,col_two)
df %>%select(matches("[pt]al"))
df %>% select(starts_with("col"))
...
df %>% select(ends_with("two"))
...
df %>% select(contains("col"))
...

pandas

df.filter(items=['col_one'])
df.filter(items=['col_one','col_two'])
df.filter(regex='[pt]al')
df.loc[:,df.columns.str.startswith("col")]
df.filter(regex='^col')
df.loc[:,df.columns.str.endswith("two")]
df.filter(regex='two$')
df.loc[:,df.columns.str.contains("col")]
df.filter(like='col')

Drop columns

tidyverse

df %>% select(!col_one)
df %>% select(!c(col_one,col_two))

pandas

df.drop(columns=['col_one'])
df.drop(columns=['col_one','col_two'])

Rename a column

tidyverse

df %>% rename(col_1 = column_one)
df %>% rename(col_1 = column_one, 
              col_2 = column_two
)

pandas

df.rename(columns={"column_one": "col_1"})
df.rename(columns={"column_one": "col_1", 
                   "column_two": "col_2"}
)

Change datatype of a column

tidyverse

df %>% mutate(Race = as.character(Race), Age = as.numeric(Age))

pandas

df.astype({"Race":'category', "Age":'int64'})

Locate values

tidyverse

df[,]
df[1,]
df[c(1,6),]
df[c(1:6),]
df[,'col_one']
df[,c('col_one','col_three')]
df %>% select(col_one:col_three)
df[,c(1,3)]
df[,c('1:3')]

pandas

df.loc[:,:]
df.loc[1,:] 
df.loc[[1,6],:] 
df.loc[[1:6],:] 
df.loc[:,['col_one']]
df.loc[:,['col_one','col_three']] 
df.loc[:,'col_one':'col_three'] 
df.iloc[:,[1,3]]
df.iloc[:,1:3]

Query values in a column

tidyverse

df %>% filter(col_one >= 100)
df %>% filter(col_one != "Blue")
df %>% filter(col_one %in% c('A','B'))
df %>% filter(!(Race == "White" & Gender == "Male"))

pandas

df.query("col_one >= 100")
df.query("col_one != 'Blue'")
df.query("col_one in ['A', 'B']")
df.query("not (Race == 'White' and Gender == 'Male')")

Query by string

tidyverse

    df %>% filter(str_detect(col1, "string"))
    df %>% filter(str_detect(col1, c("string1", "string2")))
    df %>% filter(str_starts(col1, "string"))
    df %>% filter(str_ends(col1, "string"))
    df %>% filter(str_match(col1, "regex_pattern"))
    ```
    
**pandas**
``` python
    df.query('col1.str.contains("string").values')
    df.query('col1.str.contains(["string1", "string2"]).values')
    df.query('col1.str.startswith("string").values')
    df.query('col1.str.endswith("string").values')
    df.query('col1.str.match("regex_pattern").values')

Sort a dataframe by a column’s value

tidyverse

df %>% arrange('col_one')
df %>% arrange(col_one %>% desc())

pandas

df.sort_values('col_one')
df.sort_values('col_one', ascending=False)

Keep distinct values in a column

tidyverse

df %>% distinct(col_one, .keep_all = T)
df %>% distinct()

pandas

df.drop_duplicates(subset = ["col_one"])
df.drop_duplicates()

Replace values in a column

tidyverse

df %>% mutate(across(everything(), ~replace(., . ==  2 , "foo")))
df %>% mutate(across(c(col_one,col_two), ~replace(., . ==  2 , "foo")))
df %>% mutate(col_one = ifelse(col_one == 2, "foo", col_one))

pandas

df.replace(2,"foo")
df[['col_one','col_two']].replace(2,"foo")
df['col_one'].replace(2,"foo")

Drop NA’s in a column

tidyverse

df %>% drop_na
df %>% drop_na(c("col_one", "col_two"))
df %>% select(where(~mean(is.na(.)) < n)) #percent threshold

pandas

df.dropna()
df.dropna(subset=['col_one', 'col_two'])
df.dropna(thresh=n) #integer threshold

Fill NA’s in a column

tidyverse

df %>% replace(is.na(.), x)
df %>% mutate(col_one = ifelse(is.na(col_one),x,col_one))
df %>% fill(col_one, .direction = "up")
df %>% mutate(col_one= ifelse(is.na(col_one), mean(df$col_one, na.rm = T), col_one))

pandas

df.fillna(x)
df['col_one'].fillna(x)
df['col_one'].fillna(method = 'ffill')
df['col_two'].fillna(df['col_two'].mean())

Group and summarize columns

tidyverse

df %>% group_by(Race) %>% count()
df %>% group_by(Race) %>% summarize(new_col = median(Income))
df %>% group_by(Race, Sex) %>%
     summarize(
       new_col1 = median(Income),
       new_col2 = n(),
       new_col3 = mean(age)
)

pandas

df.groupby('Race', as_index=False).count()
df.groupby('Race', as_index=False)['Income'].median()
(df.groupby(['Race', 'Sex'], as_index=False)
   .agg(
      new_col1=pd.NamedAgg(column = 'Income', aggfunc = np.median),
      new_col1=pd.NamedAgg(column = 'Age', aggfunc = np.mean)
))

Pivot longer

tidyverse

df %>% 
  pivot_longer(
     cols = Belgium:`United Kingdom`,
     names_to = "Country",
     values_to = "Fatalities"
)

pandas

(df
  .melt(
    id_vars=['iyear'],
    var_name='Country', 
    value_name='Fatalities')
)

Pivot wider

tidyverse

df %>% pivot_wider(
      names_from = state, 
      values_from = number
)

pandas

df.pivot_table(index=['year','name','sex'],
      columns='state',
      values='number'
)

Bind two dataframes

tidyverse

df1 %>% bind_rows(df2)
df1 %>% bind_cols(df2)

pandas

pd.concat([df1,df2])
pd.concat([df1,df2], axis=1)

Inner join two dataframes

tidyverse

df1 %>% inner_join(
      df2, by = c(col_one = "first_col")
)

pandas

pd.merge(df1, df2, 
     left_on='col_one', right_on='first_col'
)

Left join dataframes

tidyverse

df1 %>% left_join(df2, 
      by = c(col_one = "first_col")
)

pandas

pd.merge(df1, df2, how = 'left',
     left_on='col_one', right_on='first_col'
)

Add a new column

tidyverse

df %>% mutate(
      twomore = x + 2,
      twoless    = x - 2
)

pandas

df.assign(
  twomore = lambda df: df.x + 2,
  twoless    = lambda df: df.x - 2
)