Python’s pandas and R’s tidyverse are incredible tools to manipulate tabular data. Below is a data science “Rosetta Stone” to compare the different but similar commands in both languages.
Create a dataframe
tidyverse
<- data.frame(
df col_one = c('A','B','C','D'),
col_two = c(1,2,3,4)
)
pandas
= pd.DataFrame(
df 'col_one': ['A', 'B', 'C','D'],
{'col_two': [1, 2, 3, 4]}
)
Inspect a dataframe
tidyverse
# Output column names
%>% colnames
df
# Output column dimensions (row and column length)
%>% dim
df %>% nrow
df %>% ncol
df
# Output first/last n rows
%>% head(n)
df %>% tail(n)
df
#Get information about a df
%>% summary
df %>% glimpse
df
#Clean column names
%>% janitor::clean_names() df
pandas
# Output column names
df.columns
# Output column dimensions (row and column length)
df.shape0]
df.shape[1]
df.shape[
# Output first/last n rows
df.head(n)
df.tail(n)
#Get information about a df
df.describe()
df.info()
#Clean column names
from skimpy import clean_columns
clean_columns(df)
Import data
tidyverse
<- read_csv('data.csv')
df <- read_csv('data.csv', col_names = F) df
pandas
= pd.read_csv('data.csv')
df = pd.read_csv('data.csv', header=None) df
Count frequency of vaules in a column
tidyverse
%>% count(col_one)
df %>% count(col_one, sort = T)
df $col_one %>% janitor::tabyl() df
pandas
'col_one'].value_counts()
df['col_one'].value_counts(ascending = False)
df['col_one'].value_counts(dropna=False)
df['col_one'].value_counts(normalize=True)
df['col_one'].value_counts(bin = n) df[
Calculate summary statistics
tidyverse
$col_one %>% mean
df$col_one %>% median
df$col_one %>% sd
df$col_one %>% min
df$col_one %>% max df
pandas
'col_one'].mean()
df['col_one'].median()
df['col_one'].std()
df['col_one'].min()
df['col_one'].max() df[
Keep columns
tidyverse
%>% select(col_one)
df %>% select(col_one,col_two)
df %>%select(matches("[pt]al"))
df %>% select(starts_with("col"))
df
...%>% select(ends_with("two"))
df
...%>% select(contains("col"))
df ...
pandas
filter(items=['col_one'])
df.filter(items=['col_one','col_two'])
df.filter(regex='[pt]al')
df.str.startswith("col")]
df.loc[:,df.columns.filter(regex='^col')
df.str.endswith("two")]
df.loc[:,df.columns.filter(regex='two$')
df.str.contains("col")]
df.loc[:,df.columns.filter(like='col') df.
Drop columns
tidyverse
%>% select(!col_one)
df %>% select(!c(col_one,col_two)) df
pandas
=['col_one'])
df.drop(columns=['col_one','col_two']) df.drop(columns
Rename a column
tidyverse
%>% rename(col_1 = column_one)
df %>% rename(col_1 = column_one,
df col_2 = column_two
)
pandas
={"column_one": "col_1"})
df.rename(columns={"column_one": "col_1",
df.rename(columns"column_two": "col_2"}
)
Change datatype of a column
tidyverse
%>% mutate(Race = as.character(Race), Age = as.numeric(Age)) df
pandas
"Race":'category', "Age":'int64'}) df.astype({
Locate values
tidyverse
df[,]1,]
df[c(1,6),]
df[c(1:6),]
df['col_one']
df[,c('col_one','col_three')]
df[,%>% select(col_one:col_three)
df c(1,3)]
df[,c('1:3')] df[,
pandas
df.loc[:,:]1,:]
df.loc[1,6],:]
df.loc[[1:6],:]
df.loc[['col_one']]
df.loc[:,['col_one','col_three']]
df.loc[:,['col_one':'col_three']
df.loc[:,1,3]]
df.iloc[:,[1:3] df.iloc[:,
Query values in a column
tidyverse
%>% filter(col_one >= 100)
df %>% filter(col_one != "Blue")
df %>% filter(col_one %in% c('A','B'))
df %>% filter(!(Race == "White" & Gender == "Male")) df
pandas
"col_one >= 100")
df.query("col_one != 'Blue'")
df.query("col_one in ['A', 'B']")
df.query("not (Race == 'White' and Gender == 'Male')") df.query(
Query by string
tidyverse
%>% filter(str_detect(col1, "string"))
df %>% filter(str_detect(col1, c("string1", "string2")))
df %>% filter(str_starts(col1, "string"))
df %>% filter(str_ends(col1, "string"))
df %>% filter(str_match(col1, "regex_pattern"))
df ```
**pandas**
``` python
df.query('col1.str.contains("string").values')
df.query('col1.str.contains(["string1", "string2"]).values')
df.query('col1.str.startswith("string").values')
df.query('col1.str.endswith("string").values')
df.query('col1.str.match("regex_pattern").values')
Sort a dataframe by a column’s value
tidyverse
%>% arrange('col_one')
df %>% arrange(col_one %>% desc()) df
pandas
'col_one')
df.sort_values('col_one', ascending=False) df.sort_values(
Keep distinct values in a column
tidyverse
%>% distinct(col_one, .keep_all = T)
df %>% distinct() df
pandas
= ["col_one"])
df.drop_duplicates(subset df.drop_duplicates()
Replace values in a column
tidyverse
%>% mutate(across(everything(), ~replace(., . == 2 , "foo")))
df %>% mutate(across(c(col_one,col_two), ~replace(., . == 2 , "foo")))
df %>% mutate(col_one = ifelse(col_one == 2, "foo", col_one)) df
pandas
2,"foo")
df.replace('col_one','col_two']].replace(2,"foo")
df[['col_one'].replace(2,"foo") df[
Drop NA’s in a column
tidyverse
%>% drop_na
df %>% drop_na(c("col_one", "col_two"))
df %>% select(where(~mean(is.na(.)) < n)) #percent threshold df
pandas
df.dropna()=['col_one', 'col_two'])
df.dropna(subset=n) #integer threshold df.dropna(thresh
Fill NA’s in a column
tidyverse
%>% replace(is.na(.), x)
df %>% mutate(col_one = ifelse(is.na(col_one),x,col_one))
df %>% fill(col_one, .direction = "up")
df %>% mutate(col_one= ifelse(is.na(col_one), mean(df$col_one, na.rm = T), col_one)) df
pandas
df.fillna(x)'col_one'].fillna(x)
df['col_one'].fillna(method = 'ffill')
df['col_two'].fillna(df['col_two'].mean()) df[
Group and summarize columns
tidyverse
%>% group_by(Race) %>% count()
df %>% group_by(Race) %>% summarize(new_col = median(Income))
df %>% group_by(Race, Sex) %>%
df summarize(
new_col1 = median(Income),
new_col2 = n(),
new_col3 = mean(age)
)
pandas
'Race', as_index=False).count()
df.groupby('Race', as_index=False)['Income'].median()
df.groupby('Race', 'Sex'], as_index=False)
(df.groupby([
.agg(=pd.NamedAgg(column = 'Income', aggfunc = np.median),
new_col1=pd.NamedAgg(column = 'Age', aggfunc = np.mean)
new_col1 ))
Pivot longer
tidyverse
%>%
df pivot_longer(
cols = Belgium:`United Kingdom`,
names_to = "Country",
values_to = "Fatalities"
)
pandas
(df
.melt(=['iyear'],
id_vars='Country',
var_name='Fatalities')
value_name )
Pivot wider
tidyverse
%>% pivot_wider(
df names_from = state,
values_from = number
)
pandas
=['year','name','sex'],
df.pivot_table(index='state',
columns='number'
values )
Bind two dataframes
tidyverse
%>% bind_rows(df2)
df1 %>% bind_cols(df2) df1
pandas
pd.concat([df1,df2])=1) pd.concat([df1,df2], axis
Inner join two dataframes
tidyverse
%>% inner_join(
df1 by = c(col_one = "first_col")
df2, )
pandas
pd.merge(df1, df2, ='col_one', right_on='first_col'
left_on )
Left join dataframes
tidyverse
%>% left_join(df2,
df1 by = c(col_one = "first_col")
)
pandas
= 'left',
pd.merge(df1, df2, how ='col_one', right_on='first_col'
left_on )
Add a new column
tidyverse
%>% mutate(
df twomore = x + 2,
twoless = x - 2
)
pandas
df.assign(= lambda df: df.x + 2,
twomore = lambda df: df.x - 2
twoless )