Python’s pandas and R’s tidyverse are incredible tools to manipulate tabular data. Below is a data science “Rosetta Stone” to compare the different but similar commands in both languages.
Create a dataframe
tidyverse
df <- data.frame(
col_one = c('A','B','C','D'),
col_two = c(1,2,3,4)
)pandas
df = pd.DataFrame(
{'col_one': ['A', 'B', 'C','D'],
'col_two': [1, 2, 3, 4]}
)Inspect a dataframe
tidyverse
# Output column names
df %>% colnames
# Output column dimensions (row and column length)
df %>% dim
df %>% nrow
df %>% ncol
# Output first/last n rows
df %>% head(n)
df %>% tail(n)
#Get information about a df
df %>% summary
df %>% glimpse
#Clean column names
df %>% janitor::clean_names()pandas
# Output column names
df.columns
# Output column dimensions (row and column length)
df.shape
df.shape[0]
df.shape[1]
# Output first/last n rows
df.head(n)
df.tail(n)
#Get information about a df
df.describe()
df.info()
#Clean column names
from skimpy import clean_columns
clean_columns(df)Import data
tidyverse
df <- read_csv('data.csv')
df <- read_csv('data.csv', col_names = F)pandas
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', header=None)Count frequency of vaules in a column
tidyverse
df %>% count(col_one)
df %>% count(col_one, sort = T)
df$col_one %>% janitor::tabyl()pandas
df['col_one'].value_counts()
df['col_one'].value_counts(ascending = False)
df['col_one'].value_counts(dropna=False)
df['col_one'].value_counts(normalize=True)
df['col_one'].value_counts(bin = n)Calculate summary statistics
tidyverse
df$col_one %>% mean
df$col_one %>% median
df$col_one %>% sd
df$col_one %>% min
df$col_one %>% maxpandas
df['col_one'].mean()
df['col_one'].median()
df['col_one'].std()
df['col_one'].min()
df['col_one'].max()Keep columns
tidyverse
df %>% select(col_one)
df %>% select(col_one,col_two)
df %>%select(matches("[pt]al"))
df %>% select(starts_with("col"))
...
df %>% select(ends_with("two"))
...
df %>% select(contains("col"))
...pandas
df.filter(items=['col_one'])
df.filter(items=['col_one','col_two'])
df.filter(regex='[pt]al')
df.loc[:,df.columns.str.startswith("col")]
df.filter(regex='^col')
df.loc[:,df.columns.str.endswith("two")]
df.filter(regex='two$')
df.loc[:,df.columns.str.contains("col")]
df.filter(like='col')Drop columns
tidyverse
df %>% select(!col_one)
df %>% select(!c(col_one,col_two))pandas
df.drop(columns=['col_one'])
df.drop(columns=['col_one','col_two'])Rename a column
tidyverse
df %>% rename(col_1 = column_one)
df %>% rename(col_1 = column_one,
col_2 = column_two
)pandas
df.rename(columns={"column_one": "col_1"})
df.rename(columns={"column_one": "col_1",
"column_two": "col_2"}
)Change datatype of a column
tidyverse
df %>% mutate(Race = as.character(Race), Age = as.numeric(Age))pandas
df.astype({"Race":'category', "Age":'int64'})Locate values
tidyverse
df[,]
df[1,]
df[c(1,6),]
df[c(1:6),]
df[,'col_one']
df[,c('col_one','col_three')]
df %>% select(col_one:col_three)
df[,c(1,3)]
df[,c('1:3')]pandas
df.loc[:,:]
df.loc[1,:]
df.loc[[1,6],:]
df.loc[[1:6],:]
df.loc[:,['col_one']]
df.loc[:,['col_one','col_three']]
df.loc[:,'col_one':'col_three']
df.iloc[:,[1,3]]
df.iloc[:,1:3]Query values in a column
tidyverse
df %>% filter(col_one >= 100)
df %>% filter(col_one != "Blue")
df %>% filter(col_one %in% c('A','B'))
df %>% filter(!(Race == "White" & Gender == "Male"))pandas
df.query("col_one >= 100")
df.query("col_one != 'Blue'")
df.query("col_one in ['A', 'B']")
df.query("not (Race == 'White' and Gender == 'Male')")Query by string
tidyverse
df %>% filter(str_detect(col1, "string"))
df %>% filter(str_detect(col1, c("string1", "string2")))
df %>% filter(str_starts(col1, "string"))
df %>% filter(str_ends(col1, "string"))
df %>% filter(str_match(col1, "regex_pattern"))
```
**pandas**
``` python
df.query('col1.str.contains("string").values')
df.query('col1.str.contains(["string1", "string2"]).values')
df.query('col1.str.startswith("string").values')
df.query('col1.str.endswith("string").values')
df.query('col1.str.match("regex_pattern").values')Sort a dataframe by a column’s value
tidyverse
df %>% arrange('col_one')
df %>% arrange(col_one %>% desc())pandas
df.sort_values('col_one')
df.sort_values('col_one', ascending=False)Keep distinct values in a column
tidyverse
df %>% distinct(col_one, .keep_all = T)
df %>% distinct()pandas
df.drop_duplicates(subset = ["col_one"])
df.drop_duplicates()Replace values in a column
tidyverse
df %>% mutate(across(everything(), ~replace(., . == 2 , "foo")))
df %>% mutate(across(c(col_one,col_two), ~replace(., . == 2 , "foo")))
df %>% mutate(col_one = ifelse(col_one == 2, "foo", col_one))pandas
df.replace(2,"foo")
df[['col_one','col_two']].replace(2,"foo")
df['col_one'].replace(2,"foo")Drop NA’s in a column
tidyverse
df %>% drop_na
df %>% drop_na(c("col_one", "col_two"))
df %>% select(where(~mean(is.na(.)) < n)) #percent thresholdpandas
df.dropna()
df.dropna(subset=['col_one', 'col_two'])
df.dropna(thresh=n) #integer thresholdFill NA’s in a column
tidyverse
df %>% replace(is.na(.), x)
df %>% mutate(col_one = ifelse(is.na(col_one),x,col_one))
df %>% fill(col_one, .direction = "up")
df %>% mutate(col_one= ifelse(is.na(col_one), mean(df$col_one, na.rm = T), col_one))pandas
df.fillna(x)
df['col_one'].fillna(x)
df['col_one'].fillna(method = 'ffill')
df['col_two'].fillna(df['col_two'].mean())Group and summarize columns
tidyverse
df %>% group_by(Race) %>% count()
df %>% group_by(Race) %>% summarize(new_col = median(Income))
df %>% group_by(Race, Sex) %>%
summarize(
new_col1 = median(Income),
new_col2 = n(),
new_col3 = mean(age)
)pandas
df.groupby('Race', as_index=False).count()
df.groupby('Race', as_index=False)['Income'].median()
(df.groupby(['Race', 'Sex'], as_index=False)
.agg(
new_col1=pd.NamedAgg(column = 'Income', aggfunc = np.median),
new_col1=pd.NamedAgg(column = 'Age', aggfunc = np.mean)
))Pivot longer
tidyverse
df %>%
pivot_longer(
cols = Belgium:`United Kingdom`,
names_to = "Country",
values_to = "Fatalities"
)pandas
(df
.melt(
id_vars=['iyear'],
var_name='Country',
value_name='Fatalities')
)Pivot wider
tidyverse
df %>% pivot_wider(
names_from = state,
values_from = number
)pandas
df.pivot_table(index=['year','name','sex'],
columns='state',
values='number'
)Bind two dataframes
tidyverse
df1 %>% bind_rows(df2)
df1 %>% bind_cols(df2)pandas
pd.concat([df1,df2])
pd.concat([df1,df2], axis=1)Inner join two dataframes
tidyverse
df1 %>% inner_join(
df2, by = c(col_one = "first_col")
)pandas
pd.merge(df1, df2,
left_on='col_one', right_on='first_col'
)Left join dataframes
tidyverse
df1 %>% left_join(df2,
by = c(col_one = "first_col")
)pandas
pd.merge(df1, df2, how = 'left',
left_on='col_one', right_on='first_col'
)Add a new column
tidyverse
df %>% mutate(
twomore = x + 2,
twoless = x - 2
)pandas
df.assign(
twomore = lambda df: df.x + 2,
twoless = lambda df: df.x - 2
)