https://www.r-bloggers.com/2022/02/r-python-rosetta-stone-reading-files-and-column-transformations
# Packages used throughout this post, grouped by purpose.
libs <- c(
  "dplyr", "tidyr", "stringr", # wrangling
  "knitr", "kableExtra",       # table styling
  "ggplot2", "gridExtra",      # plots
  "viridis",                   # visuals styling
  "reticulate"                 # e pluribus unum
)

# devtools::install_cran(c('pacman', libs))
# pacman::p_load(libs)

# Attach every package in `libs`; nothing is printed.
for (pkg in libs) {
  library(pkg, character.only = TRUE)
}
# Install the extra packages only when they are not already available, so
# re-running the script does not reinstall them every time.
extra_pkgs <- c("gt", "palmerpenguins", "vroom", "readr")
is_installed <- vapply(
  extra_pkgs, requireNamespace,
  FUN.VALUE = logical(1), quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(extra_pkgs[!is_installed])
}

# Packages attached for the rest of the post.
libs <- c(
  "dplyr", "stringr",      # wrangling
  "palmerpenguins", "gt",  # data, table styling
  "vroom", "readr",        # read & write data
  "tidyr", "purrr",        # wrangle & iterate
  "fs",                    # file system
  "reticulate"             # python support
)
invisible(lapply(libs, library, character.only = TRUE))
# Point reticulate at the Python interpreter to use.
# use_python("/usr/bin/python")
use_python("/usr/bin/python3")

# Working copy of the penguins data: integer columns are converted to double,
# and every column whose name contains "length" or "depth" is dropped.
# `across(where(...))` replaces the superseded scoped verb `mutate_if()`.
df_orig <- penguins %>%
  mutate(across(where(is.integer), as.double)) %>%
  select(-contains("length"), -contains("depth"))
import glob

import pandas as pd

# Do not truncate wide data frames when they are displayed.
pd.options.display.max_columns = None
# Number of rows per species, rendered as a gt table.
gt(count(df_orig, species))
# Output directory for the per-species files. The trailing slash is kept
# because other chunks build paths by pasting a file name onto `fp`.
fp <- "data/penguins/"

# write_csv() does not create missing directories, so make sure the target
# exists before writing (no-op when it is already there).
dir.create(fp, recursive = TRUE, showWarnings = FALSE)

# Write one "<species>_penguins.csv" file per species, each holding the first
# row of that species.
df_orig %>%
  group_by(species) %>%
  # slicing off the first n entries (here by group)
  slice_head(n = 1) %>%
  ungroup() %>%
  nest(data = c(-species)) %>%
  pwalk(function(species, data) {
    # `fp` already ends in "/", so paste it directly instead of using
    # file.path(), which would insert a second separator.
    write_csv(data, paste0(fp, as.character(species), "_penguins.csv"))
  })
# Read a single species file.
# `col_types` is an argument of vroom(), not of paste0(): the original closing
# parenthesis was misplaced, so the column spec was pasted into the file path
# instead of being passed to the reader.
single_file <- vroom(
  paste0(fp, "Adelie_penguins.csv"),
  col_types = cols()
)

single_file %>%
  gt()
# Read the same single file with pandas; `r.fp` is the directory string
# defined on the R side and exposed through reticulate's `r` object.
adelie_path = r.fp + "Adelie_penguins.csv"
single_file = pd.read_csv(adelie_path)
single_file
# Collect every file in `fp` whose name matches the glob, then show the paths.
files <- fs::dir_ls(path = fp, glob = "*penguins.csv")
files
## Adelie_penguins.csv Chinstrap_penguins.csv Gentoo_penguins.csv

# Read all files in one call; `id = "name"` adds a column holding the path
# each row was read from.
df <- vroom(files, col_types = cols(), id = "name")

gt(df)
In Python, we use glob to grab the file names:
# glob.glob() returns matches in arbitrary order; sort them so the result is
# deterministic and lines up with the alphabetical listing on the R side.
files_py = sorted(glob.glob(r.fp + "*penguins.csv"))
files_py

# A plain concat reads all files but does not record which file each row
# came from:
# df_py = pd.concat((pd.read_csv(f) for f in files_py))

# Luckily, we can get the file name through the nifty assign function, which
# adds it as a `name` column to each frame before concatenation:
df_py = pd.concat((pd.read_csv(f).assign(name = f) for f in files_py))
df_py
# Split the `name` column at the literal dot into the part before it (`name`)
# and the part after it (`filetype`), then render the result.
df %>%
  separate(
    col = name,
    into = c("name", "filetype"),
    sep = "\\."
  ) %>%
  gt()
# Two-step split of `name`: first at the dot (name / filetype), then the
# remaining name at the underscore (species / animal).
df %>%
  separate(
    col = name,
    into = c("name", "filetype"),
    sep = "\\."
  ) %>%
  separate(
    col = name,
    into = c("species", "animal"),
    sep = "_"
  ) %>%
  gt()
# Rebuild the combined frame, tagging every row with its source path.
df_py = pd.concat([pd.read_csv(f).assign(name = f) for f in files_py])

# Split `name` at the dot into name / filetype ...
name_parts = df_py['name'].str.split('\\.', expand=True)
df_py[['name', 'filetype']] = name_parts

# ... then split the remaining name at the underscore into species / animal.
species_parts = df_py['name'].str.split('_', expand=True)
df_py[['species', 'animal']] = species_parts

# The intermediate `name` column is no longer needed.
df_py = df_py.drop(columns='name')
df_py
# Starting point for the reverse operation: `name` split into
# species / animal / filetype.
df %>%
  separate(
    col = name,
    into = c("name", "filetype"),
    sep = "\\."
  ) %>%
  separate(
    col = name,
    into = c("species", "animal"),
    sep = "_"
  ) %>%
  gt()
# Split `name` into its three parts, then glue them back together:
# species + "_" + animal gives `name`, and name + "." + filetype gives the
# final `name`.
df %>%
  separate(
    col = name,
    into = c("name", "filetype"),
    sep = "\\."
  ) %>%
  separate(
    col = name,
    into = c("species", "animal"),
    sep = "_"
  ) %>%
  unite(col = "name", species, animal, sep = "_") %>%
  unite(col = "name", name, filetype, sep = ".") %>%
  gt()
# Reassemble `name` from its parts via string concatenation.
df_py['name'] = (
    df_py['species'] + "_" + df_py['animal']
    + "." + df_py['filetype']
)

# Drop the component columns now that `name` is restored.
df_py = df_py.drop(columns=['species', 'animal', 'filetype'])
df_py
# Python indexing starts at 0, so this prints the first element.
foo = [1, 2, 3]
print(foo[0])

# `r.iris` is R's iris data set accessed through reticulate's `r` object;
# select rows by label up to 5 and two of its columns.
iris_cols = ["Sepal.Length", "Species"]
print(r.iris.loc[:5, iris_cols])
import pandas from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier
# Hold out 40% of the iris rows for testing; the fixed seed makes the split
# reproducible.
train, test = train_test_split(
    r.iris,
    test_size = 0.4,
    random_state = 4321,
)

# Features are every column except the label column `Species`.
X = train.drop(columns='Species')
y = train['Species'].values

X_test = test.drop(columns='Species')
y_test = test['Species'].values
# Fit a decision tree on the training split (seeded for reproducibility) and
# predict labels for the held-out rows.
tree = DecisionTreeClassifier(random_state = 4321)
clf = tree.fit(X, y)
pred = clf.predict(X_test)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.