E Base R vs. Tidyverse

We have worked primarily with the tidyverse tools in this course. While the main advantages of the tidyverse tools for data wrangling are their simplicity and human readability, there is nothing the tidyverse does that cannot also be achieved with base R commands. If you are interested in comparing and contrasting the two approaches, check out the examples below.

Most of these examples use the iris dataset, a dataset that comes with base R (just type iris); the grouped summary in E.5 uses mtcars, which is also built in.

E.1 Extract variables

# Base R: pull columns out of a data.frame
iris[["Sepal.Length"]]             # a single variable, returned as a vector
iris[c("Species", "Petal.Width")]  # several variables, selected by name
iris[c(5, 4)]                      # the same variables, selected by position
# Tidyverse: select() always returns a data frame, even for one column
iris %>% select(Species)
iris %>% select(Species, Petal.Width)  # selected by name
iris %>% select(5, 4)                  # selected by position

E.2 Make new variables

# Base R: add two length-to-width ratio columns to iris.
# The assignments modify iris itself, so the new columns remain available
# to any code that runs afterwards.
iris[["Petal.Ratio"]] <- with(iris, Petal.Length / Petal.Width)
iris[["Sepal.Ratio"]] <- with(iris, Sepal.Length / Sepal.Width)
# Tidyverse: mutate() returns a copy with both ratio columns added;
# iris is only changed if the result is assigned back to it
iris %>%
  mutate(
    Petal.Ratio = Petal.Length / Petal.Width,
    Sepal.Ratio = Sepal.Length / Sepal.Width
  )

Extract observations (rows)

# Row indexing with a logical condition; with() saves repeating `iris$`
iris[with(iris, Petal.Width > 0.5 & Species == "setosa"), ]

# subset() evaluates the condition inside the data.frame
# (works very much like dplyr::filter)
subset(iris, subset = Petal.Width > 0.5 & Species == "setosa")
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
44 5 3.5 1.6 0.6 setosa
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
44 5 3.5 1.6 0.6 setosa
# Tidyverse: conditions separated by commas are combined with AND
iris %>% filter(Petal.Width > 0.5, Species == "setosa")
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5 3.5 1.6 0.6 setosa

E.3 Arrange observations (rows)

# Descending order of Species (alphabetic), then ascending order of Petal.Width.
# order() sorts ascending, so negate the factor's integer codes (xtfrm) to
# reverse the Species key. rev() is not a substitute: it only flips the
# positions of the vector, which pairs each row with another row's species
# and sorts correctly only when the groups happen to be symmetric.
iris[order(-xtfrm(iris$Species), iris$Petal.Width), ]
# Tidyverse: desc() reverses the Species key; Petal.Width breaks ties ascending
iris %>%
  arrange(desc(Species), Petal.Width)

E.4 Summarise observations (rows)

# Build a one-row data.frame of summary statistics by hand;
# with() lets the columns be referenced without the `iris$` prefix
with(iris, data.frame(
  Petal.Length.mean = mean(Petal.Length),
  Petal.Length.sd = sd(Petal.Length),
  Sepal.Length.mean = mean(Sepal.Length),
  Sepal.Length.sd = sd(Sepal.Length)
))
Petal.Length.mean Petal.Length.sd Sepal.Length.mean Sepal.Length.sd
3.758 1.765298 5.843333 0.8280661
# Tidyverse: summarise() collapses iris to a single row of statistics
iris %>%
  summarise(
    Petal.Length.mean = mean(Petal.Length),
    Petal.Length.sd = sd(Petal.Length),
    Sepal.Length.mean = mean(Sepal.Length),
    Sepal.Length.sd = sd(Sepal.Length)
  )
Petal.Length.mean Petal.Length.sd Sepal.Length.mean Sepal.Length.sd
3.758 1.765298 5.843333 0.8280661

E.5 Grouped operations

# Split-apply: summarise every cyl/gear combination of mtcars on its own.
# Each call receives the rows of one group and returns a one-row data.frame;
# sd() is NA for a group that contains a single car.
mtcars_by <- by(
  mtcars,
  INDICES = list(mtcars$cyl, mtcars$gear),
  FUN = function(grp) {
    with(grp, data.frame(
      cyl = unique(cyl),
      gear = unique(gear),
      mpg.mean = mean(mpg),
      mpg.sd = sd(mpg),
      wt.mean = mean(wt),
      wt.sd = sd(wt)
    ))
  }
)

# Combine: stack the per-group rows into a single data.frame
do.call(rbind, mtcars_by)
cyl gear mpg.mean mpg.sd wt.mean wt.sd
4 3 21.500 NA 2.465000 NA
6 3 19.750 2.3334524 3.337500 0.1732412
8 3 15.050 2.7743959 4.104083 0.7683069
4 4 26.925 4.8073604 2.378125 0.6006243
6 4 19.750 1.5524175 3.093750 0.4131460
4 5 28.200 3.1112698 1.826500 0.4433560
6 5 19.700 NA 2.770000 NA
8 5 15.400 0.5656854 3.370000 0.2828427
# Tidyverse: one summary row per cyl/gear combination
group_by(mtcars, cyl, gear) %>%
  summarise(
    mpg.mean = mean(mpg),
    mpg.sd = sd(mpg),
    wt.mean = mean(wt),
    wt.sd = sd(wt)
  ) %>%
  ungroup() # drop the grouping so later steps work on the whole table
cyl gear mpg.mean mpg.sd wt.mean wt.sd
4 3 21.500 NA 2.465000 NA
4 4 26.925 4.8073604 2.378125 0.6006243
4 5 28.200 3.1112698 1.826500 0.4433560
6 3 19.750 2.3334524 3.337500 0.1732412
6 4 19.750 1.5524175 3.093750 0.4131460
6 5 19.700 NA 2.770000 NA
8 3 15.050 2.7743959 4.104083 0.7683069
8 5 15.400 0.5656854 3.370000 0.2828427

E.6 Create new columns as calculations

# Split-apply: within each species, centre Petal.Width on that species' mean
iris_by <- by(
  iris,
  INDICES = iris$Species,
  FUN = function(grp) {
    species_mean <- mean(grp$Petal.Width)
    grp[["Petal.Width.centered"]] <- grp$Petal.Width - species_mean
    grp
  }
)

# Combine: stack the per-species pieces back into one data.frame
do.call(rbind, iris_by)
# Tidyverse: inside a grouped mutate(), mean() is computed per species
group_by(iris, Species) %>%
  mutate(Petal.Width.centered = Petal.Width - mean(Petal.Width)) %>%
  ungroup() # drop the grouping so later steps work on the whole table

E.7 Filter rows with conditions evaluated by group

# Split-apply: within each species keep the rows whose Petal.Width equals the
# species maximum (ties are all kept)
widest_petals <- by(
  iris,
  INDICES = iris$Species,
  FUN = function(grp) {
    is_widest <- grp$Petal.Width == max(grp$Petal.Width)
    grp[is_widest, ]
  }
)

# Combine: stack the per-species results into one data.frame
do.call(rbind, widest_petals)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Petal.Ratio Sepal.Ratio
setosa 5.0 3.5 1.6 0.6 setosa 2.666667 1.428571
versicolor 5.9 3.2 4.8 1.8 versicolor 2.666667 1.843750
virginica.101 6.3 3.3 6.0 2.5 virginica 2.400000 1.909091
virginica.110 7.2 3.6 6.1 2.5 virginica 2.440000 2.000000
virginica.145 6.7 3.3 5.7 2.5 virginica 2.280000 2.030303
# Tidyverse: inside a grouped filter(), max() is computed per species, so the
# widest-petalled rows of every species are kept (ties included)
iris %>% 
  group_by(Species) %>% 
  filter(Petal.Width == max(Petal.Width)) %>% 
  ungroup() # remove any groupings from downstream analysis
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Petal.Ratio Sepal.Ratio
5.0 3.5 1.6 0.6 setosa 2.666667 1.428571
5.9 3.2 4.8 1.8 versicolor 2.666667 1.843750
6.3 3.3 6.0 2.5 virginica 2.400000 1.909091
7.2 3.6 6.1 2.5 virginica 2.440000 2.000000
6.7 3.3 5.7 2.5 virginica 2.280000 2.030303

E.8 Pivot data

# Base R: stack the four measurement columns into one "measurement" column.
# `times` supplies the labels written to the "trait" column; without it
# reshape() fills "trait" with the positions 1-4 instead of the trait names.
# iris has no "id" column, so reshape() creates one to identify each flower.
reshape(iris, 
        varying = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
        times = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
        timevar = "trait",
        idvar = "id",
        v.names = "measurement",
        direction = "long")
# Tidyverse: column names go to "trait", their values to "measurement"
iris %>%
  pivot_longer(
    cols = Sepal.Length:Petal.Width,
    names_to = "trait",
    values_to = "measurement"
  )