Select variables/cols (solutions)

Primary exercises

Apply the following to survey data:

Select personal information {name, age, gender, height} into a new tibble survey_personal_info.

survey_personal_info <- select(survey, name, age, gender, height)

Select personal information as in the previous exercise into a new tibble survey_personal_info, but with the first letter of each variable name in uppercase, e.g. Name, Age, etc.

survey_personal_info <- select(survey, Name=name, Age=age, Gender=gender, Height=height)

Reorder the variables in the survey dataset such that name, age and gender appear as the first, second and third columns, followed by the remaining variables.

select(survey, name,age,gender,everything())

# A tibble: 233 × 13
   name      age gender span1 span2 hand  fold    pulse clap    exercise smokes height m.i     
   <chr>   <dbl> <chr>  <dbl> <dbl> <chr> <chr>   <dbl> <chr>   <chr>    <chr>   <dbl> <chr>   
 1 Alyson   18.2 female  18.5  18   right right      92 left    some     never    173  metric  
 2 Todd     17.6 male    19.5  20.5 left  right     104 left    none     regul    178. imperial
 3 Gerald   16.9 male    18    13.3 right left       87 neither none     occas     NA  <NA>    
 4 Robert   20.3 male    18.8  18.9 right right      NA neither none     never    160  metric  
 5 Dustin   23.7 male    20    20   right neither    35 right   some     never    165  metric  
 6 Abby     21   female  18    17.7 right left       64 right   some     never    173. imperial
 7 Andre    18.8 male    17.7  17.7 right left       83 right   freq     never    183. imperial
 8 Michael  35.8 female  17    17.3 right right      74 right   freq     never    157  metric  
 9 Edward   19   male    20    19.5 right right      72 right   some     never    175  metric  
10 Carl     22.3 male    18.5  18.5 right right      90 right   some     never    167  metric  
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

Deselect variables that relate to hand and/or arm (e.g. span1, span2, hand, etc.). See also the description of the survey data.

select(survey, -span1,-span2,-hand,-fold,-clap)

# A tibble: 233 × 8
   name    gender pulse exercise smokes height m.i        age
   <chr>   <chr>  <dbl> <chr>    <chr>   <dbl> <chr>    <dbl>
 1 Alyson  female    92 some     never    173  metric    18.2
 2 Todd    male     104 none     regul    178. imperial  17.6
 3 Gerald  male      87 none     occas     NA  <NA>      16.9
 4 Robert  male      NA none     never    160  metric    20.3
 5 Dustin  male      35 some     never    165  metric    23.7
 6 Abby    female    64 some     never    173. imperial  21  
 7 Andre   male      83 freq     never    183. imperial  18.8
 8 Michael female    74 freq     never    157  metric    35.8
 9 Edward  male      72 some     never    175  metric    19  
10 Carl    male      90 some     never    167  metric    22.3
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

Select the top 20 names along with gender.

# 1)
survey_sub <- select(survey, name,gender)
head( survey_sub , 20)

# A tibble: 20 × 2
   name    gender
   <chr>   <chr> 
 1 Alyson  female
 2 Todd    male  
 3 Gerald  male  
 4 Robert  male  
 5 Dustin  male  
 6 Abby    female
 7 Andre   male  
 8 Michael female
 9 Edward  male  
10 Carl    male  
11 Noemi   female
12 Alfred  male  
13 Bernice female
14 Velma   female
15 Eddie   male  
16 Fern    female
17 Carolyn female
18 Virgil  male  
19 Ken     male  
20 Richard male

# 2) shorter solution without intermediate variable 'survey_sub' :
head(select(survey, name, gender), 20)

# A tibble: 20 × 2
   name    gender
   <chr>   <chr> 
 1 Alyson  female
 2 Todd    male  
 3 Gerald  male  
 4 Robert  male  
 5 Dustin  male  
 6 Abby    female
 7 Andre   male  
 8 Michael female
 9 Edward  male  
10 Carl    male  
11 Noemi   female
12 Alfred  male  
13 Bernice female
14 Velma   female
15 Eddie   male  
16 Fern    female
17 Carolyn female
18 Virgil  male  
19 Ken     male  
20 Richard male

Reproduce the following tibbles (note that variables are renamed and reshuffled):

6.1 First 5 observations.

# Remark: by enclosing select(...) as the first argument of 'head' function you 
# can avoid creating intermediate variables.

head( select(survey, SPAN1=span1, SPAN2=span2, everything()), 5)

# A tibble: 5 × 13
  SPAN1 SPAN2 name   gender hand  fold    pulse clap    exercise smokes height m.i        age
  <dbl> <dbl> <chr>  <chr>  <chr> <chr>   <dbl> <chr>   <chr>    <chr>   <dbl> <chr>    <dbl>
1  18.5  18   Alyson female right right      92 left    some     never    173  metric    18.2
2  19.5  20.5 Todd   male   left  right     104 left    none     regul    178. imperial  17.6
3  18    13.3 Gerald male   right left       87 neither none     occas     NA  <NA>      16.9
4  18.8  18.9 Robert male   right right      NA neither none     never    160  metric    20.3
5  20    20   Dustin male   right neither    35 right   some     never    165  metric    23.7

6.2 Last 3 observations.

tail( select(survey, Hand=hand,Fold=fold,Clap=clap, everything()), 3)

# A tibble: 3 × 13
  Hand  Fold  Clap  name   gender span1 span2 pulse exercise smokes height m.i      age
  <chr> <chr> <chr> <chr>  <chr>  <dbl> <dbl> <dbl> <chr>    <chr>   <dbl> <chr>  <dbl>
1 right right right Tracey female  17.5  16.5    NA some     never    170  metric  18.6
2 right right right Keith  male    21    21.5    90 some     never    183  metric  17.2
3 right right right Celina female  17.6  17.3    85 freq     never    168. metric  17.8

Extra exercises

Rename the m.i variable to system.

# 1) Very tedious, you need to type all the variable names and 
# only rename the 'm.i' variable to 'system'
#
select(survey , name, gender, span1, span2, hand, fold, pulse, clap, 
       exercise, smokes, height, system=m.i, age)

# A tibble: 233 × 13
   name    gender span1 span2 hand  fold    pulse clap    exercise smokes height system     age
   <chr>   <chr>  <dbl> <dbl> <chr> <chr>   <dbl> <chr>   <chr>    <chr>   <dbl> <chr>    <dbl>
 1 Alyson  female  18.5  18   right right      92 left    some     never    173  metric    18.2
 2 Todd    male    19.5  20.5 left  right     104 left    none     regul    178. imperial  17.6
 3 Gerald  male    18    13.3 right left       87 neither none     occas     NA  <NA>      16.9
 4 Robert  male    18.8  18.9 right right      NA neither none     never    160  metric    20.3
 5 Dustin  male    20    20   right neither    35 right   some     never    165  metric    23.7
 6 Abby    female  18    17.7 right left       64 right   some     never    173. imperial  21  
 7 Andre   male    17.7  17.7 right left       83 right   freq     never    183. imperial  18.8
 8 Michael female  17    17.3 right right      74 right   freq     never    157  metric    35.8
 9 Edward  male    20    19.5 right right      72 right   some     never    175  metric    19  
10 Carl    male    18.5  18.5 right right      90 right   some     never    167  metric    22.3
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

# 2) Shorter, but a side effect is that m.i (now system) is moved to 
# the front.
select(survey, system=m.i, everything())

# A tibble: 233 × 13
   system   name    gender span1 span2 hand  fold    pulse clap    exercise smokes height   age
   <chr>    <chr>   <chr>  <dbl> <dbl> <chr> <chr>   <dbl> <chr>   <chr>    <chr>   <dbl> <dbl>
 1 metric   Alyson  female  18.5  18   right right      92 left    some     never    173   18.2
 2 imperial Todd    male    19.5  20.5 left  right     104 left    none     regul    178.  17.6
 3 <NA>     Gerald  male    18    13.3 right left       87 neither none     occas     NA   16.9
 4 metric   Robert  male    18.8  18.9 right right      NA neither none     never    160   20.3
 5 metric   Dustin  male    20    20   right neither    35 right   some     never    165   23.7
 6 imperial Abby    female  18    17.7 right left       64 right   some     never    173.  21  
 7 imperial Andre   male    17.7  17.7 right left       83 right   freq     never    183.  18.8
 8 metric   Michael female  17    17.3 right right      74 right   freq     never    157   35.8
 9 metric   Edward  male    20    19.5 right right      72 right   some     never    175   19  
10 metric   Carl    male    18.5  18.5 right right      90 right   some     never    167   22.3
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

# 3) Use rename function (see ?dplyr::rename). 
rename(survey,system=m.i)

# A tibble: 233 × 13
   name    gender span1 span2 hand  fold    pulse clap    exercise smokes height system     age
   <chr>   <chr>  <dbl> <dbl> <chr> <chr>   <dbl> <chr>   <chr>    <chr>   <dbl> <chr>    <dbl>
 1 Alyson  female  18.5  18   right right      92 left    some     never    173  metric    18.2
 2 Todd    male    19.5  20.5 left  right     104 left    none     regul    178. imperial  17.6
 3 Gerald  male    18    13.3 right left       87 neither none     occas     NA  <NA>      16.9
 4 Robert  male    18.8  18.9 right right      NA neither none     never    160  metric    20.3
 5 Dustin  male    20    20   right neither    35 right   some     never    165  metric    23.7
 6 Abby    female  18    17.7 right left       64 right   some     never    173. imperial  21  
 7 Andre   male    17.7  17.7 right left       83 right   freq     never    183. imperial  18.8
 8 Michael female  17    17.3 right right      74 right   freq     never    157  metric    35.8
 9 Edward  male    20    19.5 right right      72 right   some     never    175  metric    19  
10 Carl    male    18.5  18.5 right right      90 right   some     never    167  metric    22.3
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

Select name along with all categorical variables into a new tibble survey_cats.

# Categorical data: variables which take on categories as values, e.g. 
# 
# gender   : {male, female}
# hand     : {left, right}
# fold     : {left, right, neither} 
# clap     : {left, right, neither} 
# exercise : {freq, some, none}
# smokes   : {heavy, regul, occas, never}
# m.i      : {metric, imperial}
#
# 
survey_cats <- select(survey, name, gender, hand, fold, clap, exercise, smokes, m.i)
survey_cats

# A tibble: 233 × 8
   name    gender hand  fold    clap    exercise smokes m.i     
   <chr>   <chr>  <chr> <chr>   <chr>   <chr>    <chr>  <chr>   
 1 Alyson  female right right   left    some     never  metric  
 2 Todd    male   left  right   left    none     regul  imperial
 3 Gerald  male   right left    neither none     occas  <NA>    
 4 Robert  male   right right   neither none     never  metric  
 5 Dustin  male   right neither right   some     never  metric  
 6 Abby    female right left    right   some     never  imperial
 7 Andre   male   right left    right   freq     never  imperial
 8 Michael female right right   right   freq     never  metric  
 9 Edward  male   right right   right   some     never  metric  
10 Carl    male   right right   right   some     never  metric  
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

Create a new tibble survey_nums with name and all numerical variables.

survey_nums <- select(survey, name, span1, span2, pulse, height, age)
survey_nums

# A tibble: 233 × 6
   name    span1 span2 pulse height   age
   <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl>
 1 Alyson   18.5  18      92   173   18.2
 2 Todd     19.5  20.5   104   178.  17.6
 3 Gerald   18    13.3    87    NA   16.9
 4 Robert   18.8  18.9    NA   160   20.3
 5 Dustin   20    20      35   165   23.7
 6 Abby     18    17.7    64   173.  21  
 7 Andre    17.7  17.7    83   183.  18.8
 8 Michael  17    17.3    74   157   35.8
 9 Edward   20    19.5    72   175   19  
10 Carl     18.5  18.5    90   167   22.3
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

For this exercise you’ll need an additional helper function, where(), which is
explained here.

4.1 Reproduce the result from the previous exercise (3) without typing out all numerical variable names. Hint: you’ll also need the is.numeric function (see ?is.numeric for help).

bind_cols(survey['name'], select(survey, where(is.numeric)))

# A tibble: 233 × 6
   name    span1 span2 pulse height   age
   <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl>
 1 Alyson   18.5  18      92   173   18.2
 2 Todd     19.5  20.5   104   178.  17.6
 3 Gerald   18    13.3    87    NA   16.9
 4 Robert   18.8  18.9    NA   160   20.3
 5 Dustin   20    20      35   165   23.7
 6 Abby     18    17.7    64   173.  21  
 7 Andre    17.7  17.7    83   183.  18.8
 8 Michael  17    17.3    74   157   35.8
 9 Edward   20    19.5    72   175   19  
10 Carl     18.5  18.5    90   167   22.3
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

4.2 Select all non-numerical variables.

# 1) 
select(survey,! where(is.numeric))

# A tibble: 233 × 8
   name    gender hand  fold    clap    exercise smokes m.i     
   <chr>   <chr>  <chr> <chr>   <chr>   <chr>    <chr>  <chr>   
 1 Alyson  female right right   left    some     never  metric  
 2 Todd    male   left  right   left    none     regul  imperial
 3 Gerald  male   right left    neither none     occas  <NA>    
 4 Robert  male   right right   neither none     never  metric  
 5 Dustin  male   right neither right   some     never  metric  
 6 Abby    female right left    right   some     never  imperial
 7 Andre   male   right left    right   freq     never  imperial
 8 Michael female right right   right   freq     never  metric  
 9 Edward  male   right right   right   some     never  metric  
10 Carl    male   right right   right   some     never  metric  
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

# 2) Since all non-numerical variables in survey are of type character, the following is also a correct solution.
select(survey, where(is.character))

# A tibble: 233 × 8
   name    gender hand  fold    clap    exercise smokes m.i     
   <chr>   <chr>  <chr> <chr>   <chr>   <chr>    <chr>  <chr>   
 1 Alyson  female right right   left    some     never  metric  
 2 Todd    male   left  right   left    none     regul  imperial
 3 Gerald  male   right left    neither none     occas  <NA>    
 4 Robert  male   right right   neither none     never  metric  
 5 Dustin  male   right neither right   some     never  metric  
 6 Abby    female right left    right   some     never  imperial
 7 Andre   male   right left    right   freq     never  imperial
 8 Michael female right right   right   freq     never  metric  
 9 Edward  male   right right   right   some     never  metric  
10 Carl    male   right right   right   some     never  metric  
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

Selection by pattern matching

In data sets with a large number of variables, finding variables becomes tedious. Several helper functions are available to speed up the variable name search.

starts_with(), ends_with() and contains()

The functions help to find fixed patterns in variable names:

# select variables starting with character 'a'
select(pulse, starts_with("a"))

# A tibble: 110 × 2
     age alcohol
   <dbl> <chr>  
 1    18 yes    
 2    19 yes    
 3    18 yes    
 4    18 yes    
 5    18 yes    
 6    22 yes    
 7    20 yes    
 8    18 yes    
 9    19 yes    
10    23 yes    
# … with 100 more rows
# ℹ Use `print(n = ...)` to see more rows

# select variables ending with 'e'
select(pulse, ends_with("e"))

# A tibble: 110 × 3
   name        age exercise
   <chr>     <dbl> <chr>   
 1 Bonnie       18 moderate
 2 Melanie      19 moderate
 3 Consuelo     18 high    
 4 Travis       18 high    
 5 Lauri        18 low     
 6 George       22 low     
 7 Cherry       20 moderate
 8 Francesca    18 moderate
 9 Sonja        19 high    
10 Troy         23 moderate
# … with 100 more rows
# ℹ Use `print(n = ...)` to see more rows

# select variables containing character 'i' 
select(pulse, contains("i"))

# A tibble: 110 × 4
   id     height weight exercise
   <chr>   <dbl>  <dbl> <chr>   
 1 1993_A    173     57 moderate
 2 1993_B    179     58 moderate
 3 1993_C    167     62 high    
 4 1993_D    195     84 high    
 5 1993_E    173     64 low     
 6 1993_F    184     74 low     
 7 1993_G    162     57 moderate
 8 1993_H    169     55 moderate
 9 1993_I    164     56 high    
10 1993_J    168     60 moderate
# … with 100 more rows
# ℹ Use `print(n = ...)` to see more rows

The helper functions can be combined with the logical operators {!,|,&}, which will be explained later. You have already encountered one of them, the negation operator !, in the lecture on Useful R functions. In short, it complements the result. For example, above we selected the variables starting with the character ‘a’ with select(pulse, starts_with("a")), which resulted in a tibble with the two variables age and alcohol. Putting ! in front of the helper function produces the complement of that result, namely all variables that do not start with ‘a’:

select(pulse, ! starts_with("a"))

# A tibble: 110 × 11
   id     name      height weight gender smokes exercise ran   pulse1 pulse2  year
   <chr>  <chr>      <dbl>  <dbl> <chr>  <chr>  <chr>    <chr>  <dbl>  <dbl> <dbl>
 1 1993_A Bonnie       173     57 female no     moderate sat       86     88  1993
 2 1993_B Melanie      179     58 female no     moderate ran       82    150  1993
 3 1993_C Consuelo     167     62 female no     high     ran       96    176  1993
 4 1993_D Travis       195     84 male   no     high     sat       71     73  1993
 5 1993_E Lauri        173     64 female no     low      sat       90     88  1993
 6 1993_F George       184     74 male   no     low      ran       78    141  1993
 7 1993_G Cherry       162     57 female no     moderate sat       68     72  1993
 8 1993_H Francesca    169     55 female no     moderate sat       71     77  1993
 9 1993_I Sonja        164     56 female no     high     sat       68     68  1993
10 1993_J Troy         168     60 male   no     moderate ran       88    150  1993
# … with 100 more rows
# ℹ Use `print(n = ...)` to see more rows

Note that age and alcohol do not occur in the result.

There are several other helper functions which fall beyond the scope of this lecture; visit here for more details.

Select variables, from survey data, by pattern matching.

5.1 Select variables that end with ‘e’.

select(survey, ends_with('e'))

# A tibble: 233 × 4
   name    pulse exercise   age
   <chr>   <dbl> <chr>    <dbl>
 1 Alyson     92 some      18.2
 2 Todd      104 none      17.6
 3 Gerald     87 none      16.9
 4 Robert     NA none      20.3
 5 Dustin     35 some      23.7
 6 Abby       64 some      21  
 7 Andre      83 freq      18.8
 8 Michael    74 freq      35.8
 9 Edward     72 some      19  
10 Carl       90 some      22.3
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

5.2 Select variables that start with ‘s’.

select(survey, starts_with('s'))

# A tibble: 233 × 3
   span1 span2 smokes
   <dbl> <dbl> <chr> 
 1  18.5  18   never 
 2  19.5  20.5 regul 
 3  18    13.3 occas 
 4  18.8  18.9 never 
 5  20    20   never 
 6  18    17.7 never 
 7  17.7  17.7 never 
 8  17    17.3 never 
 9  20    19.5 never 
10  18.5  18.5 never 
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

5.3 Select hand span variables using a helper function.

# 1)
select(survey, contains('span'))

# A tibble: 233 × 2
   span1 span2
   <dbl> <dbl>
 1  18.5  18  
 2  19.5  20.5
 3  18    13.3
 4  18.8  18.9
 5  20    20  
 6  18    17.7
 7  17.7  17.7
 8  17    17.3
 9  20    19.5
10  18.5  18.5
# … with 223 more rows
# ℹ Use `print(n = ...)` to see more rows

# 2) 
# select(survey, starts_with('span'))

↑ Lecture ⇄ Practice