Skip to contents
library(ukbwranglr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)
library(tidyselect)
library(readr)
library(tibble)

Overview

The basic workflow is as follows:

  1. Create a data dictionary for your main UK Biobank dataset with make_data_dict().
  2. Read selected variables into R with read_ukb().
  3. Summarise continuous variables with summarise_numerical_variables().
  4. Tidy clinical events data with tidy_clinical_events() or make_clinical_events_db(), and extract outcomes of interest with extract_phenotypes().
  5. Analyse.

These steps are now illustrated with a dummy dataset included with ukbwranglr:

# path to dummy data
ukb_main_path <- get_ukb_dummy("dummy_ukb_main.tsv",
                                     path_only = TRUE)

# raw data
read_tsv(ukb_main_path)
#> Rows: 10 Columns: 71
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: "\t"
#> chr  (14): 41270-0.0, 41270-0.3, 41271-0.0, 40001-0.0, 40001-1.0, 40002-0.0,...
#> dbl  (43): eid, 31-0.0, 34-0.0, 52-0.0, 21000-0.0, 21000-1.0, 21000-2.0, 210...
#> date (14): 41280-0.0, 41280-0.3, 41281-0.0, 41281-0.3, 40000-0.0, 40000-1.0,...
#> 
#>  Use `spec()` to retrieve the full column specification for this data.
#>  Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 10 × 71
#>      eid `31-0.0` `34-0.0` `52-0.0` `21000-0.0` `21000-1.0` `21000-2.0`
#>    <dbl>    <dbl>    <dbl>    <dbl>       <dbl>       <dbl>       <dbl>
#>  1     1        0     1952        8          -1           2        3003
#>  2     2        0     1946        3          -3        2001        3004
#>  3     3        1     1951        4           1        2002          -1
#>  4     4        0     1956        9        1001        2003        4001
#>  5     5       NA       NA        4        1002        2004        4002
#>  6     6        1     1948        2        1003           3        4003
#>  7     7        0     1949       12          NA        3001           5
#>  8     8        1     1956       10          NA           5          NA
#>  9     9        0     1962        4        4001          NA          NA
#> 10    10        1     1953        2        4001          NA          NA
#> # ℹ 64 more variables: `21001-0.0` <dbl>, `21001-1.0` <dbl>, `21001-2.0` <dbl>,
#> #   `4080-0.0` <dbl>, `4080-0.1` <dbl>, `4080-0.2` <dbl>, `4080-0.3` <dbl>,
#> #   `4080-1.0` <dbl>, `4080-1.1` <dbl>, `4080-1.2` <dbl>, `4080-1.3` <dbl>,
#> #   `20001-0.0` <dbl>, `20001-0.3` <dbl>, `20001-2.0` <dbl>, `20001-2.3` <dbl>,
#> #   `20002-0.0` <dbl>, `20002-0.3` <dbl>, `20002-2.0` <dbl>, `20002-2.3` <dbl>,
#> #   `20006-0.0` <dbl>, `20006-0.3` <dbl>, `20006-2.0` <dbl>, `20006-2.3` <dbl>,
#> #   `20008-0.0` <dbl>, `20008-0.3` <dbl>, `20008-2.0` <dbl>, …

You can try out the steps either locally by installing ukbwranglr on your own machine, or online by clicking on the following link to RStudio Cloud1 and navigating to this Rmd file in the ‘vignettes’ directory: Launch RStudio Cloud

Basic workflow

1. Create data dictionary

First download a copy of the UK Biobank data dictionary and codings files from the UK Biobank data showcase website. Dummy versions are used here:

# get required metadata
ukb_data_dict <- get_ukb_dummy("dummy_Data_Dictionary_Showcase.tsv")
ukb_codings <- get_ukb_dummy("dummy_Codings.tsv")

Then create a data dictionary using make_data_dict():

# make data dictionary
data_dict <- make_data_dict(ukb_main_path, 
                            ukb_data_dict = ukb_data_dict)

data_dict
#> # A tibble: 71 × 22
#>    descriptive_colnames     colheaders_raw colheaders_processed FieldID instance
#>    <chr>                    <chr>          <chr>                <chr>   <chr>   
#>  1 eid                      eid            feid                 eid     NA      
#>  2 sex_f31_0_0              31-0.0         f31_0_0              31      0       
#>  3 year_of_birth_f34_0_0    34-0.0         f34_0_0              34      0       
#>  4 month_of_birth_f52_0_0   52-0.0         f52_0_0              52      0       
#>  5 ethnic_background_f2100… 21000-0.0      f21000_0_0           21000   0       
#>  6 ethnic_background_f2100… 21000-1.0      f21000_1_0           21000   1       
#>  7 ethnic_background_f2100… 21000-2.0      f21000_2_0           21000   2       
#>  8 body_mass_index_bmi_f21… 21001-0.0      f21001_0_0           21001   0       
#>  9 body_mass_index_bmi_f21… 21001-1.0      f21001_1_0           21001   1       
#> 10 body_mass_index_bmi_f21… 21001-2.0      f21001_2_0           21001   2       
#> # ℹ 61 more rows
#> # ℹ 17 more variables: array <chr>, Path <chr>, Category <chr>, Field <chr>,
#> #   Participants <chr>, Items <chr>, Stability <chr>, ValueType <chr>,
#> #   Units <chr>, ItemType <chr>, Strata <chr>, Sexed <chr>, Instances <chr>,
#> #   Array <chr>, Coding <chr>, Notes <chr>, Link <chr>

2. Read selected variables into R

Read a main UK Biobank dataset into R using read_ukb():

read_ukb(path = ukb_main_path,
         ukb_data_dict = ukb_data_dict,
         ukb_codings = ukb_codings) %>% 
  # (convert to tibble for concise print method)
  as_tibble()
#> Creating data dictionary
#> STEP 1 of 3
#> Reading data into R
#> STEP 2 of 3
#> Renaming with descriptive column names
#> STEP 3 of 3
#> Applying variable and value labels
#> Labelling dataset
#> Time taken: 0 minutes, 0 seconds.
#> # A tibble: 10 × 71
#>      eid sex_f31_0_0 year_of_birth_f34_0_0 month_of_birth_f52_0_0
#>    <int> <fct>                       <int> <fct>                 
#>  1     1 Female                       1952 August                
#>  2     2 Female                       1946 March                 
#>  3     3 Male                         1951 April                 
#>  4     4 Female                       1956 September             
#>  5     5 NA                             NA April                 
#>  6     6 Male                         1948 February              
#>  7     7 Female                       1949 December              
#>  8     8 Male                         1956 October               
#>  9     9 Female                       1962 April                 
#> 10    10 Male                         1953 February              
#> # ℹ 67 more variables: ethnic_background_f21000_0_0 <fct>,
#> #   ethnic_background_f21000_1_0 <fct>, ethnic_background_f21000_2_0 <fct>,
#> #   body_mass_index_bmi_f21001_0_0 <dbl>, body_mass_index_bmi_f21001_1_0 <dbl>,
#> #   body_mass_index_bmi_f21001_2_0 <dbl>,
#> #   systolic_blood_pressure_automated_reading_f4080_0_0 <int>,
#> #   systolic_blood_pressure_automated_reading_f4080_0_1 <int>,
#> #   systolic_blood_pressure_automated_reading_f4080_0_2 <int>, …

A UK Biobank main dataset file is typically too large to fit into memory on a personal computer. Often however, only a subset of the data is required. To read a selection of variables, filter the data dictionary created with make_data_dict() for a subset of variables and supply this to read_ukb():2

# Filter data dictionary for sex, year of birth, body mass index, systolic blood pressure, self-reported non-cancer illness and summary hospital diagnoses (ICD10) fields
data_dict_selected <- data_dict %>%
  filter(FieldID %in% c(
    # Participant ID
    "eid",
    
    # Sex
    "31",
    
    # Year of birth
    "34",
    
    # Body mass index
    "21001",
    
    # Systolic blood pressure
    "4080",
    
    # Self-reported non-cancer medical conditions
    "20002",
    "20008",
    
    # Summary hospital diagnoses (ICD10)
    "41270",
    "41280"
  ))

# Read selected variables into R
ukb_main <- read_ukb(path = ukb_main_path,
         data_dict = data_dict_selected,
         ukb_data_dict = ukb_data_dict,
         ukb_codings = ukb_codings)
#> STEP 1 of 3
#> Reading data into R
#> STEP 2 of 3
#> Renaming with descriptive column names
#> STEP 3 of 3
#> Applying variable and value labels
#> Labelling dataset
#> Time taken: 0 minutes, 0 seconds.

# Convert to tibble for concise print method
as_tibble(ukb_main)
#> # A tibble: 10 × 26
#>      eid sex_f31_0_0 year_of_birth_f34_0_0 body_mass_index_bmi_f21001_0_0
#>    <int> <fct>                       <int>                          <dbl>
#>  1     1 Female                       1952                           20.1
#>  2     2 Female                       1946                           30.2
#>  3     3 Male                         1951                           22.8
#>  4     4 Female                       1956                           NA  
#>  5     5 NA                             NA                           29.3
#>  6     6 Male                         1948                           28.3
#>  7     7 Female                       1949                           NA  
#>  8     8 Male                         1956                           NA  
#>  9     9 Female                       1962                           25.4
#> 10    10 Male                         1953                           NA  
#> # ℹ 22 more variables: body_mass_index_bmi_f21001_1_0 <dbl>,
#> #   body_mass_index_bmi_f21001_2_0 <dbl>,
#> #   systolic_blood_pressure_automated_reading_f4080_0_0 <int>,
#> #   systolic_blood_pressure_automated_reading_f4080_0_1 <int>,
#> #   systolic_blood_pressure_automated_reading_f4080_0_2 <int>,
#> #   systolic_blood_pressure_automated_reading_f4080_0_3 <int>,
#> #   systolic_blood_pressure_automated_reading_f4080_1_0 <int>, …

3. Summarise continuous variables

Some variables such as body mass index and systolic blood pressure will have been measured on more than one occasion. In these cases it may be desirable to calculate a summary value (e.g. mean). Use summarise_numerical_variables():

# calculate the mean value across all repeated continuous variable measurements
ukb_main_numerical_vars_summarised <- summarise_numerical_variables(
  ukb_main = ukb_main,
  ukb_data_dict = ukb_data_dict,
  summary_function = "mean", 
  .drop = TRUE
) %>% 
  # reorder variables
  select(eid,
         sex_f31_0_0,
         year_of_birth_f34_0_0,
         mean_body_mass_index_bmi_x21001,
         mean_systolic_blood_pressure_automated_reading_x4080)
#> Number of summary columns to make: 3
#> Time taken: 0 minutes, 0 seconds.

as_tibble(ukb_main_numerical_vars_summarised)
#> # A tibble: 10 × 5
#>      eid sex_f31_0_0 year_of_birth_f34_0_0 mean_body_mass_index_bmi_x21001
#>    <int> <fct>                       <int>                           <dbl>
#>  1     1 Female                       1952                            20.5
#>  2     2 Female                       1946                            26.0
#>  3     3 Male                         1951                            25.8
#>  4     4 Female                       1956                           NaN  
#>  5     5 NA                             NA                            21.2
#>  6     6 Male                         1948                            28.6
#>  7     7 Female                       1949                           NaN  
#>  8     8 Male                         1956                           NaN  
#>  9     9 Female                       1962                            23.9
#> 10    10 Male                         1953                            27.6
#> # ℹ 1 more variable: mean_systolic_blood_pressure_automated_reading_x4080 <dbl>

4. Tidy clinical events data and extract outcomes of interest

Use tidy_clinical_events() to reshape clinical events fields (such as self-reported non-cancer medical conditions) to long format:

# tidy clinical events
clinical_events <- tidy_clinical_events(
  ukb_main = ukb_main, 
  ukb_data_dict = ukb_data_dict, 
  ukb_codings = ukb_codings,
  clinical_events_sources = c("self_report_non_cancer",
                              "summary_hes_icd10")
)
#> Tidying clinical events for self_report_non_cancer
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for summary_hes_icd10
#> Time taken: 0 minutes, 0 seconds.

# returns a named list of data frames
clinical_events
#> $self_report_non_cancer
#>      eid source  index   code       date
#>    <int> <char> <char> <char>     <char>
#> 1:     1 f20002    0_0   1665 1998-12-24
#> 2:     2 f20002    0_0   1383 2011-01-05
#> 3:     3 f20002    0_0   1665       <NA>
#> 4:     4 f20002    0_0   1383       <NA>
#> 5:     1 f20002    0_3   1223 2003-02-25
#> 6:     2 f20002    0_3   1352 2020-07-02
#> 7:     1 f20002    2_0   1514 2011-04-07
#> 8:     2 f20002    2_0   1447 1981-03-01
#> 9:     2 f20002    2_3   1165 1983-01-03
#> 
#> $summary_hes_icd10
#>      eid source  index   code       date
#>    <int> <char> <char> <char>     <char>
#> 1:     1 f41270    0_0   X715 1955-11-12
#> 2:     2 f41270    0_0    E11 1939-02-16
#> 3:     1 f41270    0_3    E10 1910-02-19
#> 4:     2 f41270    0_3  M0087 1965-08-08

# combine with dplyr
clinical_events <- dplyr::bind_rows(clinical_events)

clinical_events
#>       eid source  index   code       date
#>     <int> <char> <char> <char>     <char>
#>  1:     1 f20002    0_0   1665 1998-12-24
#>  2:     2 f20002    0_0   1383 2011-01-05
#>  3:     3 f20002    0_0   1665       <NA>
#>  4:     4 f20002    0_0   1383       <NA>
#>  5:     1 f20002    0_3   1223 2003-02-25
#>  6:     2 f20002    0_3   1352 2020-07-02
#>  7:     1 f20002    2_0   1514 2011-04-07
#>  8:     2 f20002    2_0   1447 1981-03-01
#>  9:     2 f20002    2_3   1165 1983-01-03
#> 10:     1 f41270    0_0   X715 1955-11-12
#> 11:     2 f41270    0_0    E11 1939-02-16
#> 12:     1 f41270    0_3    E10 1910-02-19
#> 13:     2 f41270    0_3  M0087 1965-08-08

To identify participants with a condition of interest, first decide which codes will capture this. For example, the following includes a (non-exhaustive) list of clinical codes for diabetes:3

example_clinical_codes()
#> # A tibble: 8 × 6
#>   disease  description                           category code_type code  author
#>   <chr>    <chr>                                 <chr>    <chr>     <chr> <chr> 
#> 1 Diabetes diabetes                              Diabete… data_cod… 1220  ukbwr 
#> 2 Diabetes gestational diabetes                  Gestati… data_cod… 1221  ukbwr 
#> 3 Diabetes type 1 diabetes                       Type 1 … data_cod… 1222  ukbwr 
#> 4 Diabetes type 2 diabetes                       Type 2 … data_cod… 1223  ukbwr 
#> 5 Diabetes Type 1 diabetes mellitus              Type 1 … icd10     E10   ukbwr 
#> 6 Diabetes Type 2 diabetes mellitus              Type 2 … icd10     E11   ukbwr 
#> 7 Diabetes Insulin dependent diabetes mellitus   Type 1 … read2     C108. ukbwr 
#> 8 Diabetes Non-insulin dependent diabetes melli… Type 2 … read2     C109. ukbwr

Supply this to extract_phenotypes() to filter for participants who have any matching clinical codes in their records. By default, only the earliest date is extracted:

# extract phenotypes
diabetes_cases <- extract_phenotypes(clinical_events = clinical_events,
                                     clinical_codes = example_clinical_codes())
#> Filtering for requested clinical codes/sources
#> Joining filtered events with clinical codelist
#> Time taken: 0 minutes, 0 seconds.

diabetes_cases
#>      eid source  index   code       date     code_type  disease  category
#>    <int> <char> <char> <char>     <char>        <char>   <char>    <char>
#> 1:     1 f20002    0_3   1223 2003-02-25 data_coding_6 Diabetes Type 2 DM
#> 2:     2 f41270    0_0    E11 1939-02-16         icd10 Diabetes Type 2 DM
#> 3:     1 f41270    0_3    E10 1910-02-19         icd10 Diabetes Type 1 DM
#>    author
#>    <char>
#> 1:  ukbwr
#> 2:  ukbwr
#> 3:  ukbwr

5. Analyse

Merge the output from steps 4 and 5:

ukb_main_processed <-
  # first summarise `diabetes_cases` - for each eid, get the earliest date
  diabetes_cases %>%
  group_by(eid,
           disease) %>%
  summarise(diabetes_min_date = min(date, na.rm = TRUE)) %>%
  ungroup() %>%
  
  # create indicator column for diabetes
  mutate(diabetes_indicator = case_when(!is.na(diabetes_min_date) ~ "Yes",
                                        TRUE ~ "No")) %>%
  
  # join with `ukb_main_numerical_vars_summarised`
  dplyr::full_join(ukb_main_numerical_vars_summarised,
                   by = "eid")
#> `summarise()` has grouped output by 'eid'. You can override using the `.groups`
#> argument.

ukb_main_processed
#> # A tibble: 10 × 8
#>      eid disease  diabetes_min_date diabetes_indicator sex_f31_0_0
#>    <int> <chr>    <chr>             <chr>              <fct>      
#>  1     1 Diabetes 1910-02-19        Yes                Female     
#>  2     2 Diabetes 1939-02-16        Yes                Female     
#>  3     3 NA       NA                NA                 Male       
#>  4     4 NA       NA                NA                 Female     
#>  5     5 NA       NA                NA                 NA         
#>  6     6 NA       NA                NA                 Male       
#>  7     7 NA       NA                NA                 Female     
#>  8     8 NA       NA                NA                 Male       
#>  9     9 NA       NA                NA                 Female     
#> 10    10 NA       NA                NA                 Male       
#> # ℹ 3 more variables: year_of_birth_f34_0_0 <int>,
#> #   mean_body_mass_index_bmi_x21001 <dbl>,
#> #   mean_systolic_blood_pressure_automated_reading_x4080 <dbl>

Describe:

ukb_main_processed %>%
  select(-eid) %>%
  group_by(diabetes_indicator) %>%
  summarise(pct_female = sum(sex_f31_0_0 == "Female", na.rm = TRUE) / n(),
            across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
#> # A tibble: 2 × 5
#>   diabetes_indicator pct_female year_of_birth_f34_0_0 mean_body_mass_index_bmi…¹
#>   <chr>                   <dbl>                 <dbl>                      <dbl>
#> 1 Yes                     1                     1949                        23.2
#> 2 NA                      0.375                 1954.                       25.4
#> # ℹ abbreviated name: ¹​mean_body_mass_index_bmi_x21001
#> # ℹ 1 more variable: mean_systolic_blood_pressure_automated_reading_x4080 <dbl>

Setup for multiple projects

The following setup is recommended to reduce duplicated steps between multiple projects that draw on the same datasets:

  1. Download the UK Biobank data dictionary and codings files (available from the UK Biobank data showcase) and for each new R project, place the following .Renviron file in the project root directory (replacing PATH/TO with the correct file paths):

    UKB_DATA_DICT=/PATH/TO/Data_Dictionary_Showcase.tsv
    
    UKB_CODINGS=/PATH/TO/Codings.tsv

    Functions with arguments ukb_data_dict and ukb_codings use get_ukb_data_dict() and get_ukb_codings() by default, which will automatically search for environmental variables UKB_DATA_DICT and UKB_CODINGS and read the files from these locations.

  2. Create a clinical events database using make_clinical_events_db(). This function includes the option to incorporate primary care data.4 Having connected to the database, phenotypes may be extracted with extract_phenotypes():

# build dummy clinical events SQLite DB in tempdir
ukb_db_path <- tempfile(fileext = ".db")

make_clinical_events_db(ukb_main_path = ukb_main_path,
                        ukb_db_path = ukb_db_path, 
                        gp_clinical_path = get_ukb_dummy("dummy_gp_clinical.txt",
                                                         path_only = TRUE),
                        gp_scripts_path = get_ukb_dummy("dummy_gp_scripts.txt",
                                                         path_only = TRUE), 
                        ukb_data_dict = ukb_data_dict,
                        ukb_codings = ukb_codings)
# Connect to the database
con <- DBI::dbConnect(RSQLite::SQLite(), 
                      ukb_db_path)

# Convert to a named list of dbplyr::tbl_dbi objects
ukbdb <- ukbwranglr::db_tables_to_list(con)

# Value columns (from `gp_clinical.txt`) and prescription names/quantities (from `gp_scripts.txt`) are stored separately from the main `clinical_events` table
ukbdb
#> $clinical_events
#> # Source:   table<`clinical_events`> [?? x 5]
#> # Database: sqlite 3.45.2 [/tmp/RtmpNoleal/file55e02bbba532.db]
#>      eid source index code       date      
#>    <int> <chr>  <chr> <chr>      <chr>     
#>  1     1 f40001 0_0   X095       1917-10-08
#>  2     2 f40001 0_0   A162       1955-02-11
#>  3     1 f40001 1_0   X095       1910-02-19
#>  4     2 f40001 1_0   A162       1965-08-08
#>  5     1 f40002 0_0   W192       1917-10-08
#>  6     2 f40002 0_0   V374       1955-02-11
#>  7     1 f40002 1_3   X715       1910-02-19
#>  8     1 f20003 0_0   1140861958 1955-02-11
#>  9     2 f20003 0_0   1141146234 1965-08-08
#> 10     1 f20003 2_0   1141146188 1910-02-19
#> # ℹ more rows
#> 
#> $gp_clinical_values
#> # Source:   table<`gp_clinical_values`> [?? x 4]
#> # Database: sqlite 3.45.2 [/tmp/RtmpNoleal/file55e02bbba532.db]
#>    index value1 value2 value3
#>    <chr> <chr>  <chr>  <chr> 
#>  1 1     1      2      3     
#>  2 2     1      2      3     
#>  3 3     1      2      3     
#>  4 4     1      2      3     
#>  5 5     1      2      3     
#>  6 6     1      2      3     
#>  7 7     NA     NA     NA    
#>  8 8     NA     NA     NA    
#>  9 9     NA     NA     NA    
#> 10 10    NA     NA     NA    
#> # ℹ more rows
#> 
#> $gp_scripts_names_and_quantities
#> # Source:   table<`gp_scripts_names_and_quantities`> [6 x 3]
#> # Database: sqlite 3.45.2 [/tmp/RtmpNoleal/file55e02bbba532.db]
#>   index drug_name quantity
#>   <chr> <chr>     <chr>   
#> 1 1     drug2     50      
#> 2 2     NA        NA      
#> 3 3     drug2     30      
#> 4 4     drug2     30      
#> 5 5     drug2     30      
#> 6 6     2         30
# extract phenotypes
diabetes_cases <-
  extract_phenotypes(clinical_events = ukbdb$clinical_events,
                     clinical_codes = example_clinical_codes(),
                     verbose = FALSE)

diabetes_cases
#> # A tibble: 8 × 9
#>     eid source       index code  date       code_type    disease category author
#>   <int> <chr>        <chr> <chr> <chr>      <chr>        <chr>   <chr>    <chr> 
#> 1     1 f20002       0_3   1223  2003-02-25 data_coding… Diabet… Type 2 … ukbwr 
#> 2     1 gpc1_r2      7     C108. 1990-10-01 read2        Diabet… Type 1 … ukbwr 
#> 3     1 gpc1_r2      11    C108. 1990-10-03 read2        Diabet… Type 1 … ukbwr 
#> 4     2 gpc2_r2      8     C109. 1990-10-02 read2        Diabet… Type 2 … ukbwr 
#> 5     2 gpc2_r2      12    C109. 1990-10-04 read2        Diabet… Type 2 … ukbwr 
#> 6     1 f41270       0_3   E10   1910-02-19 icd10        Diabet… Type 1 … ukbwr 
#> 7     1 f20002_icd10 0_3   E11   2003-02-25 icd10        Diabet… Type 2 … ukbwr 
#> 8     2 f41270       0_0   E11   1939-02-16 icd10        Diabet… Type 2 … ukbwr