library(ukbwranglr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
library(tidyselect)
library(readr)
library(tibble)
Overview
The basic workflow is as follows:
- Create a data dictionary for your main UK Biobank dataset with
make_data_dict()
. - Read selected variables into R with
read_ukb()
. - Summarise continuous variables with
summarise_numerical_variables()
. - Tidy clinical events data with
tidy_clinical_events()
ormake_clinical_events_db()
, and extract outcomes of interest withextract_phenotypes()
. - Analyse.
These steps are now illustrated with a dummy dataset included with ukbwranglr:
# path to dummy data
ukb_main_path <- get_ukb_dummy("dummy_ukb_main.tsv",
path_only = TRUE)
# raw data
read_tsv(ukb_main_path)
#> Rows: 10 Columns: 71
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: "\t"
#> chr (14): 41270-0.0, 41270-0.3, 41271-0.0, 40001-0.0, 40001-1.0, 40002-0.0,...
#> dbl (43): eid, 31-0.0, 34-0.0, 52-0.0, 21000-0.0, 21000-1.0, 21000-2.0, 210...
#> date (14): 41280-0.0, 41280-0.3, 41281-0.0, 41281-0.3, 40000-0.0, 40000-1.0,...
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 10 × 71
#> eid `31-0.0` `34-0.0` `52-0.0` `21000-0.0` `21000-1.0` `21000-2.0`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 1952 8 -1 2 3003
#> 2 2 0 1946 3 -3 2001 3004
#> 3 3 1 1951 4 1 2002 -1
#> 4 4 0 1956 9 1001 2003 4001
#> 5 5 NA NA 4 1002 2004 4002
#> 6 6 1 1948 2 1003 3 4003
#> 7 7 0 1949 12 NA 3001 5
#> 8 8 1 1956 10 NA 5 NA
#> 9 9 0 1962 4 4001 NA NA
#> 10 10 1 1953 2 4001 NA NA
#> # ℹ 64 more variables: `21001-0.0` <dbl>, `21001-1.0` <dbl>, `21001-2.0` <dbl>,
#> # `4080-0.0` <dbl>, `4080-0.1` <dbl>, `4080-0.2` <dbl>, `4080-0.3` <dbl>,
#> # `4080-1.0` <dbl>, `4080-1.1` <dbl>, `4080-1.2` <dbl>, `4080-1.3` <dbl>,
#> # `20001-0.0` <dbl>, `20001-0.3` <dbl>, `20001-2.0` <dbl>, `20001-2.3` <dbl>,
#> # `20002-0.0` <dbl>, `20002-0.3` <dbl>, `20002-2.0` <dbl>, `20002-2.3` <dbl>,
#> # `20006-0.0` <dbl>, `20006-0.3` <dbl>, `20006-2.0` <dbl>, `20006-2.3` <dbl>,
#> # `20008-0.0` <dbl>, `20008-0.3` <dbl>, `20008-2.0` <dbl>, …
You can try out the steps either locally by installing ukbwranglr on your own machine, or online by clicking on the following link to RStudio Cloud1 and navigating to this Rmd file in the ‘vignettes’ directory:
Basic workflow
1. Create data dictionary
First download a copy of the UK Biobank data dictionary and codings files from the UK Biobank data showcase website. Dummy versions are used here:
# get required metadata
ukb_data_dict <- get_ukb_dummy("dummy_Data_Dictionary_Showcase.tsv")
ukb_codings <- get_ukb_dummy("dummy_Codings.tsv")
Then create a data dictionary using
make_data_dict()
:
# make data dictionary
data_dict <- make_data_dict(ukb_main_path,
ukb_data_dict = ukb_data_dict)
data_dict
#> # A tibble: 71 × 22
#> descriptive_colnames colheaders_raw colheaders_processed FieldID instance
#> <chr> <chr> <chr> <chr> <chr>
#> 1 eid eid feid eid NA
#> 2 sex_f31_0_0 31-0.0 f31_0_0 31 0
#> 3 year_of_birth_f34_0_0 34-0.0 f34_0_0 34 0
#> 4 month_of_birth_f52_0_0 52-0.0 f52_0_0 52 0
#> 5 ethnic_background_f2100… 21000-0.0 f21000_0_0 21000 0
#> 6 ethnic_background_f2100… 21000-1.0 f21000_1_0 21000 1
#> 7 ethnic_background_f2100… 21000-2.0 f21000_2_0 21000 2
#> 8 body_mass_index_bmi_f21… 21001-0.0 f21001_0_0 21001 0
#> 9 body_mass_index_bmi_f21… 21001-1.0 f21001_1_0 21001 1
#> 10 body_mass_index_bmi_f21… 21001-2.0 f21001_2_0 21001 2
#> # ℹ 61 more rows
#> # ℹ 17 more variables: array <chr>, Path <chr>, Category <chr>, Field <chr>,
#> # Participants <chr>, Items <chr>, Stability <chr>, ValueType <chr>,
#> # Units <chr>, ItemType <chr>, Strata <chr>, Sexed <chr>, Instances <chr>,
#> # Array <chr>, Coding <chr>, Notes <chr>, Link <chr>
2. Read selected variables into R
Read a main UK Biobank dataset into R using
read_ukb()
:
read_ukb(path = ukb_main_path,
ukb_data_dict = ukb_data_dict,
ukb_codings = ukb_codings) %>%
# (convert to tibble for concise print method)
as_tibble()
#> Creating data dictionary
#> STEP 1 of 3
#> Reading data into R
#> STEP 2 of 3
#> Renaming with descriptive column names
#> STEP 3 of 3
#> Applying variable and value labels
#> Labelling dataset
#> Time taken: 0 minutes, 0 seconds.
#> # A tibble: 10 × 71
#> eid sex_f31_0_0 year_of_birth_f34_0_0 month_of_birth_f52_0_0
#> <int> <fct> <int> <fct>
#> 1 1 Female 1952 August
#> 2 2 Female 1946 March
#> 3 3 Male 1951 April
#> 4 4 Female 1956 September
#> 5 5 NA NA April
#> 6 6 Male 1948 February
#> 7 7 Female 1949 December
#> 8 8 Male 1956 October
#> 9 9 Female 1962 April
#> 10 10 Male 1953 February
#> # ℹ 67 more variables: ethnic_background_f21000_0_0 <fct>,
#> # ethnic_background_f21000_1_0 <fct>, ethnic_background_f21000_2_0 <fct>,
#> # body_mass_index_bmi_f21001_0_0 <dbl>, body_mass_index_bmi_f21001_1_0 <dbl>,
#> # body_mass_index_bmi_f21001_2_0 <dbl>,
#> # systolic_blood_pressure_automated_reading_f4080_0_0 <int>,
#> # systolic_blood_pressure_automated_reading_f4080_0_1 <int>,
#> # systolic_blood_pressure_automated_reading_f4080_0_2 <int>, …
A UK Biobank main dataset file is typically too large to fit into
memory on a personal computer. Often however, only a subset of the data
is required. To read a selection of variables, filter the data
dictionary created with make_data_dict()
for a subset of
variables and supply this to read_ukb()
:2
# Filter data dictionary for sex, year of birth, body mass index, systolic blood pressure, self-reported non-cancer illness and summary hospital diagnoses (ICD10) fields
data_dict_selected <- data_dict %>%
filter(FieldID %in% c(
# Participant ID
"eid",
# Sex
"31",
# Year of birth
"34",
# Body mass index
"21001",
# Systolic blood pressure
"4080",
# Self-reported non-cancer medical conditions
"20002",
"20008",
# Summary hospital diagnoses (ICD10)
"41270",
"41280"
))
# Read selected variables into R
ukb_main <- read_ukb(path = ukb_main_path,
data_dict = data_dict_selected,
ukb_data_dict = ukb_data_dict,
ukb_codings = ukb_codings)
#> STEP 1 of 3
#> Reading data into R
#> STEP 2 of 3
#> Renaming with descriptive column names
#> STEP 3 of 3
#> Applying variable and value labels
#> Labelling dataset
#> Time taken: 0 minutes, 0 seconds.
# Convert to tibble for concise print method
as_tibble(ukb_main)
#> # A tibble: 10 × 26
#> eid sex_f31_0_0 year_of_birth_f34_0_0 body_mass_index_bmi_f21001_0_0
#> <int> <fct> <int> <dbl>
#> 1 1 Female 1952 20.1
#> 2 2 Female 1946 30.2
#> 3 3 Male 1951 22.8
#> 4 4 Female 1956 NA
#> 5 5 NA NA 29.3
#> 6 6 Male 1948 28.3
#> 7 7 Female 1949 NA
#> 8 8 Male 1956 NA
#> 9 9 Female 1962 25.4
#> 10 10 Male 1953 NA
#> # ℹ 22 more variables: body_mass_index_bmi_f21001_1_0 <dbl>,
#> # body_mass_index_bmi_f21001_2_0 <dbl>,
#> # systolic_blood_pressure_automated_reading_f4080_0_0 <int>,
#> # systolic_blood_pressure_automated_reading_f4080_0_1 <int>,
#> # systolic_blood_pressure_automated_reading_f4080_0_2 <int>,
#> # systolic_blood_pressure_automated_reading_f4080_0_3 <int>,
#> # systolic_blood_pressure_automated_reading_f4080_1_0 <int>, …
3. Summarise continuous variables
Some variables such as body mass index and systolic blood pressure
will have been measured on more than one occasion. In these cases it may
be desirable to calculate a summary value (e.g. mean). Use
summarise_numerical_variables()
:
# calculate the mean value across all repeated continuous variable measurements
ukb_main_numerical_vars_summarised <- summarise_numerical_variables(
ukb_main = ukb_main,
ukb_data_dict = ukb_data_dict,
summary_function = "mean",
.drop = TRUE
) %>%
# reorder variables
select(eid,
sex_f31_0_0,
year_of_birth_f34_0_0,
mean_body_mass_index_bmi_x21001,
mean_systolic_blood_pressure_automated_reading_x4080)
#> Number of summary columns to make: 3
#> Time taken: 0 minutes, 0 seconds.
as_tibble(ukb_main_numerical_vars_summarised)
#> # A tibble: 10 × 5
#> eid sex_f31_0_0 year_of_birth_f34_0_0 mean_body_mass_index_bmi_x21001
#> <int> <fct> <int> <dbl>
#> 1 1 Female 1952 20.5
#> 2 2 Female 1946 26.0
#> 3 3 Male 1951 25.8
#> 4 4 Female 1956 NaN
#> 5 5 NA NA 21.2
#> 6 6 Male 1948 28.6
#> 7 7 Female 1949 NaN
#> 8 8 Male 1956 NaN
#> 9 9 Female 1962 23.9
#> 10 10 Male 1953 27.6
#> # ℹ 1 more variable: mean_systolic_blood_pressure_automated_reading_x4080 <dbl>
4. Tidy clinical events data and extract outcomes of interest
Use tidy_clinical_events()
to reshape clinical events
fields (such as self-reported non-cancer medical conditions) to long
format:
# tidy clinical events
clinical_events <- tidy_clinical_events(
ukb_main = ukb_main,
ukb_data_dict = ukb_data_dict,
ukb_codings = ukb_codings,
clinical_events_sources = c("self_report_non_cancer",
"summary_hes_icd10")
)
#> Tidying clinical events for self_report_non_cancer
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for summary_hes_icd10
#> Time taken: 0 minutes, 0 seconds.
# returns a named list of data frames
clinical_events
#> $self_report_non_cancer
#> eid source index code date
#> <int> <char> <char> <char> <char>
#> 1: 1 f20002 0_0 1665 1998-12-24
#> 2: 2 f20002 0_0 1383 2011-01-05
#> 3: 3 f20002 0_0 1665 <NA>
#> 4: 4 f20002 0_0 1383 <NA>
#> 5: 1 f20002 0_3 1223 2003-02-25
#> 6: 2 f20002 0_3 1352 2020-07-02
#> 7: 1 f20002 2_0 1514 2011-04-07
#> 8: 2 f20002 2_0 1447 1981-03-01
#> 9: 2 f20002 2_3 1165 1983-01-03
#>
#> $summary_hes_icd10
#> eid source index code date
#> <int> <char> <char> <char> <char>
#> 1: 1 f41270 0_0 X715 1955-11-12
#> 2: 2 f41270 0_0 E11 1939-02-16
#> 3: 1 f41270 0_3 E10 1910-02-19
#> 4: 2 f41270 0_3 M0087 1965-08-08
# combine with dplyr
clinical_events <- dplyr::bind_rows(clinical_events)
clinical_events
#> eid source index code date
#> <int> <char> <char> <char> <char>
#> 1: 1 f20002 0_0 1665 1998-12-24
#> 2: 2 f20002 0_0 1383 2011-01-05
#> 3: 3 f20002 0_0 1665 <NA>
#> 4: 4 f20002 0_0 1383 <NA>
#> 5: 1 f20002 0_3 1223 2003-02-25
#> 6: 2 f20002 0_3 1352 2020-07-02
#> 7: 1 f20002 2_0 1514 2011-04-07
#> 8: 2 f20002 2_0 1447 1981-03-01
#> 9: 2 f20002 2_3 1165 1983-01-03
#> 10: 1 f41270 0_0 X715 1955-11-12
#> 11: 2 f41270 0_0 E11 1939-02-16
#> 12: 1 f41270 0_3 E10 1910-02-19
#> 13: 2 f41270 0_3 M0087 1965-08-08
To identify participants with a condition of interest, first decide which codes will capture this. For example, the following includes a (non-exhaustive) list of clinical codes for diabetes:3
example_clinical_codes()
#> # A tibble: 8 × 6
#> disease description category code_type code author
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Diabetes diabetes Diabete… data_cod… 1220 ukbwr
#> 2 Diabetes gestational diabetes Gestati… data_cod… 1221 ukbwr
#> 3 Diabetes type 1 diabetes Type 1 … data_cod… 1222 ukbwr
#> 4 Diabetes type 2 diabetes Type 2 … data_cod… 1223 ukbwr
#> 5 Diabetes Type 1 diabetes mellitus Type 1 … icd10 E10 ukbwr
#> 6 Diabetes Type 2 diabetes mellitus Type 2 … icd10 E11 ukbwr
#> 7 Diabetes Insulin dependent diabetes mellitus Type 1 … read2 C108. ukbwr
#> 8 Diabetes Non-insulin dependent diabetes melli… Type 2 … read2 C109. ukbwr
Supply this to extract_phenotypes()
to filter for
participants who have any matching clinical codes in their records. By
default, only the earliest date is extracted:
# extract phenotypes
diabetes_cases <- extract_phenotypes(clinical_events = clinical_events,
clinical_codes = example_clinical_codes())
#> Filtering for requested clinical codes/sources
#> Joining filtered events with clinical codelist
#> Time taken: 0 minutes, 0 seconds.
diabetes_cases
#> eid source index code date code_type disease category
#> <int> <char> <char> <char> <char> <char> <char> <char>
#> 1: 1 f20002 0_3 1223 2003-02-25 data_coding_6 Diabetes Type 2 DM
#> 2: 2 f41270 0_0 E11 1939-02-16 icd10 Diabetes Type 2 DM
#> 3: 1 f41270 0_3 E10 1910-02-19 icd10 Diabetes Type 1 DM
#> author
#> <char>
#> 1: ukbwr
#> 2: ukbwr
#> 3: ukbwr
5. Analyse
Merge the output from steps 4 and 5:
ukb_main_processed <-
# first summarise `diabetes_cases` - for each eid, get the earliest date
diabetes_cases %>%
group_by(eid,
disease) %>%
summarise(diabetes_min_date = min(date, na.rm = TRUE)) %>%
ungroup() %>%
# create indicator column for diabetes
mutate(diabetes_indicator = case_when(!is.na(diabetes_min_date) ~ "Yes",
TRUE ~ "No")) %>%
# join with `ukb_main_numerical_vars_summarised`
dplyr::full_join(ukb_main_numerical_vars_summarised,
by = "eid")
#> `summarise()` has grouped output by 'eid'. You can override using the `.groups`
#> argument.
ukb_main_processed
#> # A tibble: 10 × 8
#> eid disease diabetes_min_date diabetes_indicator sex_f31_0_0
#> <int> <chr> <chr> <chr> <fct>
#> 1 1 Diabetes 1910-02-19 Yes Female
#> 2 2 Diabetes 1939-02-16 Yes Female
#> 3 3 NA NA NA Male
#> 4 4 NA NA NA Female
#> 5 5 NA NA NA NA
#> 6 6 NA NA NA Male
#> 7 7 NA NA NA Female
#> 8 8 NA NA NA Male
#> 9 9 NA NA NA Female
#> 10 10 NA NA NA Male
#> # ℹ 3 more variables: year_of_birth_f34_0_0 <int>,
#> # mean_body_mass_index_bmi_x21001 <dbl>,
#> # mean_systolic_blood_pressure_automated_reading_x4080 <dbl>
Describe:
ukb_main_processed %>%
select(-eid) %>%
group_by(diabetes_indicator) %>%
summarise(pct_female = sum(sex_f31_0_0 == "Female", na.rm = TRUE) / n(),
across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
#> # A tibble: 2 × 5
#> diabetes_indicator pct_female year_of_birth_f34_0_0 mean_body_mass_index_bmi…¹
#> <chr> <dbl> <dbl> <dbl>
#> 1 Yes 1 1949 23.2
#> 2 NA 0.375 1954. 25.4
#> # ℹ abbreviated name: ¹mean_body_mass_index_bmi_x21001
#> # ℹ 1 more variable: mean_systolic_blood_pressure_automated_reading_x4080 <dbl>
Setup for multiple projects
The following setup is recommended to reduce duplicated steps between multiple projects that draw on the same datasets:
-
Download the UK Biobank data dictionary and codings files (available from the UK Biobank data showcase) and for each new R project, place the following
.Renviron
file in the project root directory (replacingPATH/TO
with the correct file paths):UKB_DATA_DICT=/PATH/TO/Data_Dictionary_Showcase.tsv UKB_CODINGS=/PATH/TO/Codings.tsv
Functions with arguments
ukb_data_dict
andukb_codings
useget_ukb_data_dict()
andget_ukb_codings()
by default, which will automatically search for environmental variablesUKB_DATA_DICT
andUKB_CODINGS
and read the files from these locations. Create a clinical events database using
make_clinical_events_db()
. This function includes the option to incorporate primary care data.4 Having connected to the database, phenotypes may be extracted withextract_phenotypes()
:
# build dummy clinical events SQLite DB in tempdir
ukb_db_path <- tempfile(fileext = ".db")
make_clinical_events_db(ukb_main_path = ukb_main_path,
ukb_db_path = ukb_db_path,
gp_clinical_path = get_ukb_dummy("dummy_gp_clinical.txt",
path_only = TRUE),
gp_scripts_path = get_ukb_dummy("dummy_gp_scripts.txt",
path_only = TRUE),
ukb_data_dict = ukb_data_dict,
ukb_codings = ukb_codings)
# Connect to the database
con <- DBI::dbConnect(RSQLite::SQLite(),
ukb_db_path)
# Convert to a named list of dbplyr::tbl_dbi objects
ukbdb <- ukbwranglr::db_tables_to_list(con)
# Value columns (from `gp_clinical.txt`) and prescription names/quantities (from `gp_scripts.txt`) are stored separately from the main `clinical_events` table
ukbdb
#> $clinical_events
#> # Source: table<`clinical_events`> [?? x 5]
#> # Database: sqlite 3.45.2 [/tmp/RtmpNoleal/file55e02bbba532.db]
#> eid source index code date
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 f40001 0_0 X095 1917-10-08
#> 2 2 f40001 0_0 A162 1955-02-11
#> 3 1 f40001 1_0 X095 1910-02-19
#> 4 2 f40001 1_0 A162 1965-08-08
#> 5 1 f40002 0_0 W192 1917-10-08
#> 6 2 f40002 0_0 V374 1955-02-11
#> 7 1 f40002 1_3 X715 1910-02-19
#> 8 1 f20003 0_0 1140861958 1955-02-11
#> 9 2 f20003 0_0 1141146234 1965-08-08
#> 10 1 f20003 2_0 1141146188 1910-02-19
#> # ℹ more rows
#>
#> $gp_clinical_values
#> # Source: table<`gp_clinical_values`> [?? x 4]
#> # Database: sqlite 3.45.2 [/tmp/RtmpNoleal/file55e02bbba532.db]
#> index value1 value2 value3
#> <chr> <chr> <chr> <chr>
#> 1 1 1 2 3
#> 2 2 1 2 3
#> 3 3 1 2 3
#> 4 4 1 2 3
#> 5 5 1 2 3
#> 6 6 1 2 3
#> 7 7 NA NA NA
#> 8 8 NA NA NA
#> 9 9 NA NA NA
#> 10 10 NA NA NA
#> # ℹ more rows
#>
#> $gp_scripts_names_and_quantities
#> # Source: table<`gp_scripts_names_and_quantities`> [6 x 3]
#> # Database: sqlite 3.45.2 [/tmp/RtmpNoleal/file55e02bbba532.db]
#> index drug_name quantity
#> <chr> <chr> <chr>
#> 1 1 drug2 50
#> 2 2 NA NA
#> 3 3 drug2 30
#> 4 4 drug2 30
#> 5 5 drug2 30
#> 6 6 2 30
# extract phenotypes
diabetes_cases <-
extract_phenotypes(clinical_events = ukbdb$clinical_events,
clinical_codes = example_clinical_codes(),
verbose = FALSE)
diabetes_cases
#> # A tibble: 8 × 9
#> eid source index code date code_type disease category author
#> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 f20002 0_3 1223 2003-02-25 data_coding… Diabet… Type 2 … ukbwr
#> 2 1 gpc1_r2 7 C108. 1990-10-01 read2 Diabet… Type 1 … ukbwr
#> 3 1 gpc1_r2 11 C108. 1990-10-03 read2 Diabet… Type 1 … ukbwr
#> 4 2 gpc2_r2 8 C109. 1990-10-02 read2 Diabet… Type 2 … ukbwr
#> 5 2 gpc2_r2 12 C109. 1990-10-04 read2 Diabet… Type 2 … ukbwr
#> 6 1 f41270 0_3 E10 1910-02-19 icd10 Diabet… Type 1 … ukbwr
#> 7 1 f20002_icd10 0_3 E11 2003-02-25 icd10 Diabet… Type 2 … ukbwr
#> 8 2 f41270 0_0 E11 1939-02-16 icd10 Diabet… Type 2 … ukbwr