Skip to contents

Adds tables named clinical_events, and optionally 'gp_clinical_values' and 'gp_scripts_names_and_quantities' to a SQLite database file (the latter 2 are only added if gp_clinical_path and/or gp_scripts_path respectively are provided). This is a long format table combining all clinical events data from a UK Biobank main dataset and the UK Biobank primary care clinical events dataset. Use clinical_events_sources() to see a list of all currently included clinical events sources. Expect this to take ~1 hour to finish running.

Usage

make_clinical_events_db(
  ukb_main_path,
  ukb_db_path,
  ukb_main_delim = "auto",
  gp_clinical_path = NULL,
  gp_scripts_path = NULL,
  ukb_data_dict = get_ukb_data_dict(),
  ukb_codings = get_ukb_codings(),
  overwrite = FALSE,
  chunk_size = 5e+05
)

Arguments

ukb_main_path

Path to the main UKB dataset file.

ukb_db_path

Path to the SQLite database file. The file name must end with '.db'. If no file with this name exists then one will be created.

ukb_main_delim

Delimiter for ukb_main_path. Default value is "auto".

gp_clinical_path

(Optional) path to the UKB primary care clinical events file (gp_clinical.txt).

gp_scripts_path

(Optional) path to the UKB primary care prescriptions file (gp_scripts.txt).

ukb_data_dict

The UKB data dictionary (available online at the UK Biobank data showcase. This should be a data frame where all columns are of type character.

ukb_codings

The UKB codings file (available online at the UK Biobank data showcase. This should be a data frame where all columns are of type character.

overwrite

If TRUE, then tables clinical_events and gp_clinical_values will be overwritten if they already exist in the database. Default value is FALSE.

chunk_size

The number of rows to include in each chunk when processing primary care datasets.

Value

Returns ukb_db_path invisibly.

Details

See the introduction to dbplyr vignette for getting started with databases and dplyr::dplyr.

Indexes are set on the source, code and eid columns in the clinical_events table for faster querying.

Examples

# dummy UKB data dictionary and codings
dummy_ukb_data_dict <- get_ukb_dummy("dummy_Data_Dictionary_Showcase.tsv")
dummy_ukb_codings <- get_ukb_dummy("dummy_Codings.tsv")

# file paths to dummy UKB main and primary care datasets
dummy_ukb_main_path <- get_ukb_dummy(
  "dummy_ukb_main.tsv",
  path_only = TRUE
)

dummy_gp_clinical_path <- get_ukb_dummy(
  "dummy_gp_clinical.txt",
  path_only = TRUE
)

dummy_gp_scripts_path <- get_ukb_dummy(
  "dummy_gp_scripts.txt",
  path_only = TRUE
)

# file path where SQLite database will be created
dummy_ukb_db_path <- file.path(tempdir(), "ukb.db")

# build database
suppressWarnings(make_clinical_events_db(
  ukb_main_path = dummy_ukb_main_path,
  gp_clinical_path = dummy_gp_clinical_path,
  gp_scripts_path = dummy_gp_scripts_path,
  ukb_db_path = dummy_ukb_db_path,
  ukb_data_dict = dummy_ukb_data_dict,
  ukb_codings = dummy_ukb_codings,
))
#> ***CREATING DATA DICTIONARY FOR UKB MAIN DATASET***
#> ***READING DIAGNOSIS COLUMNS FROM UKB MAIN DATASET INTO R***
#> STEP 1 of 2
#> Reading data into R
#> STEP 2 of 2
#> Renaming with descriptive column names
#> Time taken: 0 minutes, 0 seconds.
#> ***TIDYING CLINICAL EVENTS DATA***
#> Tidying clinical events for primary_death_icd10
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for secondary_death_icd10
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for self_report_medication
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for self_report_non_cancer
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for self_report_non_cancer_icd10
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for self_report_cancer
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for self_report_operation
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for cancer_register_icd9
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for cancer_register_icd10
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for summary_hes_icd9
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for summary_hes_icd10
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for summary_hes_opcs3
#> Time taken: 0 minutes, 0 seconds.
#> Tidying clinical events for summary_hes_opcs4
#> Time taken: 0 minutes, 0 seconds.
#> ***WRITING CLINICAL EVENTS FROM MAIN UKB DATASET TO `clinical_events` TABLE IN DATABASE***
#> ***APPENDING UKB PRIMARY CARE CLINICAL EVENTS DATA TO 'clinical_events' TABLE AND WRITING VALUE COLUMNS TO `gp_clinical_values` TABLE***
#> Writing file to table
#> Writing from line 1. Time taken: 0 minutes, 0 seconds
#> Complete. Time taken: 0 minutes, 0 seconds
#> ***APPENDING UKB PRIMARY CARE PRESCRIPTION DATA TO 'clinical_events' TABLE AND WRITING DRUG NAME AND QUANTITY COLUMNS TO `gp_scripts_names_and_quantities` TABLE***
#> Writing file to table
#> Writing from line 1. Time taken: 0 minutes, 0 seconds
#> Complete. Time taken: 0 minutes, 0 seconds
#> ***SETTING MULTICOLUMN INDEX ON `code`/`source`, AND INDEX ON `eid` COLUMNS IN UKB DATABASE 'clinical_events' TABLE***
#> SUCCESS! UKB DATABASE SETUP COMPLETE
#> To connect to db: `con <- DBI::dbConnect(RSQLite::SQLite(), '/tmp/RtmpXWKA05/ukb.db')`, then `ukbdb <- ukbwranglr::db_tables_to_list(con)`
#> Time taken: 0 minutes, 1 seconds.

# connect to database
con <- DBI::dbConnect(
  RSQLite::SQLite(),
  dummy_ukb_db_path
)

ukbdb <- db_tables_to_list(con)

# table names
names(ukbdb)
#> [1] "clinical_events"                 "gp_clinical_values"             
#> [3] "gp_scripts_names_and_quantities"

# view tables
ukbdb$clinical_events
#> # Source:   table<`clinical_events`> [?? x 5]
#> # Database: sqlite 3.45.2 [/tmp/RtmpXWKA05/ukb.db]
#>      eid source index code       date      
#>    <int> <chr>  <chr> <chr>      <chr>     
#>  1     1 f40001 0_0   X095       1917-10-08
#>  2     2 f40001 0_0   A162       1955-02-11
#>  3     1 f40001 1_0   X095       1910-02-19
#>  4     2 f40001 1_0   A162       1965-08-08
#>  5     1 f40002 0_0   W192       1917-10-08
#>  6     2 f40002 0_0   V374       1955-02-11
#>  7     1 f40002 1_3   X715       1910-02-19
#>  8     1 f20003 0_0   1140861958 1955-02-11
#>  9     2 f20003 0_0   1141146234 1965-08-08
#> 10     1 f20003 2_0   1141146188 1910-02-19
#> # ℹ more rows

ukbdb$gp_clinical_values
#> # Source:   table<`gp_clinical_values`> [?? x 4]
#> # Database: sqlite 3.45.2 [/tmp/RtmpXWKA05/ukb.db]
#>    index value1 value2 value3
#>    <chr> <chr>  <chr>  <chr> 
#>  1 1     1      2      3     
#>  2 2     1      2      3     
#>  3 3     1      2      3     
#>  4 4     1      2      3     
#>  5 5     1      2      3     
#>  6 6     1      2      3     
#>  7 7     NA     NA     NA    
#>  8 8     NA     NA     NA    
#>  9 9     NA     NA     NA    
#> 10 10    NA     NA     NA    
#> # ℹ more rows

ukbdb$gp_scripts_names_and_quantities
#> # Source:   table<`gp_scripts_names_and_quantities`> [6 x 3]
#> # Database: sqlite 3.45.2 [/tmp/RtmpXWKA05/ukb.db]
#>   index drug_name quantity
#>   <chr> <chr>     <chr>   
#> 1 1     drug2     50      
#> 2 2     NA        NA      
#> 3 3     drug2     30      
#> 4 4     drug2     30      
#> 5 5     drug2     30      
#> 6 6     2         30      

# close database connection
DBI::dbDisconnect(con)