loading_data.Rmd
happyR provides an easy framework to import hap.py results into R. The user can decide whether to upload results from a single sample, by providing a happy prefix path, or to bulk load multiple datasets via a samplesheet. In both cases, it is easy to extract relevant results using helper functions.
Let’s first explore how to load hap.py outputs from a single sample:
# define happy_prefix (the -o argument to hap.py, here: path/to/files/happy_demo)
extdata_dir <- system.file("extdata", package = "happyR")
happy_prefix <- sprintf("%s/happy_demo", extdata_dir)
# load hap.py results
hap_result <- read_happy(happy_prefix)
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
class(hap_result)
#> [1] "happy_result"
names(hap_result)
#> [1] "summary" "extended" "pr_curve"
hap_result
is now a happy_result
object with the following fields:
summary
(from summary.csv
) - a data.frame with high-level ALL / PASS numbersextended
(from extended.csv
) - a data.frame with region / subtype stratified metricspr_curve
(from roc.*.csv.gz
) - a happy_roc
object (a list of data.frames) containing precision-recall over quality scoreWe can query each item in our happy_result
object using standard R syntax:
hap_result$summary %>% head
#> # A tibble: 4 x 17
#> Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 INDEL ALL 8937 7839 1098 11812 343 3520
#> 2 INDEL PASS 8937 7550 1387 9971 283 1964
#> 3 SNP ALL 52494 52125 369 90092 582 37348
#> 4 SNP PASS 52494 46920 5574 48078 143 992
#> # … with 9 more variables: FP.gt <dbl>, METRIC.Recall <dbl>,
#> # METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>, METRIC.F1_Score <dbl>,
#> # TRUTH.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.TiTv_ratio <dbl>,
#> # TRUTH.TOTAL.het_hom_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>
hap_result$extended %>% head
#> # A tibble: 6 x 65
#> Type Subtype Subset Filter Genotype QQ.Field QQ METRIC.Recall
#> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 INDEL * * ALL * QUAL NA 0.877
#> 2 INDEL * * PASS * QUAL NA 0.845
#> 3 INDEL * TS_bo… ALL * QUAL NA 0.527
#> 4 INDEL * TS_bo… PASS * QUAL NA 0.438
#> 5 INDEL * TS_co… ALL * QUAL NA 0.914
#> 6 INDEL * TS_co… PASS * QUAL NA 0.887
#> # … with 57 more variables: METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>,
#> # METRIC.F1_Score <dbl>, FP.gt <int>, FP.al <int>, Subset.Size <dbl>,
#> # Subset.IS_CONF.Size <dbl>, Subset.Level <dbl>, TRUTH.TOTAL <int>,
#> # TRUTH.TOTAL.ti <dbl>, TRUTH.TOTAL.tv <dbl>, TRUTH.TOTAL.het <dbl>,
#> # TRUTH.TOTAL.homalt <dbl>, TRUTH.TOTAL.TiTv_ratio <dbl>,
#> # TRUTH.TOTAL.het_hom_ratio <dbl>, TRUTH.TP <int>, TRUTH.TP.ti <dbl>,
#> # TRUTH.TP.tv <dbl>, TRUTH.TP.het <dbl>, TRUTH.TP.homalt <dbl>,
#> # TRUTH.TP.TiTv_ratio <dbl>, TRUTH.TP.het_hom_ratio <dbl>,
#> # TRUTH.FN <int>, TRUTH.FN.ti <dbl>, TRUTH.FN.tv <dbl>,
#> # TRUTH.FN.het <dbl>, TRUTH.FN.homalt <dbl>, TRUTH.FN.TiTv_ratio <dbl>,
#> # TRUTH.FN.het_hom_ratio <dbl>, QUERY.TOTAL <int>, QUERY.TOTAL.ti <dbl>,
#> # QUERY.TOTAL.tv <dbl>, QUERY.TOTAL.het <dbl>, QUERY.TOTAL.homalt <dbl>,
#> # QUERY.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>,
#> # QUERY.TP <int>, QUERY.TP.ti <dbl>, QUERY.TP.tv <dbl>,
#> # QUERY.TP.het <dbl>, QUERY.TP.homalt <dbl>, QUERY.TP.TiTv_ratio <dbl>,
#> # QUERY.TP.het_hom_ratio <dbl>, QUERY.FP <int>, QUERY.FP.ti <dbl>,
#> # QUERY.FP.tv <dbl>, QUERY.FP.het <dbl>, QUERY.FP.homalt <dbl>,
#> # QUERY.FP.TiTv_ratio <dbl>, QUERY.FP.het_hom_ratio <dbl>,
#> # QUERY.UNK <int>, QUERY.UNK.ti <dbl>, QUERY.UNK.tv <dbl>,
#> # QUERY.UNK.het <dbl>, QUERY.UNK.homalt <dbl>,
#> # QUERY.UNK.TiTv_ratio <dbl>, QUERY.UNK.het_hom_ratio <dbl>
names(hap_result$pr_curve)
#> [1] "INDEL_SEL" "INDEL_PASS" "SNP" "all" "INDEL"
#> [6] "SNP_PASS" "SNP_SEL"
# e.g. here pr_curve$INDEL_PASS maps to happy_demo.roc.Locations.INDEL.PASS.csv.gz
hap_result$pr_curve$INDEL_PASS %>% head
#> # A tibble: 6 x 66
#> Type Subtype Subset Filter Genotype QQ.Field QQ METRIC.Recall
#> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 INDEL * * PASS * QUAL 0. 0.845
#> 2 INDEL * * PASS * QUAL 1.09e1 0.845
#> 3 INDEL * * PASS * QUAL 1.00e3 0.482
#> 4 INDEL * * PASS * QUAL 1.00e3 0.482
#> 5 INDEL * * PASS * QUAL 1.00e4 0.000224
#> 6 INDEL * * PASS * QUAL 1.00e3 0.482
#> # … with 58 more variables: METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>,
#> # METRIC.F1_Score <dbl>, FP.gt <int>, FP.al <int>, Subset.Size <int>,
#> # Subset.IS_CONF.Size <dbl>, Subset.Level <dbl>, TRUTH.TOTAL <int>,
#> # TRUTH.TOTAL.ti <dbl>, TRUTH.TOTAL.tv <dbl>, TRUTH.TOTAL.het <dbl>,
#> # TRUTH.TOTAL.homalt <dbl>, TRUTH.TOTAL.TiTv_ratio <dbl>,
#> # TRUTH.TOTAL.het_hom_ratio <dbl>, TRUTH.TP <int>, TRUTH.TP.ti <dbl>,
#> # TRUTH.TP.tv <dbl>, TRUTH.TP.het <dbl>, TRUTH.TP.homalt <dbl>,
#> # TRUTH.TP.TiTv_ratio <dbl>, TRUTH.TP.het_hom_ratio <dbl>,
#> # TRUTH.FN <int>, TRUTH.FN.ti <dbl>, TRUTH.FN.tv <dbl>,
#> # TRUTH.FN.het <dbl>, TRUTH.FN.homalt <dbl>, TRUTH.FN.TiTv_ratio <dbl>,
#> # TRUTH.FN.het_hom_ratio <dbl>, QUERY.TOTAL <int>, QUERY.TOTAL.ti <dbl>,
#> # QUERY.TOTAL.tv <dbl>, QUERY.TOTAL.het <dbl>, QUERY.TOTAL.homalt <dbl>,
#> # QUERY.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>,
#> # QUERY.TP <int>, QUERY.TP.ti <dbl>, QUERY.TP.tv <dbl>,
#> # QUERY.TP.het <dbl>, QUERY.TP.homalt <dbl>, QUERY.TP.TiTv_ratio <dbl>,
#> # QUERY.TP.het_hom_ratio <dbl>, QUERY.FP <int>, QUERY.FP.ti <dbl>,
#> # QUERY.FP.tv <dbl>, QUERY.FP.het <dbl>, QUERY.FP.homalt <dbl>,
#> # QUERY.FP.TiTv_ratio <dbl>, QUERY.FP.het_hom_ratio <dbl>,
#> # QUERY.UNK <int>, QUERY.UNK.ti <dbl>, QUERY.UNK.tv <dbl>,
#> # QUERY.UNK.het <dbl>, QUERY.UNK.homalt <dbl>,
#> # QUERY.UNK.TiTv_ratio <dbl>, QUERY.UNK.het_hom_ratio <dbl>,
#> # file_source <chr>
Or we can use the helper pr_data
function to query pr_curve
with advanced filters:
del_pr <- pr_data(hap_result, var_type = "indel", filter = "PASS", subtype = "*")
del_pr %>% head
#> # A tibble: 6 x 66
#> Type Subtype Subset Filter Genotype QQ.Field QQ METRIC.Recall
#> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 SNP * * PASS * QUAL 0 0.894
#> 2 SNP * * PASS * QUAL 10.2 0.894
#> 3 SNP * * PASS * QUAL 10.8 0.894
#> 4 SNP * * PASS * QUAL 100. 0.894
#> 5 SNP * * PASS * QUAL 101. 0.894
#> 6 SNP * * PASS * QUAL 1000. 0.442
#> # … with 58 more variables: METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>,
#> # METRIC.F1_Score <dbl>, FP.gt <int>, FP.al <int>, Subset.Size <int>,
#> # Subset.IS_CONF.Size <dbl>, Subset.Level <dbl>, TRUTH.TOTAL <int>,
#> # TRUTH.TOTAL.ti <dbl>, TRUTH.TOTAL.tv <dbl>, TRUTH.TOTAL.het <dbl>,
#> # TRUTH.TOTAL.homalt <dbl>, TRUTH.TOTAL.TiTv_ratio <dbl>,
#> # TRUTH.TOTAL.het_hom_ratio <dbl>, TRUTH.TP <int>, TRUTH.TP.ti <dbl>,
#> # TRUTH.TP.tv <dbl>, TRUTH.TP.het <dbl>, TRUTH.TP.homalt <dbl>,
#> # TRUTH.TP.TiTv_ratio <dbl>, TRUTH.TP.het_hom_ratio <dbl>,
#> # TRUTH.FN <int>, TRUTH.FN.ti <dbl>, TRUTH.FN.tv <dbl>,
#> # TRUTH.FN.het <dbl>, TRUTH.FN.homalt <dbl>, TRUTH.FN.TiTv_ratio <dbl>,
#> # TRUTH.FN.het_hom_ratio <dbl>, QUERY.TOTAL <int>, QUERY.TOTAL.ti <dbl>,
#> # QUERY.TOTAL.tv <dbl>, QUERY.TOTAL.het <dbl>, QUERY.TOTAL.homalt <dbl>,
#> # QUERY.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>,
#> # QUERY.TP <int>, QUERY.TP.ti <dbl>, QUERY.TP.tv <dbl>,
#> # QUERY.TP.het <dbl>, QUERY.TP.homalt <dbl>, QUERY.TP.TiTv_ratio <dbl>,
#> # QUERY.TP.het_hom_ratio <dbl>, QUERY.FP <int>, QUERY.FP.ti <dbl>,
#> # QUERY.FP.tv <dbl>, QUERY.FP.het <dbl>, QUERY.FP.homalt <dbl>,
#> # QUERY.FP.TiTv_ratio <dbl>, QUERY.FP.het_hom_ratio <dbl>,
#> # QUERY.UNK <int>, QUERY.UNK.ti <dbl>, QUERY.UNK.tv <dbl>,
#> # QUERY.UNK.het <dbl>, QUERY.UNK.homalt <dbl>,
#> # QUERY.UNK.TiTv_ratio <dbl>, QUERY.UNK.het_hom_ratio <dbl>,
#> # file_source <chr>
Often, we will want to inspect results from multiple samples, which we can achieve by passing a samplesheet to happyR:
# define happyr samplesheet
extdata_dir <- system.file("extdata", package = "happyR")
samplesheet <- readr::read_csv("group_id,replicate_id,happy_prefix
PCR-Free,NA12878-I30,NA12878-I30_S1
PCR-Free,NA12878-I33,NA12878-I33_S1
Nano,NA12878-R1,NA12878-R1_S1
Nano,NA12878-R2,NA12878-R2_S1
") %>%
mutate(happy_prefix = sprintf("%s/%s", extdata_dir, happy_prefix))
samplesheet
#> # A tibble: 4 x 3
#> group_id replicate_id happy_prefix
#> <chr> <chr> <chr>
#> 1 PCR-Free NA12878-I30 /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 2 PCR-Free NA12878-I33 /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 3 Nano NA12878-R1 /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 4 Nano NA12878-R2 /Library/Frameworks/R.framework/Versions/3.4/Resou…
# load hap.py results
hap_samplesheet <- read_samplesheet_(samplesheet)
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I30_S1.roc.Locations.SNP.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I30_S1.roc.Locations.SNP.PASS.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I30_S1.roc.Locations.SNP.SEL.csv.gz
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I33_S1.roc.Locations.SNP.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I33_S1.roc.Locations.SNP.PASS.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I33_S1.roc.Locations.SNP.SEL.csv.gz
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R1_S1.roc.Locations.SNP.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R1_S1.roc.Locations.SNP.PASS.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R1_S1.roc.Locations.SNP.SEL.csv.gz
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R2_S1.roc.Locations.SNP.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R2_S1.roc.Locations.SNP.PASS.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R2_S1.roc.Locations.SNP.SEL.csv.gz
# or directly from a samplesheet.csv
# hap_samplesheet <- read_samplesheet(samplesheet_path = "/path/to/happyr_samplesheet.csv")
class(hap_samplesheet)
#> [1] "happy_samplesheet"
names(hap_samplesheet)
#> [1] "samplesheet" "results"
hap_samplesheet
is a happy_samplesheet
object that contains the following fields:
samplesheet
: a data.frame with the original samplesheetresults
: a happy_result_list
that contains individual happy_result
objectsids
: a vector of result idsWe can query samplesheet
and ids
fields by directly accessing the relevant list items:
hap_samplesheet$samplesheet %>% head
#> # A tibble: 4 x 3
#> group_id replicate_id happy_prefix
#> <chr> <chr> <chr>
#> 1 PCR-Free NA12878-I30 /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 2 PCR-Free NA12878-I33 /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 3 Nano NA12878-R1 /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 4 Nano NA12878-R2 /Library/Frameworks/R.framework/Versions/3.4/Resou…
hap_samplesheet$ids %>% head
#> NULL
And access aggregate hap.py results with the extract_results
function:
summary <- extract_results(hap_samplesheet$results, table = "summary")
summary %>% head
#> # A tibble: 6 x 18
#> Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 INDEL ALL 8330 7655 675 15820 691 7319
#> 2 INDEL PASS 8330 7547 783 13324 397 5221
#> 3 SNP ALL 48280 46396 1884 61177 342 14416
#> 4 SNP PASS 48280 45606 2674 52699 152 6921
#> 5 INDEL ALL 8330 7629 701 15558 660 7119
#> 6 INDEL PASS 8330 7546 784 13362 394 5270
#> # … with 10 more variables: FP.gt <dbl>, METRIC.Recall <dbl>,
#> # METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>, METRIC.F1_Score <dbl>,
#> # TRUTH.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.TiTv_ratio <dbl>,
#> # TRUTH.TOTAL.het_hom_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>,
#> # happy_prefix <chr>
# see the extract_results documentation for a list of possible values for the table argument