Loading hap.py results

happyR provides an easy framework to import hap.py results into R. The user can decide whether to upload results from a single sample, by providing a happy prefix path, or to bulk load multiple datasets via a samplesheet. In both cases, it is easy to extract relevant results using helper functions.

Set up

library(happyR)
library(tidyverse)

Loading and querying individual hap.py results

Let’s first explore how to load hap.py outputs from a single sample:

# define happy_prefix (the -o argument to hap.py, here: path/to/files/happy_demo)
extdata_dir <- system.file("extdata", package = "happyR")
happy_prefix <- sprintf("%s/happy_demo", extdata_dir)

# load hap.py results
hap_result <- read_happy(happy_prefix)
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
class(hap_result)
#> [1] "happy_result"
names(hap_result)
#> [1] "summary"  "extended" "pr_curve"

hap_result is now a happy_result object with the following fields:

summary (from summary.csv) - a data.frame with high-level ALL / PASS numbers
extended (from extended.csv) - a data.frame with region / subtype stratified metrics
pr_curve (from roc.*.csv.gz) - a happy_roc object (a list of data.frames) containing precision-recall over quality score

We can query each item in our happy_result object using standard R syntax:

hap_result$summary %>% head
#> # A tibble: 4 x 17
#>   Type  Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK
#>   <chr> <chr>        <dbl>    <dbl>    <dbl>       <dbl>    <dbl>     <dbl>
#> 1 INDEL ALL           8937     7839     1098       11812      343      3520
#> 2 INDEL PASS          8937     7550     1387        9971      283      1964
#> 3 SNP   ALL          52494    52125      369       90092      582     37348
#> 4 SNP   PASS         52494    46920     5574       48078      143       992
#> # … with 9 more variables: FP.gt <dbl>, METRIC.Recall <dbl>,
#> #   METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>, METRIC.F1_Score <dbl>,
#> #   TRUTH.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.TiTv_ratio <dbl>,
#> #   TRUTH.TOTAL.het_hom_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>

hap_result$extended %>% head
#> # A tibble: 6 x 65
#>   Type  Subtype Subset Filter Genotype QQ.Field    QQ METRIC.Recall
#>   <chr> <chr>   <chr>  <chr>  <chr>    <chr>    <dbl>         <dbl>
#> 1 INDEL *       *      ALL    *        QUAL        NA         0.877
#> 2 INDEL *       *      PASS   *        QUAL        NA         0.845
#> 3 INDEL *       TS_bo… ALL    *        QUAL        NA         0.527
#> 4 INDEL *       TS_bo… PASS   *        QUAL        NA         0.438
#> 5 INDEL *       TS_co… ALL    *        QUAL        NA         0.914
#> 6 INDEL *       TS_co… PASS   *        QUAL        NA         0.887
#> # … with 57 more variables: METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>,
#> #   METRIC.F1_Score <dbl>, FP.gt <int>, FP.al <int>, Subset.Size <dbl>,
#> #   Subset.IS_CONF.Size <dbl>, Subset.Level <dbl>, TRUTH.TOTAL <int>,
#> #   TRUTH.TOTAL.ti <dbl>, TRUTH.TOTAL.tv <dbl>, TRUTH.TOTAL.het <dbl>,
#> #   TRUTH.TOTAL.homalt <dbl>, TRUTH.TOTAL.TiTv_ratio <dbl>,
#> #   TRUTH.TOTAL.het_hom_ratio <dbl>, TRUTH.TP <int>, TRUTH.TP.ti <dbl>,
#> #   TRUTH.TP.tv <dbl>, TRUTH.TP.het <dbl>, TRUTH.TP.homalt <dbl>,
#> #   TRUTH.TP.TiTv_ratio <dbl>, TRUTH.TP.het_hom_ratio <dbl>,
#> #   TRUTH.FN <int>, TRUTH.FN.ti <dbl>, TRUTH.FN.tv <dbl>,
#> #   TRUTH.FN.het <dbl>, TRUTH.FN.homalt <dbl>, TRUTH.FN.TiTv_ratio <dbl>,
#> #   TRUTH.FN.het_hom_ratio <dbl>, QUERY.TOTAL <int>, QUERY.TOTAL.ti <dbl>,
#> #   QUERY.TOTAL.tv <dbl>, QUERY.TOTAL.het <dbl>, QUERY.TOTAL.homalt <dbl>,
#> #   QUERY.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>,
#> #   QUERY.TP <int>, QUERY.TP.ti <dbl>, QUERY.TP.tv <dbl>,
#> #   QUERY.TP.het <dbl>, QUERY.TP.homalt <dbl>, QUERY.TP.TiTv_ratio <dbl>,
#> #   QUERY.TP.het_hom_ratio <dbl>, QUERY.FP <int>, QUERY.FP.ti <dbl>,
#> #   QUERY.FP.tv <dbl>, QUERY.FP.het <dbl>, QUERY.FP.homalt <dbl>,
#> #   QUERY.FP.TiTv_ratio <dbl>, QUERY.FP.het_hom_ratio <dbl>,
#> #   QUERY.UNK <int>, QUERY.UNK.ti <dbl>, QUERY.UNK.tv <dbl>,
#> #   QUERY.UNK.het <dbl>, QUERY.UNK.homalt <dbl>,
#> #   QUERY.UNK.TiTv_ratio <dbl>, QUERY.UNK.het_hom_ratio <dbl>

names(hap_result$pr_curve)
#> [1] "INDEL_SEL"  "INDEL_PASS" "SNP"        "all"        "INDEL"     
#> [6] "SNP_PASS"   "SNP_SEL"
# e.g. here pr_curve$INDEL_PASS maps to happy_demo.roc.Locations.INDEL.PASS.csv.gz
hap_result$pr_curve$INDEL_PASS %>% head
#> # A tibble: 6 x 66
#>   Type  Subtype Subset Filter Genotype QQ.Field     QQ METRIC.Recall
#>   <chr> <chr>   <chr>  <chr>  <chr>    <chr>     <dbl>         <dbl>
#> 1 INDEL *       *      PASS   *        QUAL     0.          0.845   
#> 2 INDEL *       *      PASS   *        QUAL     1.09e1      0.845   
#> 3 INDEL *       *      PASS   *        QUAL     1.00e3      0.482   
#> 4 INDEL *       *      PASS   *        QUAL     1.00e3      0.482   
#> 5 INDEL *       *      PASS   *        QUAL     1.00e4      0.000224
#> 6 INDEL *       *      PASS   *        QUAL     1.00e3      0.482   
#> # … with 58 more variables: METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>,
#> #   METRIC.F1_Score <dbl>, FP.gt <int>, FP.al <int>, Subset.Size <int>,
#> #   Subset.IS_CONF.Size <dbl>, Subset.Level <dbl>, TRUTH.TOTAL <int>,
#> #   TRUTH.TOTAL.ti <dbl>, TRUTH.TOTAL.tv <dbl>, TRUTH.TOTAL.het <dbl>,
#> #   TRUTH.TOTAL.homalt <dbl>, TRUTH.TOTAL.TiTv_ratio <dbl>,
#> #   TRUTH.TOTAL.het_hom_ratio <dbl>, TRUTH.TP <int>, TRUTH.TP.ti <dbl>,
#> #   TRUTH.TP.tv <dbl>, TRUTH.TP.het <dbl>, TRUTH.TP.homalt <dbl>,
#> #   TRUTH.TP.TiTv_ratio <dbl>, TRUTH.TP.het_hom_ratio <dbl>,
#> #   TRUTH.FN <int>, TRUTH.FN.ti <dbl>, TRUTH.FN.tv <dbl>,
#> #   TRUTH.FN.het <dbl>, TRUTH.FN.homalt <dbl>, TRUTH.FN.TiTv_ratio <dbl>,
#> #   TRUTH.FN.het_hom_ratio <dbl>, QUERY.TOTAL <int>, QUERY.TOTAL.ti <dbl>,
#> #   QUERY.TOTAL.tv <dbl>, QUERY.TOTAL.het <dbl>, QUERY.TOTAL.homalt <dbl>,
#> #   QUERY.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>,
#> #   QUERY.TP <int>, QUERY.TP.ti <dbl>, QUERY.TP.tv <dbl>,
#> #   QUERY.TP.het <dbl>, QUERY.TP.homalt <dbl>, QUERY.TP.TiTv_ratio <dbl>,
#> #   QUERY.TP.het_hom_ratio <dbl>, QUERY.FP <int>, QUERY.FP.ti <dbl>,
#> #   QUERY.FP.tv <dbl>, QUERY.FP.het <dbl>, QUERY.FP.homalt <dbl>,
#> #   QUERY.FP.TiTv_ratio <dbl>, QUERY.FP.het_hom_ratio <dbl>,
#> #   QUERY.UNK <int>, QUERY.UNK.ti <dbl>, QUERY.UNK.tv <dbl>,
#> #   QUERY.UNK.het <dbl>, QUERY.UNK.homalt <dbl>,
#> #   QUERY.UNK.TiTv_ratio <dbl>, QUERY.UNK.het_hom_ratio <dbl>,
#> #   file_source <chr>

Or we can use the helper pr_data function to query pr_curve with advanced filters:

del_pr <- pr_data(hap_result, var_type = "indel", filter = "PASS", subtype = "*")
del_pr %>% head
#> # A tibble: 6 x 66
#>   Type  Subtype Subset Filter Genotype QQ.Field     QQ METRIC.Recall
#>   <chr> <chr>   <chr>  <chr>  <chr>    <chr>     <dbl>         <dbl>
#> 1 SNP   *       *      PASS   *        QUAL        0           0.894
#> 2 SNP   *       *      PASS   *        QUAL       10.2         0.894
#> 3 SNP   *       *      PASS   *        QUAL       10.8         0.894
#> 4 SNP   *       *      PASS   *        QUAL      100.          0.894
#> 5 SNP   *       *      PASS   *        QUAL      101.          0.894
#> 6 SNP   *       *      PASS   *        QUAL     1000.          0.442
#> # … with 58 more variables: METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>,
#> #   METRIC.F1_Score <dbl>, FP.gt <int>, FP.al <int>, Subset.Size <int>,
#> #   Subset.IS_CONF.Size <dbl>, Subset.Level <dbl>, TRUTH.TOTAL <int>,
#> #   TRUTH.TOTAL.ti <dbl>, TRUTH.TOTAL.tv <dbl>, TRUTH.TOTAL.het <dbl>,
#> #   TRUTH.TOTAL.homalt <dbl>, TRUTH.TOTAL.TiTv_ratio <dbl>,
#> #   TRUTH.TOTAL.het_hom_ratio <dbl>, TRUTH.TP <int>, TRUTH.TP.ti <dbl>,
#> #   TRUTH.TP.tv <dbl>, TRUTH.TP.het <dbl>, TRUTH.TP.homalt <dbl>,
#> #   TRUTH.TP.TiTv_ratio <dbl>, TRUTH.TP.het_hom_ratio <dbl>,
#> #   TRUTH.FN <int>, TRUTH.FN.ti <dbl>, TRUTH.FN.tv <dbl>,
#> #   TRUTH.FN.het <dbl>, TRUTH.FN.homalt <dbl>, TRUTH.FN.TiTv_ratio <dbl>,
#> #   TRUTH.FN.het_hom_ratio <dbl>, QUERY.TOTAL <int>, QUERY.TOTAL.ti <dbl>,
#> #   QUERY.TOTAL.tv <dbl>, QUERY.TOTAL.het <dbl>, QUERY.TOTAL.homalt <dbl>,
#> #   QUERY.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>,
#> #   QUERY.TP <int>, QUERY.TP.ti <dbl>, QUERY.TP.tv <dbl>,
#> #   QUERY.TP.het <dbl>, QUERY.TP.homalt <dbl>, QUERY.TP.TiTv_ratio <dbl>,
#> #   QUERY.TP.het_hom_ratio <dbl>, QUERY.FP <int>, QUERY.FP.ti <dbl>,
#> #   QUERY.FP.tv <dbl>, QUERY.FP.het <dbl>, QUERY.FP.homalt <dbl>,
#> #   QUERY.FP.TiTv_ratio <dbl>, QUERY.FP.het_hom_ratio <dbl>,
#> #   QUERY.UNK <int>, QUERY.UNK.ti <dbl>, QUERY.UNK.tv <dbl>,
#> #   QUERY.UNK.het <dbl>, QUERY.UNK.homalt <dbl>,
#> #   QUERY.UNK.TiTv_ratio <dbl>, QUERY.UNK.het_hom_ratio <dbl>,
#> #   file_source <chr>

Aggregating results from multiple samples

Often, we will want to inspect results from multiple samples, which we can achieve by passing a samplesheet to happyR:

# define happyr samplesheet
extdata_dir <- system.file("extdata", package = "happyR")
samplesheet <- readr::read_csv("group_id,replicate_id,happy_prefix
PCR-Free,NA12878-I30,NA12878-I30_S1
PCR-Free,NA12878-I33,NA12878-I33_S1
Nano,NA12878-R1,NA12878-R1_S1
Nano,NA12878-R2,NA12878-R2_S1
") %>% 
mutate(happy_prefix = sprintf("%s/%s", extdata_dir, happy_prefix))

samplesheet
#> # A tibble: 4 x 3
#>   group_id replicate_id happy_prefix                                       
#>   <chr>    <chr>        <chr>                                              
#> 1 PCR-Free NA12878-I30  /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 2 PCR-Free NA12878-I33  /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 3 Nano     NA12878-R1   /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 4 Nano     NA12878-R2   /Library/Frameworks/R.framework/Versions/3.4/Resou…

# load hap.py results
hap_samplesheet <- read_samplesheet_(samplesheet)
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I30_S1.roc.Locations.SNP.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I30_S1.roc.Locations.SNP.PASS.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I30_S1.roc.Locations.SNP.SEL.csv.gz
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I33_S1.roc.Locations.SNP.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I33_S1.roc.Locations.SNP.PASS.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-I33_S1.roc.Locations.SNP.SEL.csv.gz
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R1_S1.roc.Locations.SNP.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R1_S1.roc.Locations.SNP.PASS.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R1_S1.roc.Locations.SNP.SEL.csv.gz
#> Reading summary table
#> Reading extended table
#> Reading precision-recall curve data
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R2_S1.roc.Locations.SNP.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R2_S1.roc.Locations.SNP.PASS.csv.gz
#> Missing file: /Library/Frameworks/R.framework/Versions/3.4/Resources/library/happyR/extdata/NA12878-R2_S1.roc.Locations.SNP.SEL.csv.gz
# or directly from a samplesheet.csv
# hap_samplesheet <- read_samplesheet(samplesheet_path = "/path/to/happyr_samplesheet.csv")
class(hap_samplesheet)
#> [1] "happy_samplesheet"
names(hap_samplesheet)
#> [1] "samplesheet" "results"

hap_samplesheet is a happy_samplesheet object that contains the following fields:

samplesheet: a data.frame with the original samplesheet
results: a happy_result_list that contains individual happy_result objects
ids: a vector of result ids

We can query samplesheet and ids fields by directly accessing the relevant list items:

hap_samplesheet$samplesheet %>% head
#> # A tibble: 4 x 3
#>   group_id replicate_id happy_prefix                                       
#>   <chr>    <chr>        <chr>                                              
#> 1 PCR-Free NA12878-I30  /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 2 PCR-Free NA12878-I33  /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 3 Nano     NA12878-R1   /Library/Frameworks/R.framework/Versions/3.4/Resou…
#> 4 Nano     NA12878-R2   /Library/Frameworks/R.framework/Versions/3.4/Resou…
hap_samplesheet$ids %>% head
#> NULL

And access aggregate hap.py results with the extract_results function:

summary <- extract_results(hap_samplesheet$results, table = "summary")
summary %>% head
#> # A tibble: 6 x 18
#>   Type  Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK
#>   <chr> <chr>        <dbl>    <dbl>    <dbl>       <dbl>    <dbl>     <dbl>
#> 1 INDEL ALL           8330     7655      675       15820      691      7319
#> 2 INDEL PASS          8330     7547      783       13324      397      5221
#> 3 SNP   ALL          48280    46396     1884       61177      342     14416
#> 4 SNP   PASS         48280    45606     2674       52699      152      6921
#> 5 INDEL ALL           8330     7629      701       15558      660      7119
#> 6 INDEL PASS          8330     7546      784       13362      394      5270
#> # … with 10 more variables: FP.gt <dbl>, METRIC.Recall <dbl>,
#> #   METRIC.Precision <dbl>, METRIC.Frac_NA <dbl>, METRIC.F1_Score <dbl>,
#> #   TRUTH.TOTAL.TiTv_ratio <dbl>, QUERY.TOTAL.TiTv_ratio <dbl>,
#> #   TRUTH.TOTAL.het_hom_ratio <dbl>, QUERY.TOTAL.het_hom_ratio <dbl>,
#> #   happy_prefix <chr>
# see the extract_results documentation for a list of possible values for the table argument

Mar Gonzalez-Porta

2019-07-12

Set up

Loading and querying individual hap.py results

Aggregating results from multiple samples

Contents