Mapping_QC.Rmd
We start the workflow by loading Repsc and the human hg38 BSgenome object into our R environment:
bams <- grep('deduplicated', dir('~/davidbr/proj/epitherapy/data/h1299/10x/dacsb/aligned/chunked_bams/', pattern = '.bam$', full.names = TRUE), value = TRUE)
tes <- importRMSK('~/tgdata/src/Repsc/inst/extdata/hg38.fa.out.gz', curate = TRUE)
mutateReads(BSgenome = Hsapiens,
reads = bams,
paired = TRUE,
tes = tes,
n_reads = 1e6,
outdir = '~/davidbr/proj/epitherapy/data/h1299/10x/dacsb/fastqs/',
error_rate = c(0, 0.25, 0.5, 1, 2, 4, 8))
r1 <- dir('~/davidbr/proj/epitherapy/data/h1299/10x/dacsb/fastqs/', pattern = 'perc_error_R1', full.names = TRUE)
r2 <- dir('~/davidbr/proj/epitherapy/data/h1299/10x/dacsb/fastqs/', pattern = 'perc_error_R2', full.names = TRUE)
setwd('~/davidbr/proj/epitherapy/data/h1299/10x/dacsb/aligned/')
cmds <- list()
for (i in 1:length(r1))
{
cmds[[i]] <-
glue("system('/net/mraid14/export/data/tools/star/STAR-2.5.1b/source/STAR --runThreadN 10 --genomeDir /net/mraid14/export/data/users/davidbr/tools/cellranger/references/refdata-cellranger-GRCh38-1.2.0/star/ --outSAMtype BAM SortedByCoordinate --outFilterMultimapScoreRange 2 --outFileNamePrefix {gsub('.fastq', '.', basename(r1[i]))} --outSAMunmapped Within --readFilesIn {r1[i]} {r2[i]}')")
}
# create new empty env and fill with relevant
empty <- new.env(parent = emptyenv())
# distribute, compute and combine
res <-
gcluster.run3(command_list = cmds,
max.jobs = 50,
envir = empty,
io_saturation = FALSE)
bams <- grep('error', dir('~/davidbr/proj/epitherapy/data/h1299/10x/dacsb/aligned/', pattern = 'bam$', full.names = TRUE), value = TRUE)
reads <- importBAM(bams,
paired = TRUE,
mate = 'first',
anchor = 'fiveprime',
multi = FALSE,
what = 'qname',
barcode = c('0.25', '0.5', '0', '1', '2', '4', '8'))
# add overlapping TE locus
reads_anno <- plyranges::join_overlap_left_directed(reads, tes)
reads_dt <- reads_anno %>% as.data.table()
p_n_reads <- reads_dt %>% count(barcode) %>% ggplot(aes(barcode, n)) + geom_bar(stat = ‘identity’)
fam_counts <- reads_dt[which(barcode == 0), ] %>% count(name)
bla = reads_dt[, same := abs(start - start[barcode == 0]) < 10, by = ‘qname’ ][, .(perc_cor_map = sum(same, na.rm = TRUE) / length(same) * 100, repclass = repclass[barcode == 0]), by = c(‘barcode’, ‘name’)] bla %>% filter(name %in% (fam_counts %>% filter(n > 100) %>% pull(name))) %>% ggplot(aes(x = as.numeric(barcode), y = perc_cor_map, group = name, col = repclass, alpha = 0.25)) + geom_point() + geom_line() + facet_wrap(~repclass) + theme(legend.position = ‘none’) + scale_color_brewer(palette = ‘Set1’) ```
ltr12c <-
reads_anno %>%
as_tibble %>%
filter(name == 'LTR12C') %>%
group_by(id_unique, barcode) %>%
summarise(counts = sum(NH_weight)) %>%
ungroup %>%
spread(barcode, counts, fill = 0)
# plot
p1 <-
ltr12c %>%
ggplot(aes(x = `0`, y = `0.25`)) +
ggrastr::geom_point_rast(size = 2) +
scale_y_log10(limits = c(1, 1e4)) +
scale_x_log10(limits = c(1, 1e4))
# plot
p2 <-
ltr12c %>%
ggplot(aes(x = `0`, y = `0.5`)) +
ggrastr::geom_point_rast(size = 2) +
scale_y_log10(limits = c(1, 1e4)) +
scale_x_log10(limits = c(1, 1e4))
p3 <-
ltr12c %>%
ggplot(aes(x = `0`, y = `1`)) +
ggrastr::geom_point_rast(size = 2) +
scale_y_log10(limits = c(1, 1e4)) +
scale_x_log10(limits = c(1, 1e4))
p4 <-
ltr12c %>%
ggplot(aes(x = `0`, y = `2`)) +
ggrastr::geom_point_rast(size = 2) +
scale_y_log10(limits = c(1, 1e4)) +
scale_x_log10(limits = c(1, 1e4))
p5 <-
ltr12c %>%
ggplot(aes(x = `0`, y = `4`)) +
ggrastr::geom_point_rast(size = 2) +
scale_y_log10(limits = c(1, 1e4)) +
scale_x_log10(limits = c(1, 1e4))
p6 <-
ltr12c %>%
ggplot(aes(x = `0`, y = `8`)) +
ggrastr::geom_point_rast(size = 2) +
scale_y_log10(limits = c(1, 1e4)) +
scale_x_log10(limits = c(1, 1e4))
p_comb <- cowplot::plot_grid(p1, p2, p3, p4, p5, p6)
print(p_comb)