Skip to content

polyA size

zhou-ran edited this page Jan 5, 2022 · 4 revisions

1. Prepare pair mapped bam file

Followed snakemake pipeline in this repo with your own dataset, or using the prepared bam in example_data/example.bam

snakemake -s Snakefile.smk

2. Estimate polyA tail part

Using R (4.0.1) to extract the length of polyA on cDNA.

sub_bams <- GenomicAlignments::readGAlignments(
  'example.bam',
  use.names = T,
  param = Rsamtools::ScanBamParam(
    which = GenomicRanges::GRanges(),
    tag = c("DT"),
    flag = Rsamtools::scanBamFlag(
      isPaired = TRUE,
      isFirstMateRead=TRUE,
      isSupplementaryAlignment = FALSE,
      isSecondaryAlignment = FALSE
    )
  )
)

dt_length <- nchar(mcols(sub_bams)$DT)
plot(density(dt_length[dt_length >= 10]))

polyA_tail

Prepare the la_dis_arr and pmf_la_dis_arr for apamix

empDis <- fitur::fit_empirical(dt_length[dt_length >= 10])

la_dis_arr <- c(10, 30, 50, 70, 90, 110, 130)
pmf_la_dis_arr <- rep(0, length(la_dis_arr))
names(pmf_la_dis_arr) <- c(10, 30, 50, 70, 90, 110, 130)

tmp_val <- unlist(empDis$dempDis(1:130)[c(10, 30, 50, 70, 90, 110, 130)])
pmf_la_dis_arr[names(tmp_val)] <- unname(tmp_val)

print(la_dis_arr)
# [1]  10  30  50  70  90 110 130

pmf_la_dis_arr <- table(cut(dt_length, c(0, la_dis_arr)))
names(pmf_la_dis_arr) <- la_dis_arr

print(pmf_la_dis_arr)
#    10     30     50     70     90    110    130 
# 19586 132252  97910    205     27      2      0 

Then call APA with estimated pmf_la_dis_arr

python SCAPE/main.py apamix \
... \
--la_dis_arr '[10, 30, 50, 70, 90, 110, 130]' \
--pmf_la_dis_arr '[19586,132252,97910,205,27,2,0]'