R version 2.11.1 (2010-05-31)
Copyright (C) 2010 The R Foundation for Statistical Computing
ISBN 3-900051-07-0

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # 21 October 2010
> # From Mark Robinson, 18 October 2010
> # Ongoing help from Mark...
> 
> # Run this script by e.g.
> # source(file="/projects/remc_bigdata/Karsan/R/20101022/bammap2counts_DE.H4ac.R")
> 
> # source("http://bioconductor.org/biocLite.R")
> # old.packages(repos=biocinstallRepos())
> # update.packages(repos=biocinstallRepos(), ask=FALSE)
> 
> #install.packages("Repitools",repos="http://r-forge.r-project.org",type="source")
> # biocLite("Rsamtools") 1.2.0
> 
> # biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz
> # biocLite("BSgenome.Mmusculus.UCSC.mm8")  747MB
> # R CMD REMOVE BSgenome.Hsapiens.UCSC.hg18
> # R CMD INSTALL /projects/remc_bigdata/Data/BSgenome.Hsapiens.UCSC.hg18_1.3.16.tar.gz
> 
> # load packages
> #library(IRanges)
> #library(Biostrings)
> #library(GenomicRanges)
> #library(IRanges)
> 
> library(Rsamtools)
Loading required package: IRanges

Attaching package: 'IRanges'

The following object(s) are masked from 'package:base':

    cbind, Map, mapply, order, paste, pmax, pmax.int, pmin, pmin.int,
    rbind, rep.int, table

Loading required package: GenomicRanges
Loading required package: Biostrings
> library(Repitools)
Loading required package: R.methodsS3
R.methodsS3 v1.2.1 (2010-09-18) successfully loaded. See ?R.methodsS3 for help.
Loading required package: BSgenome
> library(edgeR)
> library(BSgenome.Hsapiens.UCSC.hg18)
> 
> 
> # some parameter settings
> # need to add a failed-chastity filter and a MAPQ filter
> p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE))
> 
> # 1000, 750, 500 bp windows
> windowSize <- 750
> fragSize <- 200
> chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "")
> 
> # file names (here, BAM format) --> list of GRanges
> # HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads
> # Compare HS1238 then HS1235
> #readLen <- c(50,75)  # vector to match filenames
> #filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam",
> #               "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam")
> # OR compare HS1235 then HS1238
> readLen <- c(75,50)  # vector to match filenames
> filenames <- c("/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam",
+                "/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam")
> 
> # A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, 
> # the second elements, the third elements, and so on. Arguments are recycled if necessary.
> #GRanges(seqnames=sb$rname, ranges=IRanges(start=sb$pos,width=v), strand=sb$strand)
> gr <- mapply(FUN=function(u,v) {
+   sb <- scanBam(u, param=p)[[1]]
+   GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand)
+ },filenames,readLen)
> 
> #names(gr) <- c("HS1238_kd","HS1235_ctl")
> names(gr) <- c("HS1235_ctl","HS1238_kd")
> 
> # file names (here, MAP format) --> list of GRanges [much slower!]
> #fnmap <- c("/home/data/NGS/BGI/27-07-2010/BRE01/BRE01_lib1PE122.uniq.map.gz",
> #           "/home/data/NGS/BGI/27-07-2010/BRE02/BRE02_lib2PE122.uniq.map.gz")
> #grmap <- lapply(fnmap, FUN=function(u) {
> #  x <- readAligned(u, type="Bowtie")
> #  GRanges(seqnames=chromosome(x), ranges=IRanges(start=position(x),width=readLen), 
> #          strand=strand(x))
> #})
> #grl <- GRangesList(grmap)  # use this for MAP files
> 
> grl <- GRangesList(gr)
> 
> # create "table" of windows along the genome
> windows <- genomeBlocks(Hsapiens, chrNames, windowSize)
> 
> head(windows)
GRanges with 6 ranges and 0 elementMetadata values
    seqnames       ranges strand |
       <Rle>    <IRanges>  <Rle> |
[1]     chr1 [   1,  750]      * |
[2]     chr1 [ 751, 1500]      * |
[3]     chr1 [1501, 2250]      * |
[4]     chr1 [2251, 3000]      * |
[5]     chr1 [3001, 3750]      * |
[6]     chr1 [3751, 4500]      * |

seqlengths
  chr1  chr2  chr3  chr4  chr5  chr6 ... chr19 chr20 chr21 chr22  chrX  chrY
    NA    NA    NA    NA    NA    NA ...    NA    NA    NA    NA    NA    NA
> #RangedData with 6 rows and 0 value columns across 24 spaces
> #        space       ranges |
> #  <character>    <IRanges> |
> #1        chr1 [   1, 1000] |
> #2        chr1 [1001, 2000] |
> #3        chr1 [2001, 3000] |
> #4        chr1 [3001, 4000] |
> #5        chr1 [4001, 5000] |
> #6        chr1 [5001, 6000] |
> 
> 
> # create count table (after extending reads to be length 'fragSize')
> counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE)
Counting in HS1235_ctl 
Counting in HS1238_kd 
Counting successful.
> # MR: If colSums are 0, we'll have to fix the chromosome names problem.
> colSums(counts)
HS1235_ctl  HS1238_kd 
  27515375   20638484 
> # HS1238_kd HS1235_ctl 
> #  19563732   26082144
> 
> k <- rowSums(counts) > 10  # filter on the total count for a bin
> 
> # do an edgeR DE analysis of counts
> f <- calcNormFactors(counts[k,])
> 
> d <- DGEList(counts=counts[k,], 
+ 	         group=colnames(counts), 
+              lib.size=colSums(counts[k,])*f,
+              genes=as.data.frame(windows)[k,1:3])
> d <- estimateCommonDisp(d)  # in my example, no bio replicates, will get warning
Warning message:
In estimateCommonDisp(d) :
  There is no replication.  Setting common dispersion to 0.
> #Warning message:
> #In estimateCommonDisp(d) :
> #  There is no replication.  Setting common dispersion to 0.
> 
> de <- exactTest(d)  # 4.6GB
Comparison of groups:  HS1238_kd - HS1235_ctl 
> #> de
> 
> proc.time()
   user  system elapsed 
713.165  25.815 745.258