R version 2.11.1 (2010-05-31) Copyright (C) 2010 The R Foundation for Statistical Computing ISBN 3-900051-07-0 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > # 01 November 2010 > # From Mark Robinson, 18 October 2010 > # Ongoing help from Mark... > > > # Run this script by e.g. > # source(file="/projects/remc_bigdata/Karsan/R/20101206/bammap2counts_DE-H4ac.20101206.R") > # source(file="/projects/remc_bigdata/Karsan/R/20101102/bammap2counts_DE-H4ac.20101102.R") > # puzzle, 31 October 2010: for HS1235 vs HS1238, get 38 reported first (see below) > #Counting in HS1235_ctl > #Counting in HS1238_kd > #Counting successful. > #Comparison of groups: HS1238_kd - HS1235_ctl > > > # source("http://bioconductor.org/biocLite.R") > # old.packages(repos=biocinstallRepos()) > # update.packages(repos=biocinstallRepos(), ask=FALSE) > > #install.packages("Repitools",repos="http://r-forge.r-project.org",type="source") > # biocLite("Rsamtools") 1.2.0 > > # biocLite("BSgenome.Hsapiens.UCSC.hg18") 834 M as a tar.gz > # biocLite("BSgenome.Mmusculus.UCSC.mm8") 747MB > # R CMD REMOVE BSgenome.Hsapiens.UCSC.hg18 > # R CMD INSTALL /projects/remc_bigdata/Data/BSgenome.Hsapiens.UCSC.hg18_1.3.16.tar.gz > > # load packages > #library(IRanges) > #library(Biostrings) > #library(GenomicRanges) > #library(IRanges) > > library(Rsamtools) Loading required package: IRanges Attaching package: 'IRanges' The following object(s) are masked from 'package:base': cbind, Map, mapply, order, paste, pmax, pmax.int, pmin, pmin.int, rbind, rep.int, table Loading required package: GenomicRanges Loading required package: Biostrings > library(Repitools) Loading required package: R.methodsS3 R.methodsS3 v1.2.1 (2010-09-18) successfully loaded. See ?R.methodsS3 for help. Loading required package: BSgenome > library(edgeR) > library(BSgenome.Hsapiens.UCSC.hg18) > > > print("some parameter settings") [1] "some parameter settings" > # need to add a failed-chastity filter and a MAPQ filter > p <- ScanBamParam(what=c("rname", "strand", "pos"), flag=scanBamFlag(isUnmappedQuery=FALSE,isDuplicate=FALSE)) > > # 1000, 750, 500 bp windows > windowSize <- 1000 > #windowSize <- 750 > #windowSize <- 500 > fragSize <- 200 > chrNames <- paste("chr", c(1:22, "X", "Y"), sep = "") > #print(paste("w=",windowSize,sep="") > #print(paste("fragSize=",fragSize,sep="")) > > # file names (here, BAM format) --> list of GRanges > # HS1238 is 50 bp, HS1235 is 75 bp, both are SE reads > # Compare HS1238 then HS1235 > readLen <- c(50,75) # vector to match filenames > filenames <- c("/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam", + "/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam") > # OR compare HS1235 then HS1238 > #readLen <- c(75,50) # vector to match filenames > #filenames <- c("/projects/remc_bigdata/Karsan/HS1235_ctl/maq2sam/HS1235.h.sorted.bam", > # "/projects/remc_bigdata/Karsan/HS1238_kd/maq2sam/HS1238.h.sorted.bam") > > # A multivariate version of sapply. mapply applies FUN to the first elements of each ... argument, > # the second elements, the third elements, and so on. Arguments are recycled if necessary. > #GRanges(seqnames=sb$rname, ranges=IRanges(start=sb$pos,width=v), strand=sb$strand) > #Add 'chr' to chromosome names > print("read in .bam files") [1] "read in .bam files" > gr <- mapply(FUN=function(u,v) { + sb <- scanBam(u, param=p)[[1]] + GRanges(seqnames=paste("chr",sb$rname,sep=""), ranges=IRanges(start=sb$pos,width=readLen), strand=sb$strand) + },filenames,readLen) > > names(gr) <- c("HS1238_kd","HS1235_ctl") > #names(gr) <- c("HS1235_ctl","HS1238_kd") > > # file names (here, MAP format) --> list of GRanges [much slower!] > #fnmap <- c("/home/data/NGS/BGI/27-07-2010/BRE01/BRE01_lib1PE122.uniq.map.gz", > # "/home/data/NGS/BGI/27-07-2010/BRE02/BRE02_lib2PE122.uniq.map.gz") > #grmap <- lapply(fnmap, FUN=function(u) { > # x <- readAligned(u, type="Bowtie") > # GRanges(seqnames=chromosome(x), ranges=IRanges(start=position(x),width=readLen), > # strand=strand(x)) > #}) > #grl <- GRangesList(grmap) # use this for MAP files > > grl <- GRangesList(gr) > > print("create table of windows along the genome") [1] "create table of windows along the genome" > windows <- genomeBlocks(Hsapiens, chrNames, windowSize) > > head(windows) GRanges with 6 ranges and 0 elementMetadata values seqnames ranges strand | | [1] chr1 [ 1, 1000] * | [2] chr1 [1001, 2000] * | [3] chr1 [2001, 3000] * | [4] chr1 [3001, 4000] * | [5] chr1 [4001, 5000] * | [6] chr1 [5001, 6000] * | seqlengths chr1 chr2 chr3 chr4 chr5 chr6 ... chr19 chr20 chr21 chr22 chrX chrY NA NA NA NA NA NA ... NA NA NA NA NA NA > #RangedData with 6 rows and 0 value columns across 24 spaces > # space ranges | > # | > #1 chr1 [ 1, 1000] | > #2 chr1 [1001, 2000] | > #3 chr1 [2001, 3000] | > #4 chr1 [3001, 4000] | > #5 chr1 [4001, 5000] | > #6 chr1 [5001, 6000] | > > > print("create count table after extending reads to be length fragSize") [1] "create count table after extending reads to be length fragSize" > counts <- annotationBlocksCounts(grl, windows, fragSize, verbose=TRUE) Counting in HS1238_kd Counting in HS1235_ctl Counting successful. > # MR: If colSums are 0, we'll have to fix the chromosome names problem. > colSums(counts) HS1238_kd HS1235_ctl 19563732 26082144 > # HS1238_kd HS1235_ctl > # 19563732 26082144 > > #print("filter on the total count for a bin") > k <- rowSums(counts) > 10 > > #print("do an edgeR DE analysis of counts") > f <- calcNormFactors(counts[k,]) > > d <- DGEList(counts=counts[k,], + group=colnames(counts), + lib.size=colSums(counts[k,])*f, + genes=as.data.frame(windows)[k,1:3]) > d <- estimateCommonDisp(d) # in my example, no bio replicates, will get warning Warning message: In estimateCommonDisp(d) : There is no replication. Setting common dispersion to 0. > # Write out a smear plot PDF > #pdf("/projects/remc_bigdata/Karsan/R/20101101/DE-H4ac.1000bp-bins.edgeR.20101101.smearPlot.pdf") > #pdf("/projects/remc_bigdata/Karsan/R/20101101/DE-H4ac.500bp-bins.edgeR.20101101.smearPlot.pdf") > #plotSmear(d) > #grid() > #dev.off() > > #Warning message: > #In estimateCommonDisp(d) : > # There is no replication. Setting common dispersion to 0. > print("do exactTest(d)") [1] "do exactTest(d)" > #de <- exactTest(d,pair=c("HS1238_kd","HS1235_ctl")) # Comparison of groups: HS1235_ctl - HS1238_kd > de <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd")) # Comparison of groups: HS1238_kd - HS1235_ctl - makes pos-FC=1238-enriched Comparison of groups: HS1238_kd - HS1235_ctl > > #> de > #An object of class "deDGEList" > #$table > # logConc logFC p.value > #1 -17.61765 1.02039588 2.954411e-06 > #2 -20.89289 0.24495614 1.000000e+00 > #3 -20.88492 -0.49612556 5.034447e-01 > #4 -19.78170 0.32656991 5.327093e-01 > #5 -20.39289 0.07503114 1.000000e+00 > #1511146 more rows ... > > topTags(de) Comparison of groups: HS1238_kd - HS1235_ctl logConc logFC PValue FDR 2677274 -10.863195 -0.7603573 9.500483e-294 1.435666e-287 121187 -8.015492 -0.2615755 1.802020e-252 1.361563e-246 1294341 -9.974019 -0.4176546 6.519279e-166 3.283871e-160 1606831 -12.850526 -1.0059503 2.006271e-130 7.579445e-125 1604542 -12.880627 -0.9892234 1.805570e-123 5.456979e-118 1293822 -14.726969 -1.6940994 4.573325e-100 1.151831e-94 1719803 -15.587381 2.1549864 9.805623e-95 2.116825e-89 2012668 -18.029993 -4.9018324 1.904948e-89 3.598330e-84 2677275 -10.435734 -0.3546701 6.543402e-88 1.098674e-82 1120720 -11.430976 -0.4916045 1.256872e-84 1.899324e-79 > #> topTags(de) > #Comparison of groups: HS1238_kd - HS1235_ctl > # logConc logFC PValue FDR > #2677274 -10.863195 -0.7603573 9.500483e-294 1.435666e-287 > #121187 -8.015492 -0.2615755 1.802020e-252 1.361563e-246 > #1294341 -9.974019 -0.4176546 6.519279e-166 3.283871e-160 > #1606831 -12.850526 -1.0059503 2.006271e-130 7.579445e-125 > #1604542 -12.880627 -0.9892234 1.805570e-123 5.456979e-118 > #1293822 -14.726969 -1.6940994 4.573325e-100 1.151831e-94 > #1719803 -15.587381 2.1549864 9.805623e-95 2.116825e-89 > #2012668 -18.029993 -4.9018324 1.904948e-89 3.598330e-84 > #2677275 -10.435734 -0.3546701 6.543402e-88 1.098674e-82 > #1120720 -11.430976 -0.4916045 1.256872e-84 1.899324e-79 > > #print("table with genome locations, counts, statistics") > #xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table) > # Mark (31 Oct 2010) - add an B&H adjusted p-val to the output > xx <- cbind(as.data.frame(windows)[k,-5], counts[k,], de$table,adjp=p.adjust(de$table$p.value, method="BH") ) > # Check the table > table(xx$adjp < .05, xx$logFC > 0) FALSE TRUE FALSE 763844 733655 TRUE 6696 6956 > # FALSE TRUE > # FALSE 763844 733655 > # TRUE 6696 6956 > > #w <- xx$p.value < .00001 # these are not correct pvals, so you may want to be harsher than this. > # Get low-adjp records > #w <- xx$adjp < 0.05 > w <- xx$p.value < 0.1 > xxx <- xx[w,] > > print("write.tablem all records") [1] "write.tablem all records" > write.table(xx, "/projects/remc_bigdata/Karsan/R/20101212a/HS1238_vs_1235.bins-w1000.d.edgeR.all.20101212a.txt", sep="\t", row.names=FALSE, quote=FALSE) > #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101212a/HS1238_vs_1235.bins-w750.d.edgeR.all.20101212a.txt", sep="\t", row.names=FALSE, quote=FALSE) > > print("write.table, records with p-val < 0.01") [1] "write.table, records with p-val < 0.01" > write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101212a/HS1238_vs_1235.bins-w1000.d.edgeR.p0.1.20101212a.txt", sep="\t", row.names=FALSE, quote=FALSE) > #write.table(xxx, "/projects/remc_bigdata/Karsan/R/20101212a/HS1238_vs_1235.bins-w750.edgeR.p0.1.20101212a.txt", sep="\t", row.names=FALSE, quote=FALSE) > #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101102/HS1238_vs_1235.w500.edgeR.p0.1.20101102.txt", sep="\t", row.names=FALSE, quote=FALSE) > # with 'adjp' > # 1 2 3 4 5 6 7 8 9 10 > #seqnames start end width HS1238_kd HS1235_ctl logConc logFC p.value adjp > #chr1 1 1000 1000 114 74 -17.6176485910214 1.02039588397656 2.95441057191974e-06 0.00171401942252625 > > # awk '{ if($10 <= 0.05) { print $0 > "HS1238-vs-1235.w1000.edgeR.adjp-le-0.05.txt" } }' HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt > #[grobertson@xhost09 20101031]$ wc -l HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt > # 1,511,152 HS1238_vs_1235.w1000.edgeR-pairs-adjp.txt > #[grobertson@xhost09 20101031]$ wc -l HS1238-vs-1235.w1000.edgeR.adjp-le-0.05.txt > # 13,652 HS1238-vs-1235.w1000.edgeR.adjp-le-0.05 > > > > #write.table(xx, "/projects/remc_bigdata/Karsan/R/20101031/HS1235_vs_1238.w1000.edgeR.txt", sep="\t", row.names=FALSE, quote=FALSE) > #seqnames start end width HS1238_kd HS1235_ctl logConc logFC p.value > #chr1 1 1000 1000 114 74 -17.6176485910214 1.02039588397656 2.95441057191974e-06 > #chr1 1001 2000 1000 9 10 -20.8928937327534 0.244956141995718 0.999999999999992 > #chr1 2001 3000 1000 7 13 -20.8849229608189 -0.49612556064272 0.503444671630861 > #chr1 3001 4000 1000 20 21 -19.7816975220852 0.32656990754937 0.532709255236125 > #chr1 4001 5000 1000 12 15 -20.3928937327534 0.0750311405534056 0.999999999999987 > #chr1 5001 6000 1000 10 12 -20.6853749831140 0.133924829606974 0.999999999999999 > #chr1 6001 7000 1000 14 16 -20.2351428198894 0.204314157498372 0.85553555190565 > #chr1 7001 8000 1000 14 22 -20.0054270105708 -0.255117461138925 0.617719317029688 > #chr1 8001 9000 1000 15 11 -20.4556591737953 0.84441821241199 0.122078120708466 > #chr1 9001 10000 1000 11 12 -20.616623221239 0.271428353356909 0.677639484405523 > #chr1 10001 11000 1000 15 15 -20.2319296853097 0.396959235440768 0.47312965989113 > > # find differential expression de<-exactTest(d) > # highlighting the top 500 most DE tags de.tags <- rownames(topTags(de, n=500)$table) > #pdf(file="/projects/remc_bigdata/Karsan/R/20101022/HS1238_vs_1235.w750.edgeR.plotSmear.pdf") > #plotSmear(d) > #de.tags <- rownames(topTags(de,n=500)$table) > #plotSmear(d,de.tags=de.tags) > #dev.off() > > # 21 Oct 2010 - For w=1000, uses a max of ~5.6G for Iva's data. > # "/projects/remc_bigdata/Karsan/R/20101022/bammap2counts_DE.H4ac.R" > > #awk '{if($9<=0.000001) {print $0 > "HS1235_vs_1238.w500.edgeR.p1e-6.txt"}}' HS1235_vs_1238.w500.edgeR.txt > > ############### > #print("now do the opposite DE calculation") > #de1 <- exactTest(d,pair=c("HS1235_ctl","HS1238_kd")) # Comparison of groups: HS1238_kd - HS1235_ctl > #topTags(de) > #xx1 <- cbind(as.data.frame(windows)[k,-5], counts[k,], de1$table) > #w1 <- xx$p.value < 0.1 > #xxx1 <- xx1[w,] > #write.table(xx1, "/projects/remc_bigdata/Karsan/R/20101128_TSS/HS1235_vs_1238.w1000.edgeR.p0.1.20101102.txt", sep="\t", row.names=FALSE, quote=FALSE) > #write.table(xx1, "/projects/remc_bigdata/Karsan/R/20101102/HS1235_vs_1238.w500.edgeR.p0.1.20101102.txt", sep="\t", row.names=FALSE, quote=FALSE) > > > ############################################################################## > ### 31 October 2010, Mark Robinson ########################################### > print("-------------------------------------------") [1] "-------------------------------------------" > print("Now do differentially enriched TSS regions") [1] "Now do differentially enriched TSS regions" > library(rtracklayer) Loading required package: RCurl Loading required package: bitops > session <- browserSession("UCSC") > genome(session) <- "hg18" > #trackNames(session) ## list the track names > q <- ucscTableQuery(session, "knownGene") > knownGene <- getTable(q) > > > ### END TSS-merge code from Mark Robinson - 12 Dec 2010 ######################## > # merge TSSs with identical starts > knownGene$position <- ifelse(knownGene$strand=="+", knownGene$txStart, knownGene$txEnd) > key <- paste(knownGene$chrom, knownGene$position, sep=":") > ukey <- unique(key) > m <- match(ukey, key) > mkg <- knownGene[m,c("chrom","strand","txStart","txEnd","name","position")] # mkg = merged known genes > > # create merged IDs > rownames(mkg) <- ukey > ids <- split(knownGene$name, key)