## ----Dir, echo=TRUE, eval=TRUE-------------------------------------------
# Resolve the "bedfiles" directory of the installed EpigeneticsCSAMA package;
# all BED files read below are looked up relative to this path.
library(EpigeneticsCSAMA)
dataDirectory <- system.file("bedfiles", package = "EpigeneticsCSAMA")

## ----DirShow-------------------------------------------------------------
# Show the resolved path.
dataDirectory

## ----RepresentReadsAsGRanges,eval=TRUE, results='hide'-------------------
library(GenomicRanges)
library(rtracklayer)
library(IRanges)

# Import the three chr6 BED files (input sample and the two H3K27ac
# replicates, going by the file names) from the package data directory.
input <- import.bed(
  file.path(dataDirectory, "ES_input_filtered_ucsc_chr6.bed")
)
rep1 <- import.bed(
  file.path(dataDirectory, "H3K27ac_rep1_filtered_ucsc_chr6.bed")
)
rep2 <- import.bed(
  file.path(dataDirectory, "H3K27ac_rep2_filtered_ucsc_chr6.bed")
)

## ----dataStr-------------------------------------------------------------
# Display the imported object for the first replicate.
rep1

## ----ReadNumber----------------------------------------------------------
# Number of imported ranges per sample.
length(input)
length(rep1)
length(rep2)

## ----ReadExtension_Definition, results='hide'----------------------------
library(chipseq)

# Extend reads to the estimated fragment length of their library.
#
# reads: ranges object holding the aligned reads; it is handed to
#        estimate.mean.fraglen() and resize() (presumably the GRanges
#        returned by import.bed -- confirm against the caller).
#
# Prints the fragment size that is applied and returns the reads resized to
# that width and passed through trim().
prepareChIPseq <- function(reads) {
    # Round once and reuse the value, so that the size reported below is
    # exactly the width applied by resize() rather than depending on how
    # resize() coerces a fractional width.
    frag.len <- as.integer(round(median(estimate.mean.fraglen(reads))))
    # End the message with a newline so the output of consecutive calls does
    # not run together on a single line.
    cat(paste0('Median fragment size for this library is ', frag.len, '\n'))
    reads.extended <- resize(reads, width = frag.len)
    trim(reads.extended)
}

## ----ReadExtension,eval=TRUE---------------------------------------------
# Replace each read set by its fragment-length-extended version.
input <- prepareChIPseq(input)
rep1 <- prepareChIPseq(rep1)
rep2 <- prepareChIPseq(rep2)

## ----Rep1Inspect---------------------------------------------------------
rep1

## ----GetBins_preps-------------------------------------------------------
# Load the `si` dataset and display it; it is indexed by chromosome name
# below and queried with seqlengths() later in the script.
data(si)
si

## ----GetBins,eval=TRUE---------------------------------------------------
# Tile chr6 into bins of `binsize` bp; the last tile is cut at the
# chromosome end.
binsize <- 200
bins <- tileGenome(
  si["chr6"],
  tilewidth = binsize,
  cut.last.tile.in.chrom = TRUE
)
bins

## ----Binning_function,eval=TRUE------------------------------------------
# Count, for every bin, the number of reads overlapping it.
#
# reads: ranges object with the reads (subject of countOverlaps()).
# bins:  ranges object with the bins (query of countOverlaps()).
#
# Returns `bins` with the per-bin counts stored in its `score` metadata
# column.
BinChIPseq <- function(reads, bins) {
  overlap.counts <- countOverlaps(bins, reads)
  mcols(bins)$score <- overlap.counts
  bins
}

## ----Binning, eval=TRUE--------------------------------------------------
# Per-bin read counts for each sample, on the shared set of bins.
input.200bins <- BinChIPseq(input, bins)
rep1.200bins <- BinChIPseq(rep1, bins)
rep2.200bins <- BinChIPseq(rep2, bins)

rep1.200bins

## ----simplePlot,fig.width=3.5, fig.height=3.5----------------------------
# Line plot of the replicate 1 counts for bins 200000 to 201000; the x axis
# shows the bin index.
plot(
  200000:201000,
  rep1.200bins$score[200000:201000],
  type = "l",
  xlab = "chr6",
  ylab = "counts per bin"
)

## ----ExportbedGraphFiles-------------------------------------------------
# Write the binned counts as bedGraph files into the working directory.
export(input.200bins, con = "input_chr6.bedGraph", format = "bedGraph")
export(rep1.200bins, con = "H3K27ac_rep1_chr6.bedGraph", format = "bedGraph")
export(rep2.200bins, con = "H3K27ac_rep2_chr6.bedGraph", format = "bedGraph")

## ----Visualisation_Prep_libs, results='hide'-----------------------------
library(Gviz)

## ----BM------------------------------------------------------------------
# Load and display the `bm` dataset; it is plotted as a track below.
data(bm)
bm

## ----AT------------------------------------------------------------------
# Genome axis track with default settings.
AT <- GenomeAxisTrack()

## ----Visualisation_Gviz, fig.width=5, fig.height=3, dpi=200--------------
plotTracks(
  c(bm, AT),
  from = 122530000, to = 122900000,
  transcriptAnnotation = "symbol", window = "auto",
  cex.title = 1, fontsize = 10
)

## ----dataTrackGet--------------------------------------------------------
# One data track per sample, all on the same fixed y range (0 to 150) so the
# panels are directly comparable.
input.track <- DataTrack(
  input.200bins,
  strand = "*", genome = "mm9",
  col.histogram = "gray", fill.histogram = "black",
  name = "Input", col.axis = "black",
  cex.axis = 0.4, ylim = c(0, 150)
)

rep1.track <- DataTrack(
  rep1.200bins,
  strand = "*", genome = "mm9",
  col.histogram = "steelblue", fill.histogram = "black",
  name = "Rep. 1", col.axis = "steelblue",
  cex.axis = 0.4, ylim = c(0, 150)
)

rep2.track <- DataTrack(
  rep2.200bins,
  strand = "*", genome = "mm9",
  col.histogram = "steelblue", fill.histogram = "black",
  name = "Rep. 2", col.axis = "steelblue",
  cex.axis = 0.4, ylim = c(0, 150)
)

## ----dataTrackPlot, fig.width=4, fig.height=4, dpi=200-------------------
# Same genomic window as the annotation-only plot above, now with the three
# count tracks drawn as histograms.
plotTracks(
  c(input.track, rep1.track, rep2.track, bm, AT),
  from = 122530000, to = 122900000,
  transcriptAnnotation = "symbol", window = "auto",
  type = "histogram", cex.title = 0.7, fontsize = 10
)

## ----MACSreadingtoR------------------------------------------------------
# Import the chr6 peak BED files of both replicates from the package data
# directory.
peaks.rep1 <- import.bed(file.path(dataDirectory, "Rep1_peaks_ucsc_chr6.bed"))
peaks.rep2 <- import.bed(file.path(dataDirectory, "Rep2_peaks_ucsc_chr6.bed"))

## ----PeaksInBrowser_preps------------------------------------------------
# Annotation tracks showing the peaks of each replicate as boxes.
peaks1.track <- AnnotationTrack(
  peaks.rep1,
  genome = "mm9", name = "Peaks Rep. 1",
  chromosome = "chr6",
  shape = "box", fill = "blue3", size = 2
)
peaks2.track <- AnnotationTrack(
  peaks.rep2,
  genome = "mm9", name = "Peaks Rep. 2",
  chromosome = "chr6",
  shape = "box", fill = "blue3", size = 2
)

## ----PeaksInBrowserPlot_nanog, fig.width=4, fig.height=3, dpi=200--------
# Count tracks interleaved with the corresponding peak tracks.
plotTracks(
  c(input.track, rep1.track, peaks1.track,
    rep2.track, peaks2.track, bm, AT),
  from = 122630000, to = 122700000,
  transcriptAnnotation = "symbol", window = "auto",
  type = "histogram", cex.title = 0.7, fontsize = 10
)

## ----findOverlap---------------------------------------------------------
# Pairwise overlaps between the peaks of replicate 1 (query) and
# replicate 2 (subject).
ovlp <- findOverlaps(peaks.rep1, peaks.rep2)
ovlp

## ----nbrCommonPeaks------------------------------------------------------
# Number of distinct peaks involved in an overlap on each side; the smaller
# of the two is used as the size of the intersection.
ov <- min(vapply(
  list(queryHits(ovlp), subjectHits(ovlp)),
  function(hits) length(unique(hits)),
  integer(1)
))

## ----VennDiagram1, fig.width=3.5, fig.height=3.5, dpi=200----------------
library(VennDiagram)

draw.pairwise.venn(
  area1 = length(peaks.rep1),
  area2 = length(peaks.rep2),
  cross.area = ov,
  category = c("rep1", "rep2"),
  fill = c("steelblue", "blue3"),
  cat.cex = 0.7
)

## ----EnrichedRegionsIsolation, fig.width=4, fig.height=4, dpi=200--------
# Keep the replicate 1 peaks that overlap at least one replicate 2 peak.
enriched.regions <- subsetByOverlaps(peaks.rep1, peaks.rep2)

enr.reg.track <- AnnotationTrack(
  enriched.regions,
  genome = "mm9", name = "Enriched regions",
  chromosome = "chr6",
  shape = "box", fill = "green3", size = 2
)

# Same window as the peak plot, with the enriched regions added as an extra
# track.
plotTracks(
  c(input.track, rep1.track, peaks1.track,
    rep2.track, peaks2.track, enr.reg.track,
    bm, AT),
  from = 122630000, to = 122700000,
  transcriptAnnotation = "symbol", window = "auto",
  type = "histogram", cex.title = 0.5, fontsize = 10
)


## ----TSS-----------------------------------------------------------------
# Load the `egs` gene table and show its first rows.
data(egs)
head(egs)

## ----TSSfinding----------------------------------------------------------
# The TSS is the start position for genes whose strand is "1" and the end
# position for all others.
egs$TSS <- ifelse(
  egs$strand == "1",
  egs$start_position,
  egs$end_position
)
head(egs)

## ----Promoter------------------------------------------------------------
# Unstranded promoter windows spanning 200 bp on either side of each TSS,
# labelled with the gene identifier.
promoter_regions <- GRanges(
  seqnames = Rle(paste0("chr", egs$chromosome_name)),
  ranges = IRanges(
    start = egs$TSS - 200,
    end = egs$TSS + 200
  ),
  strand = Rle(rep("*", nrow(egs))),
  gene = egs$external_gene_id
)
promoter_regions

## ------------------------------------------------------------------------
# Promoters (subject) hit by at least one enriched region (query).
ovlp2 <- findOverlaps(enriched.regions, promoter_regions)

cat(sprintf(
  "%d of %d promoters are overlapped by an enriched region.",
  length(unique(subjectHits(ovlp2))),
  length(promoter_regions)
))

## ------------------------------------------------------------------------
# Enriched regions (subject) hit by at least one promoter (query).
ovlp2b <- findOverlaps(promoter_regions, enriched.regions)

cat(sprintf(
  "%d of %d enriched regions overlap a promoter.",
  length(unique(subjectHits(ovlp2b))),
  length(enriched.regions)
))

## ------------------------------------------------------------------------
# Total number of bases covered by promoter windows; reduce() merges
# overlapping windows so that shared bases are counted once.
promotor_total_length <- sum(width(reduce(promoter_regions)))
promotor_total_length

## ------------------------------------------------------------------------
promotor_fraction_of_chromosome_6 <-
  promotor_total_length / seqlengths(si)["chr6"]

## ------------------------------------------------------------------------
# Binomial test of the number of promoter-overlapping enriched regions
# against the fraction of chr6 covered by promoter windows.
binom.test(
  x = length(unique(subjectHits(ovlp2b))),
  n = length(enriched.regions),
  p = promotor_fraction_of_chromosome_6
)

## ----promoterRegionTiling,eval=TRUE--------------------------------------
# Rows of `egs` whose promoter window overlaps an enriched region; ovlp2b
# already holds the promoter-versus-enriched-region overlaps.
pos.TSS <- egs[unique(queryHits(ovlp2b)), ]
pos.TSS[1:3, ]

## ----Tiles---------------------------------------------------------------
# Start coordinates of 20 tiles per TSS, one column per row of pos.TSS.
# Offsets run from -1000 to +900 for genes whose strand is "1" and in the
# reverse order otherwise, so that tile j sits at the same position relative
# to the direction of transcription for every gene.
#
# vapply() with seq_len() replaces sapply(1:nrow(...)): the result is always
# a 20-row numeric matrix, and an empty pos.TSS no longer makes the loop
# visit the non-existent rows 1 and 0.
tiles <- vapply(
  seq_len(nrow(pos.TSS)),
  function(i) {
    if (pos.TSS$strand[i] == "1") {
      pos.TSS$TSS[i] + seq(-1000, 900, length.out = 20)
    } else {
      pos.TSS$TSS[i] + seq(900, -1000, length.out = 20)
    }
  },
  numeric(20)
)

# as.vector() flattens the matrix column by column, i.e. gene by gene, which
# matches the `each = 20` expansion of the gene-level columns. The 1..20
# suffix is repeated explicitly instead of relying on paste() recycling.
tiles <- GRanges(
  tilename = paste(
    rep(pos.TSS$ensembl_gene_id, each = 20),
    rep(seq_len(20), times = nrow(pos.TSS)),
    sep = "_"
  ),
  seqnames = Rle(rep(paste0('chr', pos.TSS$chromosome_name), each = 20)),
  ranges = IRanges(start = as.vector(tiles),
                   width = 100),
  strand = Rle(rep("*", length(as.vector(tiles)))),
  seqinfo = si
)

tiles

## ----AverProf_I,eval=TRUE------------------------------------------------
# Reads overlapping each tile, summed over both replicates.
H3K27ac.p <- countOverlaps(tiles, rep1) + countOverlaps(tiles, rep2)

# Reshape to one row per TSS and one column per tile; the tiles are stored
# gene by gene, hence the row-wise fill.
H3K27ac.p.matrix <- matrix(
  H3K27ac.p,
  nrow = nrow(pos.TSS),
  ncol = 20,
  byrow = TRUE
)

## ----Aver_plot, fig.width=8, fig.height=10, dpi=200, dev.args = list(pointsize=11)----
# Three-panel figure: colour key (top left), per-promoter heatmap (bottom
# left) and mean profile across promoters (bottom right).
colors <- colorRampPalette(c('white','red','gray','black'))(100)

# 2 x 2 layout filled column by column: panel 1 above panel 2 on the left,
# an empty cell above panel 3 on the right. One width per column and one
# height per row are given, and `respect` is passed by name.
layout(mat = matrix(c(1, 2, 0, 3), 2, 2),
       widths = c(2, 2),
       heights = c(0.5, 5),
       respect = TRUE)

par(mar = c(4, 4, 1.5, 1))

# Panel 1: colour key. Its x axis runs from 0 to the largest count in the
# matrix, so it is labelled as a read count rather than a distance.
image(seq(0, max(H3K27ac.p.matrix), length.out = 100), 1,
      matrix(seq(0, max(H3K27ac.p.matrix), length.out = 100), 100, 1),
      col = colors,
      xlab = 'Read count', ylab = '',
      main = 'Number of reads', yaxt = 'n',
      lwd = 3, axes = TRUE)
box(col = 'black', lwd = 2)

# Panel 2: heatmap with one row per promoter, rows ordered by their total
# read count.
image(x = seq(-1000, 1000, length.out = 20),
      y = 1:nrow(H3K27ac.p.matrix),
      z = t(H3K27ac.p.matrix[order(rowSums(H3K27ac.p.matrix)), ]),
      col = colors,
      xlab = 'Distance from TSS (bp)',
      ylab = 'Promoters', lwd = 2)
box(col = 'black', lwd = 2)
abline(v = 0, lwd = 1, col = 'gray')

# Panel 3: mean count per tile across all promoters. `type` is spelled out
# instead of relying on partial matching of `ty`.
plot(x = seq(-1000, 1000, length.out = 20),
     y = colMeans(H3K27ac.p.matrix),
     type = 'b', pch = 19,
     col = 'red4', lwd = 2,
     ylab = 'Mean tag count',
     xlab = 'Distance from TSS (bp)')
abline(h = seq(1, 100, by = 5),
       v = seq(-1000, 1000, length.out = 20),
       lwd = 0.25, col = 'gray')
box(col = 'black', lwd = 2)


## ------------------------------------------------------------------------
sessionInfo()

## ----DataDownload, echo=TRUE,eval=FALSE----------------------------------
#  wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR066/SRR066787/SRR066787.fastq.gz .
#  wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR066/SRR066766/SRR066766.fastq.gz .
#  wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR066/SRR066767/SRR066767.fastq.gz .

## ----ReadQuality_preps, echo=TRUE,eval=FALSE-----------------------------
#  fls = list.files(dataDirectory, ".fastq$", full=TRUE)
#  names(fls) = sub(".fastq", "", basename(fls))

## ----QA, echo=TRUE,eval=FALSE--------------------------------------------
#  library(ShortRead)
#  qas = lapply(seq_along(fls),
#                function(i, fls) qa(readFastq(fls[i]), names(fls)[i]),
#                fls)
#  qa = do.call(rbind, qas)
#  rpt = report(qa,dest = 'QA_report.html')

## ----ReadProcessing, echo=TRUE, eval=FALSE-------------------------------
#  gunzip SRR066787.fastq.gz
#  gunzip SRR066766.fastq.gz
#  gunzip SRR066767.fastq.gz

## ----Alignment, echo=TRUE, eval=FALSE------------------------------------
#  bowtie2 -p 8 -q NCBIM37.67 SRR066787.fastq -S ES_input.sam
#  bowtie2 -p 8 -q NCBIM37.67 SRR066766.fastq -S H3K27ac_rep1.sam
#  bowtie2 -p 8 -q NCBIM37.67 SRR066767.fastq -S H3K27ac_rep2.sam

## ----BestQualityRead, echo=TRUE, eval=FALSE------------------------------
#  samtools view -bS -q 40 ES_input.sam > ES_input_bestAlignment.bam
#  samtools view -bS -q 40 H3K27ac_rep1.sam > H3K27ac_rep1_bestAlignment.bam
#  samtools view -bS -q 40 H3K27ac_rep2.sam > H3K27ac_rep2_bestAlignment.bam

## ----PCRDuplRemoval, echo=TRUE, eval=FALSE-------------------------------
#  samtools rmdup -s  ES_input_bestAlignment.bam ES_input_filtered.bam
#  samtools rmdup -s  H3K27ac_rep1_bestAlignment.bam H3K27ac_rep1_filtered.bam
#  samtools rmdup -s  H3K27ac_rep2_bestAlignment.bam H3K27ac_rep2_filtered.bam

## ----BAMTOBED, echo=TRUE, eval=FALSE-------------------------------------
#  bedtools bamtobed -i ES_input_filtered.bam > ES_input_filtered.bed
#  bedtools bamtobed -i H3K27ac_rep1_filtered.bam > H3K27ac_rep1_filtered.bed
#  bedtools bamtobed -i H3K27ac_rep2_filtered.bam > H3K27ac_rep2_filtered.bed

## ----Prefixes, echo=TRUE, eval=FALSE-------------------------------------
#  awk '$0="chr"$0' ES_input_filtered.bed > ES_input_filtered_ucsc.bed
#  awk '$0="chr"$0' H3K27ac_rep1_filtered.bed > H3K27ac_rep1_filtered_ucsc.bed
#  awk '$0="chr"$0' H3K27ac_rep2_filtered.bed > H3K27ac_rep2_filtered_ucsc.bed

## ----bedSubsetting, echo=TRUE, eval=FALSE--------------------------------
#  awk '{if($1=="chr6") print $0}' ES_input_filtered_ucsc.bed \
#    > ES_input_filtered_ucsc_chr6.bed
#  awk '{if($1=="chr6") print $0}' H3K27ac_rep1_filtered_ucsc.bed \
#    > H3K27ac_rep1_filtered_ucsc_chr6.bed
#  awk '{if($1=="chr6") print $0}' H3K27ac_rep2_filtered_ucsc.bed \
#    > H3K27ac_rep2_filtered_ucsc_chr6.bed

## ----Getmm9SequenceInfo, echo=TRUE,eval=FALSE----------------------------
#  library(BSgenome.Mmusculus.UCSC.mm9)
#  genome = BSgenome.Mmusculus.UCSC.mm9
#  si = seqinfo(genome)
#  si = si[ paste0('chr', c(1:19, 'X', 'Y'))]

## ----Visualisation_Prep_mart, eval=FALSE---------------------------------
#  library(biomaRt)
#  mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",
#                 dataset = "mmusculus_gene_ensembl",
#                 host="may2012.archive.ensembl.org")
#  fm = Gviz:::.getBMFeatureMap()
#  fm["symbol"] = "external_gene_id"

## ----Visualisation_Prep_region,eval=FALSE--------------------------------
#  bm = BiomartGeneRegionTrack(chromosome='chr6', genome="mm9",
#                               start=122530000, end = 122900000,
#                               biomart=mart,filter=list("with_ox_refseq_mrna"=TRUE),
#                               size=4, name="RefSeq", utr5="red3", utr3="red3",
#                               protein_coding="black", col.line=NULL, cex=7,
#                               collapseTranscripts="longest",
#                               featureMap=fm)

## ----macs,eval=FALSE-----------------------------------------------------
#  macs14 -t H3K27ac_rep1_filtered.bed -c ES_input_filtered.bed -f BED -g mm --nomodel -n Rep1
#  macs14 -t H3K27ac_rep2_filtered.bed -c ES_input_filtered.bed -f BED -g mm --nomodel -n Rep2
#  awk '$0="chr"$0' Rep1_peaks.bed > Rep1_peaks_ucsc.bed
#  awk '$0="chr"$0' Rep2_peaks.bed > Rep2_peaks_ucsc.bed
#  awk '{if($1=="chr6") print $0}' Rep1_peaks_ucsc.bed > Rep1_peaks_ucsc_chr6.bed
#  awk '{if($1=="chr6") print $0}' Rep2_peaks_ucsc.bed > Rep2_peaks_ucsc_chr6.bed

## ----usingMartToFindFeaturesOfInterest,eval=FALSE------------------------
#  listAttributes(mart)[1:3,]
#  ds = useDataset('mmusculus_gene_ensembl', mart=mart)
#  chroms = 6
#  
#  egs = getBM(attributes = c('ensembl_gene_id','external_gene_id',
#                             'chromosome_name','start_position',
#                             'end_position','strand'),
#              filters='chromosome_name',
#              values=chroms,
#              mart=ds)

