The material in this course requires R version 3.3 and Bioconductor version 3.4
stopifnot(
getRversion() >= '3.3' && getRversion() < '3.4',
BiocInstaller::biocVersion() == "3.4"
)
TxDb
packagese.g., `{r Biocpkg(“TxDb.Hsapiens.UCSC.hg19.knownGene”)}
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
## [1] $ $<- ExpressionSet annotatedDataFrameFrom
## [5] as.list asBED asGFF assayData
## [9] assayData<- cds cdsBy cdsByOverlaps
## [13] coerce columns combine contents
## [17] dbInfo dbconn dbfile dbmeta
## [21] dbschema disjointExons distance exons
## [25] exonsBy exonsByOverlaps extractUpstreamSeqs featureNames
## [29] featureNames<- fiveUTRsByTranscript genes initialize
## [33] intronsByTranscript isActiveSeq isActiveSeq<- isNA
## [37] keys keytypes locateVariants mapIds
## [41] mapIdsToRanges mapRangesToIds mapToTranscripts mappedkeys
## [45] metadata microRNAs nhit organism
## [49] predictCoding promoters revmap sample
## [53] sampleNames sampleNames<- saveDb select
## [57] seqinfo seqlevels0 seqlevels<- show
## [61] species storageMode storageMode<- summarizeVariants
## [65] tRNAs taxonomyId threeUTRsByTranscript transcripts
## [69] transcriptsBy transcriptsByOverlaps updateObject
## see '?methods' for accessing help and source code
TxDb
objects
dbfile(txdb)
GenomicFeatures::makeTxDbFrom*()
Accessing gene models
exons()
, transcripts()
, genes()
, cds()
(coding sequence)promoters()
& friendsexonsBy()
& friends – exons by gene, transcript, …keytypes()
, columns()
, keys()
, select()
, mapIds()
exons(txdb)
## GRanges object with 289969 ranges and 1 metadata column:
## seqnames ranges strand | exon_id
## <Rle> <IRanges> <Rle> | <integer>
## [1] chr1 [11874, 12227] + | 1
## [2] chr1 [12595, 12721] + | 2
## [3] chr1 [12613, 12721] + | 3
## [4] chr1 [12646, 12697] + | 4
## [5] chr1 [13221, 14409] + | 5
## ... ... ... ... . ...
## [289965] chrUn_gl000241 [35706, 35859] - | 289965
## [289966] chrUn_gl000241 [36711, 36875] - | 289966
## [289967] chrUn_gl000243 [11501, 11530] + | 289967
## [289968] chrUn_gl000243 [13608, 13637] + | 289968
## [289969] chrUn_gl000247 [ 5787, 5816] - | 289969
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12613, 12721] + | 3 <NA> 2
## [3] chr1 [13221, 14409] + | 5 <NA> 3
##
## $2
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12595, 12721] + | 2 <NA> 2
## [3] chr1 [13403, 14409] + | 6 <NA> 3
##
## $3
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12646, 12697] + | 4 <NA> 2
## [3] chr1 [13221, 14409] + | 5 <NA> 3
##
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
BSgenome
packagese.g., `{r Biocpkg(“BSgenome.Hsapiens.UCSC.hg19”)}
library(BSgenome.Hsapiens.UCSC.hg19)
genome <- BSgenome.Hsapiens.UCSC.hg19
getSeq(genome, exons(txdb)[1:100])
## A DNAStringSet instance of length 100
## width seq
## [1] 354 CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGT...AAAGATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCA
## [2] 127 GCTCCTGTCTCCCCCCAGGTGTGTGGTGATGCCAGGCATGCCC...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
## [3] 109 GTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCAGGTCT...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
## [4] 52 CATCAGGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTT
## [5] 1189 GCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCT...ACAGCTAGAGATCCTTTATTAAAAGCACACTGTTGGTTTCTG
## ... ... ...
## [96] 251 CCCGTCCCCGGCGCGGCCCGCGCGCTCCTCCGCCGCCTCTCGC...ATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCAAG
## [97] 262 GTTCGGGTCTGGCGGTACTTGAAGGGCAAAGACCTGGTGGCCC...TCACCCTGCGGAACCTGGAGGAGGTGGAGTTCTGTGTGGAAG
## [98] 48 ATAAACCCGGGACCCACTTCACTCCAGTGCCTCCGACGCCTCCTGATG
## [99] 216 CGTGCCGGGGAATGCTGTGCGGCTTCGGCGCCGTGTGCGAGCC...GCCAGCAGCGCCGCATCCGCCTGCTCAGCCGCGGGCCGTGCG
## [100] 225 GCTCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAG...CCCGCCAGGAGAATGTCTTCAAGAAGTTCGACGGCCCTTGTG
OrgDb
library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2016-Mar14
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20160305
## | GOEGSOURCEDATE: 2016-Mar14
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2016-Mar9
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Wed Mar 23 15:50:09 2016
##
## Please see: help('select') for usage information
OrgDb
objects
TxDb
keytypes()
, columns()
, keys()
, select()
, mapIds()
select()
Specification of key type
select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENTREZID GENENAME
## 1 BRCA1 672 breast cancer 1
## 2 PTEN 5728 phosphatase and tensin homolog
keytypes(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID"
## [7] "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH"
## [19] "PFAM" "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID"
## [7] "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH"
## [19] "PFAM" "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
Related functionality
mapIds()
– special case for mapping from 1 identifier to anotherOrganismDb
objects: combined org.*
, TxDb.*
, and other annotation resources for easy access
library(Homo.sapiens)
## Loading required package: OrganismDbi
## Loading required package: GO.db
##
select(Homo.sapiens, c("BRCA1", "PTEN"),
c("TXNAME", "TXCHROM", "TXSTART", "TXEND"),
"SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
## SYMBOL TXNAME TXCHROM TXSTART TXEND
## 1 BRCA1 uc010whl.2 chr17 41196312 41276132
## 2 BRCA1 uc002icp.4 chr17 41196312 41277340
## 3 BRCA1 uc010whm.2 chr17 41196312 41277340
## 4 BRCA1 uc002icu.3 chr17 41196312 41277468
## 5 BRCA1 uc010cyx.3 chr17 41196312 41277468
## 6 BRCA1 uc002icq.3 chr17 41196312 41277500
## 7 BRCA1 uc002ict.3 chr17 41196312 41277500
## 8 BRCA1 uc010whn.2 chr17 41196312 41277500
## 9 BRCA1 uc010who.3 chr17 41196312 41277500
## 10 BRCA1 uc010whp.2 chr17 41196312 41322420
## 11 BRCA1 uc010whq.1 chr17 41215350 41256973
## 12 BRCA1 uc002idc.1 chr17 41215350 41277468
## 13 BRCA1 uc010whr.1 chr17 41215350 41277468
## 14 BRCA1 uc002idd.3 chr17 41243117 41276132
## 15 BRCA1 uc002ide.1 chr17 41243452 41256973
## 16 BRCA1 uc010cyy.1 chr17 41243452 41277340
## 17 BRCA1 uc010whs.1 chr17 41243452 41277468
## 18 BRCA1 uc010cyz.2 chr17 41243452 41277500
## 19 BRCA1 uc010cza.2 chr17 41243452 41277500
## 20 BRCA1 uc010wht.1 chr17 41243452 41277500
## 21 PTEN uc001kfb.3 chr10 89623195 89728532
## 22 PTEN uc021pvw.1 chr10 89623195 89728532
biomaRt
, AnnotationHub
http://biomart.org; Bioconductor package biomaRt
## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3) ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <- ## fully specified mart
useMart("ensembl", dataset = "hsapiens_gene_ensembl")
head(listFilters(ensembl), 3) ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3) ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")
## assemble and query the mart
res <- getBM(attributes = myAttributes, filters = myFilter,
values = myValues, mart = ensembl)
Other internet resources
Example: Ensembl ‘GTF’ files to R / Bioconductor GRanges and TxDb
library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub) # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)
Example: non-model organism OrgDb
packages
library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")
Example: Map Roadmap epigenomic marks to hg38
Roadmap BED file as GRanges
library(AnnotationHub)
hub <- AnnotationHub()
query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
E126 <- hub[["AH29817"]]
UCSC ‘liftOver’ file to map coordinates
query(hub , c("hg19", "hg38", "chainfile"))
chain <- hub[["AH14150"]]
lift over – possibly one-to-many mapping, so GRanges to GRangesList
library(rtracklayer)
E126hg38 <- liftOver(E126, chain)
E126hg38
Example: read variants from a VCF file, and annotate with respect to a known gene model
## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
TxDb.Hsapiens.UCSC.hg19.knownGene,
CodingVariants())
head(coding)
## GRanges object with 6 ranges and 9 metadata columns:
## seqnames ranges strand | LOCATION LOCSTART LOCEND QUERYID TXID
## <Rle> <IRanges> <Rle> | <factor> <integer> <integer> <integer> <character>
## 1 chr22 [50301422, 50301422] - | coding 939 939 24 75253
## 2 chr22 [50301476, 50301476] - | coding 885 885 25 75253
## 3 chr22 [50301488, 50301488] - | coding 873 873 26 75253
## 4 chr22 [50301494, 50301494] - | coding 867 867 27 75253
## 5 chr22 [50301584, 50301584] - | coding 777 777 28 75253
## 6 chr22 [50302962, 50302962] - | coding 698 698 57 75253
## CDSID GENEID PRECEDEID FOLLOWID
## <IntegerList> <character> <CharacterList> <CharacterList>
## 1 218562 79087
## 2 218562 79087
## 3 218562 79087
## 4 218562 79087
## 5 218562 79087
## 6 218563 79087
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Acknowledgements
sessionInfo()
sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
## [4] LC_COLLATE=C LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 parallel stats graphics grDevices utils datasets methods
## [10] base
##
## other attached packages:
## [1] Homo.sapiens_1.3.1 GO.db_3.3.0
## [3] OrganismDbi_1.15.1 AnnotationHub_2.5.4
## [5] Gviz_1.17.4 biomaRt_2.29.2
## [7] org.Hs.eg.db_3.3.0 BiocParallel_1.7.4
## [9] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 GenomicFeatures_1.25.14
## [11] AnnotationDbi_1.35.3 VariantAnnotation_1.19.2
## [13] RNAseqData.HNRNPC.bam.chr14_0.11.0 GenomicAlignments_1.9.4
## [15] Rsamtools_1.25.0 SummarizedExperiment_1.3.5
## [17] Biobase_2.33.0 BSgenome.Hsapiens.UCSC.hg19_1.4.0
## [19] BSgenome_1.41.2 rtracklayer_1.33.7
## [21] GenomicRanges_1.25.8 GenomeInfoDb_1.9.1
## [23] Biostrings_2.41.4 XVector_0.13.2
## [25] IRanges_2.7.11 S4Vectors_0.11.7
## [27] BiocGenerics_0.19.1 ggplot2_2.1.0
## [29] BiocStyle_2.1.10
##
## loaded via a namespace (and not attached):
## [1] httr_1.2.0 splines_3.3.0 Formula_1.2-1
## [4] shiny_0.13.2 interactiveDisplayBase_1.11.3 latticeExtra_0.6-28
## [7] RBGL_1.49.1 yaml_2.1.13 RSQLite_1.0.0
## [10] lattice_0.20-33 biovizBase_1.21.0 chron_2.3-47
## [13] digest_0.6.9 RColorBrewer_1.1-2 colorspace_1.2-6
## [16] htmltools_0.3.5 httpuv_1.3.3 Matrix_1.2-6
## [19] plyr_1.8.4 XML_3.98-1.4 zlibbioc_1.19.0
## [22] xtable_1.8-2 scales_0.4.0 nnet_7.3-12
## [25] survival_2.39-4 magrittr_1.5 mime_0.4
## [28] evaluate_0.9 foreign_0.8-66 graph_1.51.0
## [31] BiocInstaller_1.23.4 tools_3.3.0 data.table_1.9.6
## [34] formatR_1.4 matrixStats_0.50.2 stringr_1.0.0
## [37] munsell_0.4.3 cluster_2.0.4 ensembldb_1.5.8
## [40] RCurl_1.95-4.8 dichromat_2.0-0 bitops_1.0-6
## [43] labeling_0.3 rmarkdown_0.9.6 gtable_0.2.0
## [46] codetools_0.2-14 DBI_0.4-1 reshape2_1.4.1
## [49] R6_2.1.2 gridExtra_2.2.1 knitr_1.13
## [52] Hmisc_3.17-4 stringi_1.1.1 Rcpp_0.12.5
## [55] rpart_4.1-10 acepack_1.3-3.3