% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tr2g.R
\name{tr2g_gtf}
\alias{tr2g_gtf}
\title{Get transcript and gene info from GTF file}
\usage{
tr2g_gtf(
  file,
  Genome = NULL,
  get_transcriptome = TRUE,
  out_path = ".",
  write_tr2g = TRUE,
  transcript_id = "transcript_id",
  gene_id = "gene_id",
  gene_name = "gene_name",
  transcript_version = "transcript_version",
  gene_version = "gene_version",
  version_sep = ".",
  transcript_biotype_col = "transcript_biotype",
  gene_biotype_col = "gene_biotype",
  transcript_biotype_use = "all",
  gene_biotype_use = "all",
  chrs_only = TRUE,
  compress_fa = FALSE,
  save_filtered_gtf = TRUE,
  overwrite = FALSE
)
}
\arguments{
\item{file}{Path to a GTF file to be read. The file can remain gzipped. Use
\code{getGTF} from the \code{biomartr} package to download GTF files
from Ensembl, and use \code{getGFF} from \code{biomartr} to download
GFF3 files from Ensembl and RefSeq.}

\item{Genome}{Either a \code{\link{BSgenome}} or a \code{\link{XStringSet}}
object of genomic sequences, where the intronic sequences will be extracted
from. Use \code{\link{genomeStyles}} to check which styles are supported for
your organism of interest; supported styles can be interconverted. If the
style in your genome or annotation is not supported, then the style of
chromosome names in the genome and annotation should be manually set to be
consistent.}

\item{get_transcriptome}{Logical, whether to extract transcriptome from
genome with the GTF file. If filtering biotypes or chromosomes, the filtered
\code{GRanges} will be used to extract transcriptome.}

\item{out_path}{Directory to save the outputs written to disk. If this
directory does not exist, then it will be created. Defaults to the current
working directory.}

\item{write_tr2g}{Logical, whether to write tr2g to disk. If \code{TRUE}, then
a file \code{tr2g.tsv} will be written into \code{out_path}.}

\item{transcript_id}{Character vector of length 1. Tag in \code{attribute}
field corresponding to transcript IDs. This argument must be supplied and
cannot be \code{NA} or \code{NULL}. Will throw error if tag indicated in this
argument does not exist.}

\item{gene_id}{Character vector of length 1. Tag in \code{attribute}
field corresponding to gene IDs. This argument must be supplied and
cannot be \code{NA} or \code{NULL}. Note that this is different from gene
symbols, which do not have to be unique. This can be Ensembl or Entrez IDs.
However, if the gene symbols are in fact unique for each gene, you may
supply the tag for human readable gene symbols to this argument. Will throw
error if tag indicated in this argument does not exist. This is typically
"gene_id" for annotations from Ensembl and "gene" for RefSeq.}

\item{gene_name}{Character vector of length 1. Tag in \code{attribute}
field corresponding to gene symbols. This argument can be \code{NA} or
\code{NULL} if you are fine with non-human readable gene IDs and do not wish
to extract human readable gene symbols.}

\item{transcript_version}{Character vector of length 1. Tag in \code{attribute}
field corresponding to \emph{transcript} version number. If your GTF file does not
include transcript version numbers, or if you do not wish to include the
version number, then use \code{NULL} for this argument. To decide whether to
include transcript version number, check whether version numbers are included
in the \code{transcripts.txt} in the \code{kallisto} output directory. If that file
includes version numbers, then transcript version numbers must be included
here as well. If that file does not include version numbers, then transcript
version numbers must not be included here.}

\item{gene_version}{Character vector of length 1. Tag in \code{attribute}
field corresponding to \emph{gene} version number. If your GTF file does not
include gene version numbers, or if you do not wish to include the
version number, then use \code{NULL} for this argument. Unlike transcript
version number, it's up to you whether to include gene version number.}

\item{version_sep}{Character separating the main ID from the version
number. Defaults to ".", as in Ensembl.}

\item{transcript_biotype_col}{Character vector of length 1. Tag in
\code{attribute} field corresponding to \emph{transcript} biotype.}

\item{gene_biotype_col}{Character vector of length 1. Tag in \code{attribute}
field corresponding to \emph{gene} biotype.}

\item{transcript_biotype_use}{Character, can be "all" or
a vector of \emph{transcript} biotypes to be used. Transcript biotypes aren't
entirely the same as gene biotypes. For instance, in Ensembl annotation,
\code{retained_intron} is a transcript biotype, but not a gene biotype. If
"cellranger", then a warning will be given. See \code{data("ensembl_tx_biotypes")}
for all available transcript biotypes from Ensembl.}

\item{gene_biotype_use}{Character, can be "all", "cellranger", or
a vector of \emph{gene} biotypes to be used. If "cellranger", then the biotypes
used by Cell Ranger's reference are used. See \code{data("cellranger_biotypes")}
for gene biotypes the Cell Ranger reference uses. See
\code{data("ensembl_gene_biotypes")} for all available gene biotypes from Ensembl.
Note that gene biotypes and transcript biotypes are not always the same.}

\item{chrs_only}{Logical, whether to include chromosomes only, since GTF and
GFF files can contain annotations for scaffolds, which are not incorporated
into chromosomes. This will also exclude haplotypes. Defaults to \code{TRUE}.
Only applicable to species found in \code{genomeStyles()}.}

\item{compress_fa}{Logical, whether to compress the output fasta file. If
\code{TRUE}, then the fasta file will be gzipped.}

\item{save_filtered_gtf}{Logical. If filtering type, biotypes, and/or
chromosomes, whether to save the filtered \code{GRanges} as a GTF file.}

\item{overwrite}{Logical, whether to overwrite if files with names of outputs
written to disk already exist.}
}
\value{
A data frame with at least 2 columns: \code{gene} for gene ID,
\code{transcript} for transcript ID, and optionally, \code{gene_name} for
gene names.
}
\description{
This function reads a GTF file and extracts the transcript ID and
corresponding gene ID. This function assumes that the GTF file is properly
formatted. See \url{http://mblab.wustl.edu/GTF2.html} for a detailed
description of proper GTF format. Note that GFF3 files have a somewhat
different and more complicated format in the attribute field, which this
function does not support. See \url{http://gmod.org/wiki/GFF3} for a detailed
description of proper GFF3 format. To extract transcript and gene information
from GFF3 files, see the function \code{\link{tr2g_gff3}} in this package.
}
\details{
Transcript and gene versions may not be present in all GTF files, so these
arguments are optional. This function has arguments for transcript and gene
version numbers because Ensembl IDs have version numbers. For Ensembl IDs, we
recommend including the version number, since a change in version number
signals a change in the entity referred to by the ID after reannotation. If a
version is used, then it will be appended to the ID, separated by
\code{version_sep}.

The transcript and gene IDs are stored in the \code{attribute} field (the last
field) of GTF files, which can be complicated and inconsistent across different
sources. Please check the \code{attribute} tags in your GTF file and consider
the arguments of this function carefully. The defaults are set according to
Ensembl GTF files; defaults may not work for files from other sources. Due to
the general lack of standards for the \code{attribute} field, you may need to
further clean up the output of this function.
}
\examples{
toy_path <- system.file("testdata", package = "BUSpaRse")
file_use <- paste(toy_path, "gtf_test.gtf", sep = "/")
# Default
tr2g <- tr2g_gtf(file = file_use, get_transcriptome = FALSE,
  write_tr2g = FALSE, save_filtered_gtf = FALSE)
# Excluding version numbers
tr2g <- tr2g_gtf(file = file_use, transcript_version = NULL,
  gene_version = NULL, get_transcriptome = FALSE,
  write_tr2g = FALSE, save_filtered_gtf = FALSE)
}
\seealso{
ensembl_gene_biotypes ensembl_tx_biotypes cellranger_biotypes

Other functions to retrieve transcript and gene info: 
\code{\link{sort_tr2g}()},
\code{\link{tr2g_EnsDb}()},
\code{\link{tr2g_TxDb}()},
\code{\link{tr2g_ensembl}()},
\code{\link{tr2g_fasta}()},
\code{\link{tr2g_gff3}()},
\code{\link{transcript2gene}()}
}
\concept{functions to retrieve transcript and gene info}
