|
| 1 | +\name{makeGPosFromDataFrame} |
| 2 | + |
| 3 | +\alias{makeGPosFromDataFrame} |
| 4 | + |
| 5 | +\alias{coerce,data.frame,GPos-method} |
| 6 | +\alias{coerce,DataFrame,GPos-method} |
| 7 | + |
| 8 | +\title{Make a GPos object from a data.frame or DataFrame} |
| 9 | + |
| 10 | +\description{ |
| 11 | + \code{makeGPosFromDataFrame} takes a data-frame-like object as |
| 12 | + input and tries to automatically find the columns that describe |
| 13 | + a genomic position. It returns them as a \link{GPos} object. |
| 14 | + |
| 15 | + \code{makeGPosFromDataFrame} is also the workhorse behind the |
| 16 | + coercion method from data.frame (or \link[S4Vectors]{DataFrame}) to |
| 17 | + \link{GPos}. |
| 18 | +} |
| 19 | + |
| 20 | +\usage{ |
| 21 | +makeGPosFromDataFrame(df, |
| 22 | + keep.extra.columns=FALSE, |
| 23 | + ignore.strand=FALSE, |
| 24 | + seqinfo=NULL, |
| 25 | + seqnames.field=c("seqnames", "seqname", |
| 26 | + "chromosome", "chrom", |
| 27 | + "chr", "chromosome_name", |
| 28 | + "seqid"), |
| 29 | + start.field=c("start", "pos"), |
| 30 | + end.field=c("end", "stop", "pos"), |
| 31 | + strand.field="strand", |
| 32 | + starts.in.df.are.0based=FALSE) |
| 33 | +} |
| 34 | + |
| 35 | +\arguments{ |
| 36 | + \item{df}{ |
| 37 | + A data.frame or \link[S4Vectors]{DataFrame} object. If not, then |
| 38 | + the function first tries to turn \code{df} into a data frame with |
| 39 | + \code{as.data.frame(df)}. |
| 40 | + } |
| 41 | + \item{keep.extra.columns}{ |
| 42 | + \code{TRUE} or \code{FALSE} (the default). |
| 43 | + If \code{TRUE}, the columns in \code{df} that are not used to form |
| 44 | + the genomic ranges of the returned \link{GPos} object are then |
| 45 | + returned as metadata columns on the object. Otherwise, they are ignored. |
| 46 | + If \code{df} has a \code{width} column, then it's always ignored. |
| 47 | + } |
| 48 | + \item{ignore.strand}{ |
| 49 | + \code{TRUE} or \code{FALSE} (the default). |
| 50 | + If \code{TRUE}, then the strand of the returned \link{GPos} object |
| 51 | + is set to \code{"*"}. |
| 52 | + } |
| 53 | + \item{seqinfo}{ |
| 54 | + Either \code{NULL}, or a \link[GenomeInfoDb]{Seqinfo} object, |
| 55 | + or a character vector of unique sequence names (a.k.a. \emph{seqlevels}), |
| 56 | + or a named numeric vector of sequence lengths. |
| 57 | + When not \code{NULL}, \code{seqinfo} must be compatible with the genomic |
| 58 | + ranges in \code{df}, that is, it must have one entry for each unique |
| 59 | + sequence name represented in \code{df}. Note that it can have additional |
| 60 | + entries i.e. entries for seqlevels not represented in \code{df}. |
| 61 | + } |
| 62 | + \item{seqnames.field}{ |
| 63 | + A character vector of recognized names for the column in \code{df} |
| 64 | + that contains the chromosome name (a.k.a. sequence name) associated |
| 65 | + with each genomic range. |
| 66 | + Only the first name in \code{seqnames.field} that is found |
| 67 | + in \code{colnames(df)} is used. |
| 68 | + If none is found, then an error is raised. |
| 69 | + } |
| 70 | + \item{start.field}{ |
| 71 | + A character vector of recognized names for the column in \code{df} |
| 72 | + that contains the start positions of the genomic ranges. |
| 73 | + Only the first name in \code{start.field} that is found |
| 74 | + in \code{colnames(df)} is used. |
| 75 | + If none is found, then an error is raised. |
| 76 | + } |
| 77 | + \item{end.field}{ |
| 78 | + A character vector of recognized names for the column in \code{df} |
| 79 | + that contains the end positions of the genomic ranges. |
| 80 | + Only the first name in \code{end.field} that is found |
| 81 | + in \code{colnames(df)} is used. |
| 82 | + If none is found, then an error is raised. |
| 83 | + } |
| 84 | + \item{strand.field}{ |
| 85 | + A character vector of recognized names for the column in \code{df} |
| 86 | + that contains the strand associated with each genomic range. |
| 87 | + Only the first name in \code{strand.field} that is found |
| 88 | + in \code{colnames(df)} is used. |
| 89 | + If none is found or if \code{ignore.strand} is \code{TRUE}, |
| 90 | + then the strand of the returned \link{GPos} object is |
| 91 | + set to \code{"*"}. |
| 92 | + } |
| 93 | + \item{starts.in.df.are.0based}{ |
| 94 | + \code{TRUE} or \code{FALSE} (the default). |
| 95 | + If \code{TRUE}, then the start positions of the genomic ranges in |
| 96 | + \code{df} are considered to be \emph{0-based} and are converted to |
| 97 | + \emph{1-based} in the returned \link{GPos} object. |
| 98 | + This feature is intended to make it more convenient to handle input |
| 99 | + that contains data obtained from resources using the "0-based |
| 100 | + start" convention. A notorious example of such a resource is the UCSC |
| 101 | + Table Browser (\url{http://genome.ucsc.edu/cgi-bin/hgTables}). |
| 102 | + } |
| 103 | +} |
| 104 | +
|
| 105 | +\value{ |
| 106 | + A \link{GPos} object with one element per row in the input. |
| 107 | +
|
| 108 | + If the \code{seqinfo} argument was supplied, the returned object will |
| 109 | + have exactly the seqlevels specified in \code{seqinfo} and in the same |
| 110 | + order. Otherwise, the seqlevels are ordered according to the output of |
| 111 | + the \code{\link[GenomeInfoDb]{rankSeqlevels}} function (except if |
| 112 | + \code{df} contains the seqnames in the form of a factor-Rle, in which |
| 113 | + case the levels of the factor-Rle become the seqlevels of the returned |
| 114 | + object and with no re-ordering). |
| 115 | +
|
| 116 | + If \code{df} has non-automatic row names (i.e. \code{rownames(df)} is |
| 117 | + not \code{NULL} and is not \code{seq_len(nrow(df))}), then they will be |
| 118 | + used to set names on the returned \link{GPos} object. |
| 119 | +} |
| 120 | +
|
| 121 | +\note{ |
| 122 | + Coercing a data.frame or \link[S4Vectors]{DataFrame} \code{df} into |
| 123 | + a \link{GPos} object (with \code{as(df, "GPos")}), or |
| 124 | + calling \code{GPos(df)}, are both equivalent to calling |
| 125 | + \code{makeGPosFromDataFrame(df, keep.extra.columns=TRUE)}. |
| 126 | +} |
| 127 | +
|
| 128 | +\author{ |
| 129 | + H. Pagès, based on a proposal by Kasper Daniel Hansen |
| 130 | +} |
| 131 | +
|
| 132 | +\seealso{ |
| 133 | + \itemize{ |
| 134 | + \item \link{GPos} objects. |
| 135 | +
|
| 136 | + \item \link[GenomeInfoDb]{Seqinfo} objects and the |
| 137 | + \code{\link[GenomeInfoDb]{rankSeqlevels}} function in the |
| 138 | + \pkg{GenomeInfoDb} package. |
| 139 | +
|
| 140 | + \item The \code{\link[rtracklayer]{getTable}} function in the |
| 141 | + \pkg{rtracklayer} package for an R interface to the UCSC |
| 142 | + Table Browser. |
| 143 | +
|
| 144 | + \item \link[S4Vectors]{DataFrame} objects in the \pkg{S4Vectors} package. |
| 145 | + } |
| 146 | +} |
| 147 | +
|
| 148 | +\examples{ |
| 149 | +## --------------------------------------------------------------------- |
| 150 | +## BASIC EXAMPLES |
| 151 | +## --------------------------------------------------------------------- |
| 152 | +
|
| 153 | +df <- data.frame(chr="chr1", pos = 11:15, score=1:5) |
| 154 | +makeGPosFromDataFrame(df) |
| 155 | +
|
| 156 | +df <- data.frame(chr="chr1", start=11:15, end=11:15, |
| 157 | + strand=c("+","-","+","*","."), score=1:5) |
| 158 | +df |
| 159 | +makeGPosFromDataFrame(df) # strand value "." is replaced with "*" |
| 160 | +
|
| 161 | +## The strand column is optional: |
| 162 | +df <- data.frame(chr="chr1", start=11:15, end=11:15, score=1:5) |
| 163 | +makeGPosFromDataFrame(df) |
| 164 | +
|
| 165 | +gr <- makeGPosFromDataFrame(df, keep.extra.columns=TRUE) |
| 166 | +gr2 <- as(df, "GPos") # equivalent to the above |
| 167 | +stopifnot(identical(gr, gr2)) |
| 168 | +gr2 <- GPos(df) # equivalent to the above |
| 169 | +stopifnot(identical(gr, gr2)) |
| 170 | +
|
| 171 | +makeGPosFromDataFrame(df, ignore.strand=TRUE) |
| 172 | +makeGPosFromDataFrame(df, keep.extra.columns=TRUE, |
| 173 | + ignore.strand=TRUE) |
| 174 | +
|
| 175 | +makeGPosFromDataFrame(df, seqinfo=paste0("chr", 4:1)) |
| 176 | +makeGPosFromDataFrame(df, seqinfo=c(chrM=NA, chr1=500, chrX=100)) |
| 177 | +makeGPosFromDataFrame(df, seqinfo=Seqinfo(paste0("chr", 4:1))) |
| 178 | +
|
| 179 | +## --------------------------------------------------------------------- |
| 180 | +## ABOUT AUTOMATIC DETECTION OF THE seqnames/start/end/strand COLUMNS |
| 181 | +## --------------------------------------------------------------------- |
| 182 | +
|
| 183 | +## Automatic detection of the seqnames/start/end/strand columns is |
| 184 | +## case insensitive: |
| 185 | +df <- data.frame(ChRoM="chr1", StarT=11:15, stoP=11:15, |
| 186 | + STRAND=c("+","-","+","*","."), score=1:5) |
| 187 | +makeGPosFromDataFrame(df) |
| 188 | +
|
| 189 | +## It also ignores a common prefix between the start and end columns: |
| 190 | +df <- data.frame(seqnames="chr1", tx_start=11:15, tx_end=11:15, |
| 191 | + strand=c("+","-","+","*","."), score=1:5) |
| 192 | +makeGPosFromDataFrame(df) |
| 193 | +
|
| 194 | +## The common prefix between the start and end columns is used to |
| 195 | +## disambiguate between more than one seqnames column: |
| 196 | +df <- data.frame(chrom="chr1", tx_start=11:15, tx_end=11:15, |
| 197 | + tx_chr="chr2", score=1:5) |
| 198 | +makeGPosFromDataFrame(df) |
| 199 | +} |
| 200 | +
|
| 201 | +\keyword{manip} |
0 commit comments