From 052f70f7b418a823c524486be7c611582fcf980f Mon Sep 17 00:00:00 2001 From: LiNk-NY Date: Mon, 3 Feb 2020 18:12:55 -0500 Subject: [PATCH 1/2] add helper makeGPosFromDataFrame --- DESCRIPTION | 1 + NAMESPACE | 1 + R/GPos-class.R | 2 +- R/makeGPosFromDataFrame.R | 37 ++++++ R/makeGRangesFromDataFrame.R | 34 +++++- man/GenomicRangesList-class.Rd | 2 +- man/makeGPosFromDataFrame.Rd | 201 +++++++++++++++++++++++++++++++++ 7 files changed, 271 insertions(+), 7 deletions(-) create mode 100644 R/makeGPosFromDataFrame.R create mode 100644 man/makeGPosFromDataFrame.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 0d6e0728..5e8dfeee 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,6 +53,7 @@ Collate: normarg-utils.R GenomicRangesList-class.R GRangesList-class.R makeGRangesFromDataFrame.R + makeGPosFromDataFrame.R makeGRangesListFromDataFrame.R RangedData-methods.R findOverlaps-methods.R diff --git a/NAMESPACE b/NAMESPACE index 4b7e75b4..6a444ab2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -134,6 +134,7 @@ export( GNCList, GenomicRangesList, GRangesList, + makeGPosFromDataFrame, makeGRangesFromDataFrame, makeGRangesListFromDataFrame, makeGRangesListFromFeatureFragments, diff --git a/R/GPos-class.R b/R/GPos-class.R index fe36a06a..81eb2ba3 100644 --- a/R/GPos-class.R +++ b/R/GPos-class.R @@ -176,7 +176,7 @@ GPos <- function(seqnames=NULL, pos=NULL, strand=NULL, if (is(x_ranges, "IRanges")) # i.e. 
'x' is not a GPos strand <- rep.int(strand, width(x_ranges)) } - if (length(mcols) == 0L && is(x, "GPos")) + if (length(mcols) == 0L && inherits(x, "GenomicRanges")) mcols <- mcols(x, use.names=FALSE) if (is.null(seqinfo)) seqinfo <- seqinfo(x) diff --git a/R/makeGPosFromDataFrame.R b/R/makeGPosFromDataFrame.R new file mode 100644 index 00000000..ff79f5b3 --- /dev/null +++ b/R/makeGPosFromDataFrame.R @@ -0,0 +1,37 @@ +### ========================================================================= +### makeGPosFromDataFrame() +### ------------------------------------------------------------------------- + +### 'df' must be a data.frame or DataFrame object. +makeGPosFromDataFrame <- function(df, + keep.extra.columns=FALSE, + ignore.strand=FALSE, + seqinfo=NULL, + seqnames.field=c("seqnames", "seqname", + "chromosome", "chrom", + "chr", "chromosome_name", + "seqid"), + start.field=c("start", "pos"), + end.field=c("end", "stop", "pos"), + strand.field="strand", + starts.in.df.are.0based=FALSE) +{ + .makeXFromDataFrame(df = df, x = "GPos", + keep.extra.columns=keep.extra.columns, + ignore.strand=ignore.strand, + seqinfo=seqinfo, + seqnames.field=seqnames.field, + start.field=start.field, + end.field=end.field, + strand.field=strand.field, + starts.in.df.are.0based=starts.in.df.are.0based) +} + +setAs("data.frame", "GPos", + function(from) makeGPosFromDataFrame(from, keep.extra.columns=TRUE) +) + +setAs("DataFrame", "GPos", + function(from) makeGPosFromDataFrame(from, keep.extra.columns=TRUE) +) + diff --git a/R/makeGRangesFromDataFrame.R b/R/makeGRangesFromDataFrame.R index 4226dd2f..ec53fcf2 100644 --- a/R/makeGRangesFromDataFrame.R +++ b/R/makeGRangesFromDataFrame.R @@ -74,7 +74,7 @@ .find_strand_col <- function(df_colnames, strand.field, prefix) { idx <- which(df_colnames %in% paste0(prefix, strand.field)) - if (length(idx) == 0L) + if (length(idx) == 0L) idx <- which(df_colnames %in% strand.field) if (length(idx) == 0L) return(NA_integer_) @@ -144,7 +144,6 @@ ans } 
-### 'df' must be a data.frame or DataFrame object. makeGRangesFromDataFrame <- function(df, keep.extra.columns=FALSE, ignore.strand=FALSE, @@ -158,7 +157,32 @@ makeGRangesFromDataFrame <- function(df, strand.field="strand", starts.in.df.are.0based=FALSE) { - ## Check args. + .makeXFromDataFrame(df = df, x = "GRanges", + keep.extra.columns=keep.extra.columns, + ignore.strand=ignore.strand, + seqinfo=seqinfo, + seqnames.field=seqnames.field, + start.field=start.field, + end.field=end.field, + strand.field=strand.field, + starts.in.df.are.0based=starts.in.df.are.0based) +} + +.makeXFromDataFrame <- function(df, x = c("GRanges", "GPos"), + keep.extra.columns=FALSE, + ignore.strand=FALSE, + seqinfo=NULL, + seqnames.field=c("seqnames", "seqname", + "chromosome", "chrom", + "chr", "chromosome_name", + "seqid"), + start.field="start", + end.field=c("end", "stop"), + strand.field="strand", + starts.in.df.are.0based=FALSE) +{ +### 'df' must be a data.frame or DataFrame object. + ## Check args. if (is.character(df)) # for people that provide the path to a file stop("'df' must be a data.frame or DataFrame object") if (!(is.data.frame(df) || is(df, "DataFrame"))) @@ -177,7 +201,7 @@ makeGRangesFromDataFrame <- function(df, end.field=end.field, strand.field=strand.field, ignore.strand=ignore.strand) - + FUN <- switch(x, GRanges = GRanges, GPos = GPos) ## Prepare 'ans_seqnames'. ans_seqnames <- df[[granges_cols[["seqnames"]]]] @@ -230,7 +254,7 @@ makeGRangesFromDataFrame <- function(df, } ## Make and return the GRanges object. - GRanges(ans_seqnames, ans_ranges, strand=ans_strand, + FUN(ans_seqnames, ans_ranges, strand=ans_strand, ans_mcols, seqinfo=ans_seqinfo) } diff --git a/man/GenomicRangesList-class.Rd b/man/GenomicRangesList-class.Rd index e8ca4a29..0387f678 100644 --- a/man/GenomicRangesList-class.Rd +++ b/man/GenomicRangesList-class.Rd @@ -89,7 +89,7 @@ } Note that the \emph{Vector class hierarchy} has many more classes. 
In particular \link[S4Vectors]{Vector}, \link[S4Vectors]{List}, - \link[IRanges]{RangesList}, and \link[IRanges]{IntegerRangesList} + \link[IRanges]{IRangesList}, and \link[IRanges]{IntegerRangesList} have other subclasses not shown here. } diff --git a/man/makeGPosFromDataFrame.Rd b/man/makeGPosFromDataFrame.Rd new file mode 100644 index 00000000..0f7961c0 --- /dev/null +++ b/man/makeGPosFromDataFrame.Rd @@ -0,0 +1,201 @@ +\name{makeGPosFromDataFrame} + +\alias{makeGPosFromDataFrame} + +\alias{coerce,data.frame,GPos-method} +\alias{coerce,DataFrame,GPos-method} + +\title{Make a GPos object from a data.frame or DataFrame} + +\description{ + \code{makeGPosFromDataFrame} takes a data-frame-like object as + input and tries to automatically find the columns that describe + a genomic position. It returns them as a \link{GPos} object. + + \code{makeGPosFromDataFrame} is also the workhorse behind the + coercion method from data.frame (or \link[S4Vectors]{DataFrame}) to + \link{GPos}. +} + +\usage{ +makeGPosFromDataFrame(df, + keep.extra.columns=FALSE, + ignore.strand=FALSE, + seqinfo=NULL, + seqnames.field=c("seqnames", "seqname", + "chromosome", "chrom", + "chr", "chromosome_name", + "seqid"), + start.field=c("start", "pos"), + end.field=c("end", "stop", "pos"), + strand.field="strand", + starts.in.df.are.0based=FALSE) +} + +\arguments{ + \item{df}{ + A data.frame or \link[S4Vectors]{DataFrame} object. If not, then + the function first tries to turn \code{df} into a data frame with + \code{as.data.frame(df)}. + } + \item{keep.extra.columns}{ + \code{TRUE} or \code{FALSE} (the default). + If \code{TRUE}, the columns in \code{df} that are not used to form + the genomic ranges of the returned \link{GPos} object are then + returned as metadata columns on the object. Otherwise, they are ignored. + If \code{df} has a \code{width} column, then it's always ignored. + } + \item{ignore.strand}{ + \code{TRUE} or \code{FALSE} (the default). 
+ If \code{TRUE}, then the strand of the returned \link{GPos} object + is set to \code{"*"}. + } + \item{seqinfo}{ + Either \code{NULL}, or a \link[GenomeInfoDb]{Seqinfo} object, + or a character vector of unique sequence names (a.k.a. \emph{seqlevels}), + or a named numeric vector of sequence lengths. + When not \code{NULL}, \code{seqinfo} must be compatible with the genomic + ranges in \code{df}, that is, it must have one entry for each unique + sequence name represented in \code{df}. Note that it can have additional + entries i.e. entries for seqlevels not represented in \code{df}. + } + \item{seqnames.field}{ + A character vector of recognized names for the column in \code{df} + that contains the chromosome name (a.k.a. sequence name) associated + with each genomic range. + Only the first name in \code{seqnames.field} that is found + in \code{colnames(df)} is used. + If no one is found, then an error is raised. + } + \item{start.field}{ + A character vector of recognized names for the column in \code{df} + that contains the start positions of the genomic ranges. + Only the first name in \code{start.field} that is found + in \code{colnames(df)} is used. + If no one is found, then an error is raised. + } + \item{end.field}{ + A character vector of recognized names for the column in \code{df} + that contains the end positions of the genomic ranges. + Only the first name in \code{start.field} that is found + in \code{colnames(df)} is used. + If no one is found, then an error is raised. + } + \item{strand.field}{ + A character vector of recognized names for the column in \code{df} + that contains the strand associated with each genomic range. + Only the first name in \code{strand.field} that is found + in \code{colnames(df)} is used. + If no one is found or if \code{ignore.strand} is \code{TRUE}, + then the strand of the returned \link{GPos} object is + set to \code{"*"}. + } + \item{starts.in.df.are.0based}{ + \code{TRUE} or \code{FALSE} (the default). 
+ If \code{TRUE}, then the start positions of the genomic ranges in + \code{df} are considered to be \emph{0-based} and are converted to + \emph{1-based} in the returned \link{GPos} object. + This feature is intended to make it more convenient to handle input + that contains data obtained from resources using the "0-based + start" convention. A notorious example of such resource is the UCSC + Table Browser (\url{http://genome.ucsc.edu/cgi-bin/hgTables}). + } +} + +\value{ + A \link{GPos} object with one element per row in the input. + + If the \code{seqinfo} argument was supplied, the returned object will + have exactly the seqlevels specified in \code{seqinfo} and in the same + order. Otherwise, the seqlevels are ordered according to the output of + the \code{\link[GenomeInfoDb]{rankSeqlevels}} function (except if + \code{df} contains the seqnames in the form of a factor-Rle, in which + case the levels of the factor-Rle become the seqlevels of the returned + object and with no re-ordering). + + If \code{df} has non-automatic row names (i.e. \code{rownames(df)} is + not \code{NULL} and is not \code{seq_len(nrow(df))}), then they will be + used to set names on the returned \link{GPos} object. +} + +\note{ + Coercing data.frame or \link[S4Vectors]{DataFrame} \code{df} into + a \link{GPos} object (with \code{as(df, "GPos")}), or + calling \code{GPos(df)}, are both equivalent to calling + \code{makeGPosFromDataFrame(df, keep.extra.columns=TRUE)}. +} + +\author{ + H. Pagès, based on a proposal by Kasper Daniel Hansen +} + +\seealso{ + \itemize{ + \item \link{GPos} objects. + + \item \link[GenomeInfoDb]{Seqinfo} objects and the + \code{\link[GenomeInfoDb]{rankSeqlevels}} function in the + \pkg{GenomeInfoDb} package. + + \item The \code{\link[rtracklayer]{getTable}} function in the + \pkg{rtracklayer} package for an R interface to the UCSC + Table Browser. + + \item \link[S4Vectors]{DataFrame} objects in the \pkg{S4Vectors} package. 
+ } +} + +\examples{ +## --------------------------------------------------------------------- +## BASIC EXAMPLES +## --------------------------------------------------------------------- + +df <- data.frame(chr="chr1", pos = 11:15, score=1:5) +makeGPosFromDataFrame(df) + +df <- data.frame(chr="chr1", start=11:15, end=11:15, + strand=c("+","-","+","*","."), score=1:5) +df +makeGPosFromDataFrame(df) # strand value "." is replaced with "*" + +## The strand column is optional: +df <- data.frame(chr="chr1", start=11:15, end=11:15, score=1:5) +makeGPosFromDataFrame(df) + +gr <- makeGPosFromDataFrame(df, keep.extra.columns=TRUE) +gr2 <- as(df, "GPos") # equivalent to the above +stopifnot(identical(gr, gr2)) +gr2 <- GPos(df) # equivalent to the above +stopifnot(identical(gr, gr2)) + +makeGPosFromDataFrame(df, ignore.strand=TRUE) +makeGPosFromDataFrame(df, keep.extra.columns=TRUE, + ignore.strand=TRUE) + +makeGPosFromDataFrame(df, seqinfo=paste0("chr", 4:1)) +makeGPosFromDataFrame(df, seqinfo=c(chrM=NA, chr1=500, chrX=100)) +makeGPosFromDataFrame(df, seqinfo=Seqinfo(paste0("chr", 4:1))) + +## --------------------------------------------------------------------- +## ABOUT AUTOMATIC DETECTION OF THE seqnames/start/end/strand COLUMNS +## --------------------------------------------------------------------- + +## Automatic detection of the seqnames/start/end/strand columns is +## case insensitive: +df <- data.frame(ChRoM="chr1", StarT=11:15, stoP=11:15, + STRAND=c("+","-","+","*","."), score=1:5) +makeGPosFromDataFrame(df) + +## It also ignores a common prefix between the start and end columns: +df <- data.frame(seqnames="chr1", tx_start=11:15, tx_end=11:15, + strand=c("+","-","+","*","."), score=1:5) +makeGPosFromDataFrame(df) + +## The common prefix between the start and end columns is used to +## disambiguate between more than one seqnames column: +df <- data.frame(chrom="chr1", tx_start=11:15, tx_end=11:15, + tx_chr="chr2", score=1:5) +makeGPosFromDataFrame(df) +} + 
+\keyword{manip} From 46f5b824cae5cc18aa705c2b7eb301c75b6120b3 Mon Sep 17 00:00:00 2001 From: LiNk-NY Date: Tue, 4 Feb 2020 18:24:30 -0500 Subject: [PATCH 2/2] merge functionality into makeGRangesFromDataFrame --- DESCRIPTION | 1 - NAMESPACE | 1 - R/makeGPosFromDataFrame.R | 37 ---- R/makeGRangesFromDataFrame.R | 52 ++--- .../unitTests/test_makeGRangesFromDataFrame.R | 15 ++ man/makeGPosFromDataFrame.Rd | 201 ------------------ man/makeGRangesFromDataFrame.Rd | 41 +++- 7 files changed, 77 insertions(+), 271 deletions(-) delete mode 100644 R/makeGPosFromDataFrame.R delete mode 100644 man/makeGPosFromDataFrame.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 5e8dfeee..0d6e0728 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,7 +53,6 @@ Collate: normarg-utils.R GenomicRangesList-class.R GRangesList-class.R makeGRangesFromDataFrame.R - makeGPosFromDataFrame.R makeGRangesListFromDataFrame.R RangedData-methods.R findOverlaps-methods.R diff --git a/NAMESPACE b/NAMESPACE index 6a444ab2..4b7e75b4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -134,7 +134,6 @@ export( GNCList, GenomicRangesList, GRangesList, - makeGPosFromDataFrame, makeGRangesFromDataFrame, makeGRangesListFromDataFrame, makeGRangesListFromFeatureFragments, diff --git a/R/makeGPosFromDataFrame.R b/R/makeGPosFromDataFrame.R deleted file mode 100644 index ff79f5b3..00000000 --- a/R/makeGPosFromDataFrame.R +++ /dev/null @@ -1,37 +0,0 @@ -### ========================================================================= -### makeGPosFromDataFrame() -### ------------------------------------------------------------------------- - -### 'df' must be a data.frame or DataFrame object. 
-makeGPosFromDataFrame <- function(df, - keep.extra.columns=FALSE, - ignore.strand=FALSE, - seqinfo=NULL, - seqnames.field=c("seqnames", "seqname", - "chromosome", "chrom", - "chr", "chromosome_name", - "seqid"), - start.field=c("start", "pos"), - end.field=c("end", "stop", "pos"), - strand.field="strand", - starts.in.df.are.0based=FALSE) -{ - .makeXFromDataFrame(df = df, x = "GPos", - keep.extra.columns=keep.extra.columns, - ignore.strand=ignore.strand, - seqinfo=seqinfo, - seqnames.field=seqnames.field, - start.field=start.field, - end.field=end.field, - strand.field=strand.field, - starts.in.df.are.0based=starts.in.df.are.0based) -} - -setAs("data.frame", "GPos", - function(from) makeGPosFromDataFrame(from, keep.extra.columns=TRUE) -) - -setAs("DataFrame", "GPos", - function(from) makeGPosFromDataFrame(from, keep.extra.columns=TRUE) -) - diff --git a/R/makeGRangesFromDataFrame.R b/R/makeGRangesFromDataFrame.R index ec53fcf2..a0ab8acc 100644 --- a/R/makeGRangesFromDataFrame.R +++ b/R/makeGRangesFromDataFrame.R @@ -9,7 +9,6 @@ stop("'", what, ".field' must be a character vector with no NAs") tolower(field) } - .collect_prefixes <- function(df_colnames, field) { df_colnames_nc <- nchar(df_colnames) @@ -29,6 +28,10 @@ idx2 <- which(df_colnames %in% end.field) if (length(idx1) == 1L && length(idx2) == 1L) return(list(c(start=idx1, end=idx2), "")) + if (length(idx1) == 1L && length(idx2) == 0L) + return(list(c(start=idx1, end=idx1), "")) + if (length(idx1) == 0L && length(idx2) == 1L) + return(list(c(start=idx2, end=idx2), "")) if (length(idx1) == 0L && length(idx2) == 0L) { prefixes1 <- .collect_prefixes(df_colnames, start.field) prefixes2 <- .collect_prefixes(df_colnames, end.field) @@ -93,7 +96,7 @@ "chromosome", "chrom", "chr", "chromosome_name", "seqid"), - start.field="start", + start.field=c("start", "pos"), end.field=c("end", "stop"), strand.field="strand", ignore.strand=FALSE) @@ -152,34 +155,11 @@ makeGRangesFromDataFrame <- function(df, "chromosome", 
"chrom", "chr", "chromosome_name", "seqid"), - start.field="start", - end.field=c("end", "stop"), - strand.field="strand", - starts.in.df.are.0based=FALSE) -{ - .makeXFromDataFrame(df = df, x = "GRanges", - keep.extra.columns=keep.extra.columns, - ignore.strand=ignore.strand, - seqinfo=seqinfo, - seqnames.field=seqnames.field, - start.field=start.field, - end.field=end.field, - strand.field=strand.field, - starts.in.df.are.0based=starts.in.df.are.0based) -} - -.makeXFromDataFrame <- function(df, x = c("GRanges", "GPos"), - keep.extra.columns=FALSE, - ignore.strand=FALSE, - seqinfo=NULL, - seqnames.field=c("seqnames", "seqname", - "chromosome", "chrom", - "chr", "chromosome_name", - "seqid"), - start.field="start", + start.field=c("start", "pos"), end.field=c("end", "stop"), strand.field="strand", - starts.in.df.are.0based=FALSE) + starts.in.df.are.0based=FALSE, + as=c("auto", "GRanges", "GPos")) { ### 'df' must be a data.frame or DataFrame object. ## Check args. @@ -194,6 +174,7 @@ makeGRangesFromDataFrame <- function(df, ans_seqinfo <- normarg_seqinfo1(seqinfo) if (!isTRUEorFALSE(starts.in.df.are.0based)) stop("'starts.in.df.are.0based' must be TRUE or FALSE") + as <- match.arg(as) granges_cols <- .find_GRanges_cols(names(df), seqnames.field=seqnames.field, @@ -201,13 +182,18 @@ makeGRangesFromDataFrame <- function(df, end.field=end.field, strand.field=strand.field, ignore.strand=ignore.strand) - FUN <- switch(x, GRanges = GRanges, GPos = GPos) ## Prepare 'ans_seqnames'. ans_seqnames <- df[[granges_cols[["seqnames"]]]] ## Prepare 'ans_ranges'. 
ans_start <- .get_data_frame_col_as_numeric(df, granges_cols[["start"]]) ans_end <- .get_data_frame_col_as_numeric(df, granges_cols[["end"]]) + + ## Honor an explicit 'as'; only when as="auto" is the class guessed: + ## GPos if start and end coincide in every row, GRanges otherwise. + if (identical(as, "auto")) + as <- if (identical(ans_start, ans_end)) "GPos" else "GRanges" + if (starts.in.df.are.0based) ans_start <- ans_start + 1L ans_names <- rownames(df) @@ -253,6 +239,7 @@ makeGRangesFromDataFrame <- function(df, ans_seqinfo <- Seqinfo(seqlevels) } + FUN <- switch(as, GRanges = GRanges, GPos = GPos) ## Make and return the GRanges object. FUN(ans_seqnames, ans_ranges, strand=ans_strand, ans_mcols, seqinfo=ans_seqinfo) @@ -266,3 +253,10 @@ setAs("DataFrame", "GRanges", function(from) makeGRangesFromDataFrame(from, keep.extra.columns=TRUE) ) +setAs("data.frame", "GPos", function(from) + makeGRangesFromDataFrame(from, keep.extra.columns=TRUE, as="GPos") +) + +setAs("DataFrame", "GPos", function(from) + makeGRangesFromDataFrame(from, keep.extra.columns=TRUE, as="GPos") +) diff --git a/inst/unitTests/test_makeGRangesFromDataFrame.R b/inst/unitTests/test_makeGRangesFromDataFrame.R index 18830b52..3404db62 100644 --- a/inst/unitTests/test_makeGRangesFromDataFrame.R +++ b/inst/unitTests/test_makeGRangesFromDataFrame.R @@ -152,3 +152,18 @@ test_find_GRanges_cols <- function() checkIdentical(target, current) } + +test_makeGRangesFromDataFrame <- function() +{ + post <- data.frame(chr = rep(1, 3), pos = 11:13) + star <- data.frame(chr = rep(1, 3), start = 11:13) + endo <- data.frame(chr = rep(1, 3), end = 11:13) + checkTrue(validObject(makeGRangesFromDataFrame(post))) + checkTrue(validObject(makeGRangesFromDataFrame(star))) + checkTrue(validObject(makeGRangesFromDataFrame(endo))) + + target <- makeGRangesFromDataFrame(data.frame(seqnames=1:6, start=11:16)) + checkTrue(validObject(target)) + target <- makeGRangesFromDataFrame(data.frame(seqnames=1:6, end=11:16)) + checkTrue(validObject(target)) +} diff --git a/man/makeGPosFromDataFrame.Rd b/man/makeGPosFromDataFrame.Rd deleted file mode 100644 index 0f7961c0..00000000 ---
a/man/makeGPosFromDataFrame.Rd +++ /dev/null @@ -1,201 +0,0 @@ -\name{makeGPosFromDataFrame} - -\alias{makeGPosFromDataFrame} - -\alias{coerce,data.frame,GPos-method} -\alias{coerce,DataFrame,GPos-method} - -\title{Make a GPos object from a data.frame or DataFrame} - -\description{ - \code{makeGPosFromDataFrame} takes a data-frame-like object as - input and tries to automatically find the columns that describe - a genomic position. It returns them as a \link{GPos} object. - - \code{makeGPosFromDataFrame} is also the workhorse behind the - coercion method from data.frame (or \link[S4Vectors]{DataFrame}) to - \link{GPos}. -} - -\usage{ -makeGPosFromDataFrame(df, - keep.extra.columns=FALSE, - ignore.strand=FALSE, - seqinfo=NULL, - seqnames.field=c("seqnames", "seqname", - "chromosome", "chrom", - "chr", "chromosome_name", - "seqid"), - start.field=c("start", "pos"), - end.field=c("end", "stop", "pos"), - strand.field="strand", - starts.in.df.are.0based=FALSE) -} - -\arguments{ - \item{df}{ - A data.frame or \link[S4Vectors]{DataFrame} object. If not, then - the function first tries to turn \code{df} into a data frame with - \code{as.data.frame(df)}. - } - \item{keep.extra.columns}{ - \code{TRUE} or \code{FALSE} (the default). - If \code{TRUE}, the columns in \code{df} that are not used to form - the genomic ranges of the returned \link{GPos} object are then - returned as metadata columns on the object. Otherwise, they are ignored. - If \code{df} has a \code{width} column, then it's always ignored. - } - \item{ignore.strand}{ - \code{TRUE} or \code{FALSE} (the default). - If \code{TRUE}, then the strand of the returned \link{GPos} object - is set to \code{"*"}. - } - \item{seqinfo}{ - Either \code{NULL}, or a \link[GenomeInfoDb]{Seqinfo} object, - or a character vector of unique sequence names (a.k.a. \emph{seqlevels}), - or a named numeric vector of sequence lengths. 
- When not \code{NULL}, \code{seqinfo} must be compatible with the genomic - ranges in \code{df}, that is, it must have one entry for each unique - sequence name represented in \code{df}. Note that it can have additional - entries i.e. entries for seqlevels not represented in \code{df}. - } - \item{seqnames.field}{ - A character vector of recognized names for the column in \code{df} - that contains the chromosome name (a.k.a. sequence name) associated - with each genomic range. - Only the first name in \code{seqnames.field} that is found - in \code{colnames(df)} is used. - If no one is found, then an error is raised. - } - \item{start.field}{ - A character vector of recognized names for the column in \code{df} - that contains the start positions of the genomic ranges. - Only the first name in \code{start.field} that is found - in \code{colnames(df)} is used. - If no one is found, then an error is raised. - } - \item{end.field}{ - A character vector of recognized names for the column in \code{df} - that contains the end positions of the genomic ranges. - Only the first name in \code{start.field} that is found - in \code{colnames(df)} is used. - If no one is found, then an error is raised. - } - \item{strand.field}{ - A character vector of recognized names for the column in \code{df} - that contains the strand associated with each genomic range. - Only the first name in \code{strand.field} that is found - in \code{colnames(df)} is used. - If no one is found or if \code{ignore.strand} is \code{TRUE}, - then the strand of the returned \link{GPos} object is - set to \code{"*"}. - } - \item{starts.in.df.are.0based}{ - \code{TRUE} or \code{FALSE} (the default). - If \code{TRUE}, then the start positions of the genomic ranges in - \code{df} are considered to be \emph{0-based} and are converted to - \emph{1-based} in the returned \link{GPos} object. 
- This feature is intended to make it more convenient to handle input - that contains data obtained from resources using the "0-based - start" convention. A notorious example of such resource is the UCSC - Table Browser (\url{http://genome.ucsc.edu/cgi-bin/hgTables}). - } -} - -\value{ - A \link{GPos} object with one element per row in the input. - - If the \code{seqinfo} argument was supplied, the returned object will - have exactly the seqlevels specified in \code{seqinfo} and in the same - order. Otherwise, the seqlevels are ordered according to the output of - the \code{\link[GenomeInfoDb]{rankSeqlevels}} function (except if - \code{df} contains the seqnames in the form of a factor-Rle, in which - case the levels of the factor-Rle become the seqlevels of the returned - object and with no re-ordering). - - If \code{df} has non-automatic row names (i.e. \code{rownames(df)} is - not \code{NULL} and is not \code{seq_len(nrow(df))}), then they will be - used to set names on the returned \link{GPos} object. -} - -\note{ - Coercing data.frame or \link[S4Vectors]{DataFrame} \code{df} into - a \link{GPos} object (with \code{as(df, "GPos")}), or - calling \code{GPos(df)}, are both equivalent to calling - \code{makeGPosFromDataFrame(df, keep.extra.columns=TRUE)}. -} - -\author{ - H. Pagès, based on a proposal by Kasper Daniel Hansen -} - -\seealso{ - \itemize{ - \item \link{GPos} objects. - - \item \link[GenomeInfoDb]{Seqinfo} objects and the - \code{\link[GenomeInfoDb]{rankSeqlevels}} function in the - \pkg{GenomeInfoDb} package. - - \item The \code{\link[rtracklayer]{getTable}} function in the - \pkg{rtracklayer} package for an R interface to the UCSC - Table Browser. - - \item \link[S4Vectors]{DataFrame} objects in the \pkg{S4Vectors} package. 
- } -} - -\examples{ -## --------------------------------------------------------------------- -## BASIC EXAMPLES -## --------------------------------------------------------------------- - -df <- data.frame(chr="chr1", pos = 11:15, score=1:5) -makeGPosFromDataFrame(df) - -df <- data.frame(chr="chr1", start=11:15, end=11:15, - strand=c("+","-","+","*","."), score=1:5) -df -makeGPosFromDataFrame(df) # strand value "." is replaced with "*" - -## The strand column is optional: -df <- data.frame(chr="chr1", start=11:15, end=11:15, score=1:5) -makeGPosFromDataFrame(df) - -gr <- makeGPosFromDataFrame(df, keep.extra.columns=TRUE) -gr2 <- as(df, "GPos") # equivalent to the above -stopifnot(identical(gr, gr2)) -gr2 <- GPos(df) # equivalent to the above -stopifnot(identical(gr, gr2)) - -makeGPosFromDataFrame(df, ignore.strand=TRUE) -makeGPosFromDataFrame(df, keep.extra.columns=TRUE, - ignore.strand=TRUE) - -makeGPosFromDataFrame(df, seqinfo=paste0("chr", 4:1)) -makeGPosFromDataFrame(df, seqinfo=c(chrM=NA, chr1=500, chrX=100)) -makeGPosFromDataFrame(df, seqinfo=Seqinfo(paste0("chr", 4:1))) - -## --------------------------------------------------------------------- -## ABOUT AUTOMATIC DETECTION OF THE seqnames/start/end/strand COLUMNS -## --------------------------------------------------------------------- - -## Automatic detection of the seqnames/start/end/strand columns is -## case insensitive: -df <- data.frame(ChRoM="chr1", StarT=11:15, stoP=11:15, - STRAND=c("+","-","+","*","."), score=1:5) -makeGPosFromDataFrame(df) - -## It also ignores a common prefix between the start and end columns: -df <- data.frame(seqnames="chr1", tx_start=11:15, tx_end=11:15, - strand=c("+","-","+","*","."), score=1:5) -makeGPosFromDataFrame(df) - -## The common prefix between the start and end columns is used to -## disambiguate between more than one seqnames column: -df <- data.frame(chrom="chr1", tx_start=11:15, tx_end=11:15, - tx_chr="chr2", score=1:5) -makeGPosFromDataFrame(df) -} - 
-\keyword{manip} diff --git a/man/makeGRangesFromDataFrame.Rd b/man/makeGRangesFromDataFrame.Rd index e42e2b65..6c983671 100644 --- a/man/makeGRangesFromDataFrame.Rd +++ b/man/makeGRangesFromDataFrame.Rd @@ -3,7 +3,9 @@ \alias{makeGRangesFromDataFrame} \alias{coerce,data.frame,GRanges-method} +\alias{coerce,data.frame,GPos-method} \alias{coerce,DataFrame,GRanges-method} +\alias{coerce,DataFrame,GPos-method} \title{Make a GRanges object from a data.frame or DataFrame} @@ -26,10 +28,11 @@ makeGRangesFromDataFrame(df, "chromosome", "chrom", "chr", "chromosome_name", "seqid"), - start.field="start", + start.field=c("start", "pos"), end.field=c("end", "stop"), strand.field="strand", - starts.in.df.are.0based=FALSE) + starts.in.df.are.0based=FALSE, + as=c("auto", "GRanges", "GPos")) } \arguments{ @@ -100,6 +103,12 @@ makeGRangesFromDataFrame(df, start" convention. A notorious example of such resource is the UCSC Table Browser (\url{http://genome.ucsc.edu/cgi-bin/hgTables}). } + \item{as}{ + A single string naming the \link{GenomicRanges} derivative to + return: \code{"auto"} (the default), \code{"GRanges"}, or \code{"GPos"}. + With \code{"auto"}, a \link{GPos} object is returned if the start and end + of every range are identical, and a \link{GRanges} object otherwise.
+ } } \value{ @@ -177,6 +186,34 @@ makeGRangesFromDataFrame(df, seqinfo=paste0("chr", 4:1)) makeGRangesFromDataFrame(df, seqinfo=c(chrM=NA, chr1=500, chrX=100)) makeGRangesFromDataFrame(df, seqinfo=Seqinfo(paste0("chr", 4:1))) +## GPos objects are returned where appropriate +df <- data.frame(chr="chr1", pos = 11:15, score=1:5) +makeGRangesFromDataFrame(df) + +df <- data.frame(chr="chr1", start=11:15, end=11:15, + strand=c("+","-","+","*","."), score=1:5) +makeGRangesFromDataFrame(df) + +df <- data.frame(chr="chr1", start=11:15, score=1:5) +makeGRangesFromDataFrame(df) + +df <- data.frame(chr="chr1", end=11:15, score=1:5) +makeGRangesFromDataFrame(df) + +gr <- makeGRangesFromDataFrame(df, keep.extra.columns=TRUE) +gr2 <- as(df, "GPos") # equivalent to the above +stopifnot(identical(gr, gr2)) +gr2 <- GPos(df) # equivalent to the above +stopifnot(identical(gr, gr2)) + +makeGRangesFromDataFrame(df, ignore.strand=TRUE) +makeGRangesFromDataFrame(df, keep.extra.columns=TRUE, + ignore.strand=TRUE) + +makeGRangesFromDataFrame(df, seqinfo=paste0("chr", 4:1)) +makeGRangesFromDataFrame(df, seqinfo=c(chrM=NA, chr1=500, chrX=100)) +makeGRangesFromDataFrame(df, seqinfo=Seqinfo(paste0("chr", 4:1))) + ## --------------------------------------------------------------------- ## ABOUT AUTOMATIC DETECTION OF THE seqnames/start/end/strand COLUMNS ## ---------------------------------------------------------------------