add helper makeGPosFromDataFrame

LiNk-NY · LiNk-NY · commit 052f70f7b418 · 2020-02-03T18:12:55.000-05:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -53,6 +53,7 @@ Collate: normarg-utils.R
 	GenomicRangesList-class.R
 	GRangesList-class.R
 	makeGRangesFromDataFrame.R
+    makeGPosFromDataFrame.R
 	makeGRangesListFromDataFrame.R
 	RangedData-methods.R
 	findOverlaps-methods.R
diff --git a/NAMESPACE b/NAMESPACE
@@ -134,6 +134,7 @@ export(
     GNCList,
     GenomicRangesList, GRangesList,
 
+    makeGPosFromDataFrame,
     makeGRangesFromDataFrame,
     makeGRangesListFromDataFrame,
     makeGRangesListFromFeatureFragments,
diff --git a/R/GPos-class.R b/R/GPos-class.R
@@ -176,7 +176,7 @@ GPos <- function(seqnames=NULL, pos=NULL, strand=NULL,
             if (is(x_ranges, "IRanges"))  # i.e. 'x' is not a GPos
                 strand <- rep.int(strand, width(x_ranges))
         }
-        if (length(mcols) == 0L && is(x, "GPos"))
+        if (length(mcols) == 0L && inherits(x, "GenomicRanges"))
             mcols <- mcols(x, use.names=FALSE)
         if (is.null(seqinfo))
             seqinfo <- seqinfo(x)
diff --git a/R/makeGPosFromDataFrame.R b/R/makeGPosFromDataFrame.R
@@ -0,0 +1,37 @@
+### =========================================================================
+### makeGPosFromDataFrame()
+### -------------------------------------------------------------------------
+
+### Make a GPos object from a data.frame or DataFrame.
+###
+### 'df' must be a data.frame or DataFrame object.
+###
+### This is a thin wrapper: every argument is forwarded unchanged to the
+### internal .makeXFromDataFrame() helper (defined in
+### makeGRangesFromDataFrame.R), with x="GPos" so that the helper builds
+### the result with the GPos() constructor rather than GRanges().
+###
+### Note that "pos" is listed in the defaults of both 'start.field' and
+### 'end.field' -- presumably so that a single "pos" column can describe
+### each position; how the helper resolves that column is not visible
+### from this file.
+makeGPosFromDataFrame <- function(df,
+                                  keep.extra.columns=FALSE,
+                                  ignore.strand=FALSE,
+                                  seqinfo=NULL,
+                                  seqnames.field=c("seqnames", "seqname",
+                                                   "chromosome", "chrom",
+                                                   "chr", "chromosome_name",
+                                                   "seqid"),
+                                  start.field=c("start", "pos"),
+                                  end.field=c("end", "stop", "pos"),
+                                  strand.field="strand",
+                                  starts.in.df.are.0based=FALSE)
+{
+    ## Delegate to the shared implementation; x="GPos" selects the
+    ## constructor used for the returned object.
+    .makeXFromDataFrame(df = df, x = "GPos",
+        keep.extra.columns=keep.extra.columns,
+        ignore.strand=ignore.strand,
+        seqinfo=seqinfo,
+        seqnames.field=seqnames.field,
+        start.field=start.field,
+        end.field=end.field,
+        strand.field=strand.field,
+        starts.in.df.are.0based=starts.in.df.are.0based)
+}
+
+### Coercion methods to GPos, i.e. as(df, "GPos").
+### Both delegate to makeGPosFromDataFrame() with keep.extra.columns=TRUE
+### (the constructor's own default is FALSE).
+
+### data.frame -> GPos
+setAs("data.frame", "GPos",
+    function(from) makeGPosFromDataFrame(from, keep.extra.columns=TRUE)
+)
+
+### DataFrame -> GPos
+setAs("DataFrame", "GPos",
+    function(from) makeGPosFromDataFrame(from, keep.extra.columns=TRUE)
+)
+
diff --git a/R/makeGRangesFromDataFrame.R b/R/makeGRangesFromDataFrame.R
@@ -74,7 +74,7 @@
 .find_strand_col <- function(df_colnames, strand.field, prefix)
 {
     idx <- which(df_colnames %in% paste0(prefix, strand.field))
-    if (length(idx) == 0L) 
+    if (length(idx) == 0L)
         idx <- which(df_colnames %in% strand.field)
     if (length(idx) == 0L)
         return(NA_integer_)
@@ -144,7 +144,6 @@
     ans
 }
 
-### 'df' must be a data.frame or DataFrame object.
 makeGRangesFromDataFrame <- function(df,
                                      keep.extra.columns=FALSE,
                                      ignore.strand=FALSE,
@@ -158,7 +157,32 @@ makeGRangesFromDataFrame <- function(df,
                                      strand.field="strand",
                                      starts.in.df.are.0based=FALSE)
 {
-    ## Check args.
+    .makeXFromDataFrame(df = df, x = "GRanges",
+        keep.extra.columns=keep.extra.columns,
+        ignore.strand=ignore.strand,
+        seqinfo=seqinfo,
+        seqnames.field=seqnames.field,
+        start.field=start.field,
+        end.field=end.field,
+        strand.field=strand.field,
+        starts.in.df.are.0based=starts.in.df.are.0based)
+}
+
+.makeXFromDataFrame <- function(df, x = c("GRanges", "GPos"),
+                                     keep.extra.columns=FALSE,
+                                     ignore.strand=FALSE,
+                                     seqinfo=NULL,
+                                     seqnames.field=c("seqnames", "seqname",
+                                                      "chromosome", "chrom",
+                                                      "chr", "chromosome_name",
+                                                      "seqid"),
+                                     start.field="start",
+                                     end.field=c("end", "stop"),
+                                     strand.field="strand",
+                                     starts.in.df.are.0based=FALSE)
+{
+    ## 'df' must be a data.frame or DataFrame object.
+    ## Check args.
     if (is.character(df))  # for people that provide the path to a file
         stop("'df' must be a data.frame or DataFrame object")
     if (!(is.data.frame(df) || is(df, "DataFrame")))
@@ -177,7 +201,7 @@ makeGRangesFromDataFrame <- function(df,
                                        end.field=end.field,
                                        strand.field=strand.field,
                                        ignore.strand=ignore.strand)
-
+    FUN <- switch(x, GRanges = GRanges, GPos = GPos)
     ## Prepare 'ans_seqnames'.
     ans_seqnames <- df[[granges_cols[["seqnames"]]]]
 
@@ -230,7 +254,7 @@ makeGRangesFromDataFrame <- function(df,
     }
 
     ## Make and return the GRanges object.
-    GRanges(ans_seqnames, ans_ranges, strand=ans_strand,
+    FUN(ans_seqnames, ans_ranges, strand=ans_strand,
             ans_mcols, seqinfo=ans_seqinfo)
 }
 
diff --git a/man/GenomicRangesList-class.Rd b/man/GenomicRangesList-class.Rd
@@ -89,7 +89,7 @@
   }
   Note that the \emph{Vector class hierarchy} has many more classes.
   In particular \link[S4Vectors]{Vector}, \link[S4Vectors]{List},
-  \link[IRanges]{RangesList}, and \link[IRanges]{IntegerRangesList}
+  \link[IRanges]{IRangesList}, and \link[IRanges]{IntegerRangesList}
   have other subclasses not shown here.
 }
 
diff --git a/man/makeGPosFromDataFrame.Rd b/man/makeGPosFromDataFrame.Rd
@@ -0,0 +1,201 @@
+\name{makeGPosFromDataFrame}
+
+\alias{makeGPosFromDataFrame}
+
+\alias{coerce,data.frame,GPos-method}
+\alias{coerce,DataFrame,GPos-method}
+
+\title{Make a GPos object from a data.frame or DataFrame}
+
+\description{
+  \code{makeGPosFromDataFrame} takes a data-frame-like object as
+  input and tries to automatically find the columns that describe
+  a genomic position. It returns them as a \link{GPos} object.
+
+  \code{makeGPosFromDataFrame} is also the workhorse behind the
+  coercion method from data.frame (or \link[S4Vectors]{DataFrame}) to
+  \link{GPos}.
+}
+
+\usage{
+makeGPosFromDataFrame(df,
+                      keep.extra.columns=FALSE,
+                      ignore.strand=FALSE,
+                      seqinfo=NULL,
+                      seqnames.field=c("seqnames", "seqname",
+                                       "chromosome", "chrom",
+                                       "chr", "chromosome_name",
+                                       "seqid"),
+                      start.field=c("start", "pos"),
+                      end.field=c("end", "stop", "pos"),
+                      strand.field="strand",
+                      starts.in.df.are.0based=FALSE)
+}
+
+\arguments{
+  \item{df}{
+    A data.frame or \link[S4Vectors]{DataFrame} object. If not, then
+    the function first tries to turn \code{df} into a data frame with
+    \code{as.data.frame(df)}.
+  }
+  \item{keep.extra.columns}{
+    \code{TRUE} or \code{FALSE} (the default).
+    If \code{TRUE}, the columns in \code{df} that are not used to form
+    the genomic ranges of the returned \link{GPos} object are then
+    returned as metadata columns on the object. Otherwise, they are ignored.
+    If \code{df} has a \code{width} column, then it's always ignored.
+  }
+  \item{ignore.strand}{
+    \code{TRUE} or \code{FALSE} (the default).
+    If \code{TRUE}, then the strand of the returned \link{GPos} object
+    is set to \code{"*"}.
+  }
+  \item{seqinfo}{
+    Either \code{NULL}, or a \link[GenomeInfoDb]{Seqinfo} object,
+    or a character vector of unique sequence names (a.k.a. \emph{seqlevels}),
+    or a named numeric vector of sequence lengths.
+    When not \code{NULL}, \code{seqinfo} must be compatible with the genomic
+    ranges in \code{df}, that is, it must have one entry for each unique
+    sequence name represented in \code{df}. Note that it can have additional
+    entries i.e. entries for seqlevels not represented in \code{df}.
+  }
+  \item{seqnames.field}{
+    A character vector of recognized names for the column in \code{df}
+    that contains the chromosome name (a.k.a. sequence name) associated
+    with each genomic range.
+    Only the first name in \code{seqnames.field} that is found
+    in \code{colnames(df)} is used.
+    If none is found, then an error is raised.
+  }
+  \item{start.field}{
+    A character vector of recognized names for the column in \code{df}
+    that contains the start positions of the genomic ranges.
+    Only the first name in \code{start.field} that is found
+    in \code{colnames(df)} is used.
+    If none is found, then an error is raised.
+  }
+  \item{end.field}{
+    A character vector of recognized names for the column in \code{df}
+    that contains the end positions of the genomic ranges.
+    Only the first name in \code{end.field} that is found
+    in \code{colnames(df)} is used.
+    If none is found, then an error is raised.
+  }
+  \item{strand.field}{
+    A character vector of recognized names for the column in \code{df}
+    that contains the strand associated with each genomic range.
+    Only the first name in \code{strand.field} that is found
+    in \code{colnames(df)} is used.
+    If none is found or if \code{ignore.strand} is \code{TRUE},
+    then the strand of the returned \link{GPos} object is
+    set to \code{"*"}.
+  }
+  \item{starts.in.df.are.0based}{
+    \code{TRUE} or \code{FALSE} (the default).
+    If \code{TRUE}, then the start positions of the genomic ranges in
+    \code{df} are considered to be \emph{0-based} and are converted to
+    \emph{1-based} in the returned \link{GPos} object.
+    This feature is intended to make it more convenient to handle input
+    that contains data obtained from resources using the "0-based
+    start" convention. A notorious example of such a resource is the UCSC
+    Table Browser (\url{http://genome.ucsc.edu/cgi-bin/hgTables}).
+  }
+}
+
+\value{
+  A \link{GPos} object with one element per row in the input.
+
+  If the \code{seqinfo} argument was supplied, the returned object will
+  have exactly the seqlevels specified in \code{seqinfo} and in the same
+  order. Otherwise, the seqlevels are ordered according to the output of
+  the \code{\link[GenomeInfoDb]{rankSeqlevels}} function (except if
+  \code{df} contains the seqnames in the form of a factor-Rle, in which
+  case the levels of the factor-Rle become the seqlevels of the returned
+  object and with no re-ordering).
+
+  If \code{df} has non-automatic row names (i.e. \code{rownames(df)} is
+  not \code{NULL} and is not \code{seq_len(nrow(df))}), then they will be
+  used to set names on the returned \link{GPos} object.
+}
+
+\note{
+  Coercing data.frame or \link[S4Vectors]{DataFrame} \code{df} into
+  a \link{GPos} object (with \code{as(df, "GPos")}), or
+  calling \code{GPos(df)}, are both equivalent to calling
+  \code{makeGPosFromDataFrame(df, keep.extra.columns=TRUE)}.
+}
+
+\author{
+  H. Pagès, based on a proposal by Kasper Daniel Hansen
+}
+
+\seealso{
+  \itemize{
+    \item \link{GPos} objects.
+
+    \item \link[GenomeInfoDb]{Seqinfo} objects and the
+          \code{\link[GenomeInfoDb]{rankSeqlevels}} function in the
+          \pkg{GenomeInfoDb} package.
+
+    \item The \code{\link[rtracklayer]{getTable}} function in the
+          \pkg{rtracklayer} package for an R interface to the UCSC
+          Table Browser.
+
+    \item \link[S4Vectors]{DataFrame} objects in the \pkg{S4Vectors} package.
+  }
+}
+
+\examples{
+## ---------------------------------------------------------------------
+## BASIC EXAMPLES
+## ---------------------------------------------------------------------
+
+df <- data.frame(chr="chr1", pos = 11:15, score=1:5)
+makeGPosFromDataFrame(df)
+
+df <- data.frame(chr="chr1", start=11:15, end=11:15,
+    strand=c("+","-","+","*","."), score=1:5)
+df
+makeGPosFromDataFrame(df)  # strand value "." is replaced with "*"
+
+## The strand column is optional:
+df <- data.frame(chr="chr1", start=11:15, end=11:15, score=1:5)
+makeGPosFromDataFrame(df)
+
+gr <- makeGPosFromDataFrame(df, keep.extra.columns=TRUE)
+gr2 <- as(df, "GPos")  # equivalent to the above
+stopifnot(identical(gr, gr2))
+gr2 <- GPos(df)        # equivalent to the above
+stopifnot(identical(gr, gr2))
+
+makeGPosFromDataFrame(df, ignore.strand=TRUE)
+makeGPosFromDataFrame(df, keep.extra.columns=TRUE,
+                             ignore.strand=TRUE)
+
+makeGPosFromDataFrame(df, seqinfo=paste0("chr", 4:1))
+makeGPosFromDataFrame(df, seqinfo=c(chrM=NA, chr1=500, chrX=100))
+makeGPosFromDataFrame(df, seqinfo=Seqinfo(paste0("chr", 4:1)))
+
+## ---------------------------------------------------------------------
+## ABOUT AUTOMATIC DETECTION OF THE seqnames/start/end/strand COLUMNS
+## ---------------------------------------------------------------------
+
+## Automatic detection of the seqnames/start/end/strand columns is
+## case insensitive:
+df <- data.frame(ChRoM="chr1", StarT=11:15, stoP=11:15,
+                 STRAND=c("+","-","+","*","."), score=1:5)
+makeGPosFromDataFrame(df)
+
+## It also ignores a common prefix between the start and end columns:
+df <- data.frame(seqnames="chr1", tx_start=11:15, tx_end=11:15,
+                 strand=c("+","-","+","*","."), score=1:5)
+makeGPosFromDataFrame(df)
+
+## The common prefix between the start and end columns is used to
+## disambiguate between more than one seqnames column:
+df <- data.frame(chrom="chr1", tx_start=11:15, tx_end=11:15,
+                 tx_chr="chr2", score=1:5)
+makeGPosFromDataFrame(df)
+}
+
+\keyword{manip}

Original file line number	Diff line number	Diff line change
`@@ -176,7 +176,7 @@ GPos <- function(seqnames=NULL, pos=NULL, strand=NULL,`
`176`	`176`	`if (is(x_ranges, "IRanges")) # i.e. 'x' is not a GPos`
`177`	`177`	`strand <- rep.int(strand, width(x_ranges))`
`178`	`178`	`}`
`179`		`- if (length(mcols) == 0L && is(x, "GPos"))`
	`179`	`+ if (length(mcols) == 0L && inherits(x, "GenomicRanges"))`
`180`	`180`	`mcols <- mcols(x, use.names=FALSE)`
`181`	`181`	`if (is.null(seqinfo))`
`182`	`182`	`seqinfo <- seqinfo(x)`
Original file line number	Diff line number	Diff line change
`@@ -89,7 +89,7 @@`
`89`	`89`	`}`
`90`	`90`	`Note that the \emph{Vector class hierarchy} has many more classes.`
`91`	`91`	`In particular \link[S4Vectors]{Vector}, \link[S4Vectors]{List},`
`92`		`- \link[IRanges]{RangesList}, and \link[IRanges]{IntegerRangesList}`
	`92`	`+ \link[IRanges]{IRangesList}, and \link[IRanges]{IntegerRangesList}`
`93`	`93`	`have other subclasses not shown here.`
`94`	`94`	`}`
`95`	`95`