Skip to content

Commit 052f70f

Browse files
committed
add helper makeGPosFromDataFrame
1 parent b8a3cd5 commit 052f70f

File tree

7 files changed

+271
-7
lines changed

7 files changed

+271
-7
lines changed

DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Collate: normarg-utils.R
5353
GenomicRangesList-class.R
5454
GRangesList-class.R
5555
makeGRangesFromDataFrame.R
56+
makeGPosFromDataFrame.R
5657
makeGRangesListFromDataFrame.R
5758
RangedData-methods.R
5859
findOverlaps-methods.R

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ export(
134134
GNCList,
135135
GenomicRangesList, GRangesList,
136136

137+
makeGPosFromDataFrame,
137138
makeGRangesFromDataFrame,
138139
makeGRangesListFromDataFrame,
139140
makeGRangesListFromFeatureFragments,

R/GPos-class.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ GPos <- function(seqnames=NULL, pos=NULL, strand=NULL,
176176
if (is(x_ranges, "IRanges")) # i.e. 'x' is not a GPos
177177
strand <- rep.int(strand, width(x_ranges))
178178
}
179-
if (length(mcols) == 0L && is(x, "GPos"))
179+
if (length(mcols) == 0L && inherits(x, "GenomicRanges"))
180180
mcols <- mcols(x, use.names=FALSE)
181181
if (is.null(seqinfo))
182182
seqinfo <- seqinfo(x)

R/makeGPosFromDataFrame.R

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
### =========================================================================
2+
### makeGPosFromDataFrame()
3+
### -------------------------------------------------------------------------
4+
5+
### 'df' must be a data.frame or DataFrame object.
6+
makeGPosFromDataFrame <- function(df,
7+
keep.extra.columns=FALSE,
8+
ignore.strand=FALSE,
9+
seqinfo=NULL,
10+
seqnames.field=c("seqnames", "seqname",
11+
"chromosome", "chrom",
12+
"chr", "chromosome_name",
13+
"seqid"),
14+
start.field=c("start", "pos"),
15+
end.field=c("end", "stop", "pos"),
16+
strand.field="strand",
17+
starts.in.df.are.0based=FALSE)
18+
{
19+
.makeXFromDataFrame(df = df, x = "GPos",
20+
keep.extra.columns=keep.extra.columns,
21+
ignore.strand=ignore.strand,
22+
seqinfo=seqinfo,
23+
seqnames.field=seqnames.field,
24+
start.field=start.field,
25+
end.field=end.field,
26+
strand.field=strand.field,
27+
starts.in.df.are.0based=starts.in.df.are.0based)
28+
}
29+
30+
setAs("data.frame", "GPos",
31+
function(from) makeGPosFromDataFrame(from, keep.extra.columns=TRUE)
32+
)
33+
34+
setAs("DataFrame", "GPos",
35+
function(from) makeGPosFromDataFrame(from, keep.extra.columns=TRUE)
36+
)
37+

R/makeGRangesFromDataFrame.R

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474
.find_strand_col <- function(df_colnames, strand.field, prefix)
7575
{
7676
idx <- which(df_colnames %in% paste0(prefix, strand.field))
77-
if (length(idx) == 0L)
77+
if (length(idx) == 0L)
7878
idx <- which(df_colnames %in% strand.field)
7979
if (length(idx) == 0L)
8080
return(NA_integer_)
@@ -144,7 +144,6 @@
144144
ans
145145
}
146146

147-
### 'df' must be a data.frame or DataFrame object.
148147
makeGRangesFromDataFrame <- function(df,
149148
keep.extra.columns=FALSE,
150149
ignore.strand=FALSE,
@@ -158,7 +157,32 @@ makeGRangesFromDataFrame <- function(df,
158157
strand.field="strand",
159158
starts.in.df.are.0based=FALSE)
160159
{
161-
## Check args.
160+
.makeXFromDataFrame(df = df, x = "GRanges",
161+
keep.extra.columns=keep.extra.columns,
162+
ignore.strand=ignore.strand,
163+
seqinfo=seqinfo,
164+
seqnames.field=seqnames.field,
165+
start.field=start.field,
166+
end.field=end.field,
167+
strand.field=strand.field,
168+
starts.in.df.are.0based=starts.in.df.are.0based)
169+
}
170+
171+
.makeXFromDataFrame <- function(df, x = c("GRanges", "GPos"),
172+
keep.extra.columns=FALSE,
173+
ignore.strand=FALSE,
174+
seqinfo=NULL,
175+
seqnames.field=c("seqnames", "seqname",
176+
"chromosome", "chrom",
177+
"chr", "chromosome_name",
178+
"seqid"),
179+
start.field="start",
180+
end.field=c("end", "stop"),
181+
strand.field="strand",
182+
starts.in.df.are.0based=FALSE)
183+
{
184+
### 'df' must be a data.frame or DataFrame object.
185+
## Check args.
162186
if (is.character(df)) # for people that provide the path to a file
163187
stop("'df' must be a data.frame or DataFrame object")
164188
if (!(is.data.frame(df) || is(df, "DataFrame")))
@@ -177,7 +201,7 @@ makeGRangesFromDataFrame <- function(df,
177201
end.field=end.field,
178202
strand.field=strand.field,
179203
ignore.strand=ignore.strand)
180-
204+
FUN <- switch(x, GRanges = GRanges, GPos = GPos)
181205
## Prepare 'ans_seqnames'.
182206
ans_seqnames <- df[[granges_cols[["seqnames"]]]]
183207

@@ -230,7 +254,7 @@ makeGRangesFromDataFrame <- function(df,
230254
}
231255

232256
## Make and return the GRanges object.
233-
GRanges(ans_seqnames, ans_ranges, strand=ans_strand,
257+
FUN(ans_seqnames, ans_ranges, strand=ans_strand,
234258
ans_mcols, seqinfo=ans_seqinfo)
235259
}
236260

man/GenomicRangesList-class.Rd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
}
9090
Note that the \emph{Vector class hierarchy} has many more classes.
9191
In particular \link[S4Vectors]{Vector}, \link[S4Vectors]{List},
92-
\link[IRanges]{RangesList}, and \link[IRanges]{IntegerRangesList}
92+
\link[IRanges]{IRangesList}, and \link[IRanges]{IntegerRangesList}
9393
have other subclasses not shown here.
9494
}
9595

man/makeGPosFromDataFrame.Rd

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
\name{makeGPosFromDataFrame}
2+
3+
\alias{makeGPosFromDataFrame}
4+
5+
\alias{coerce,data.frame,GPos-method}
6+
\alias{coerce,DataFrame,GPos-method}
7+
8+
\title{Make a GPos object from a data.frame or DataFrame}
9+
10+
\description{
11+
\code{makeGPosFromDataFrame} takes a data-frame-like object as
12+
input and tries to automatically find the columns that describe
13+
a genomic position. It returns them as a \link{GPos} object.
14+
15+
\code{makeGPosFromDataFrame} is also the workhorse behind the
16+
coercion method from data.frame (or \link[S4Vectors]{DataFrame}) to
17+
\link{GPos}.
18+
}
19+
20+
\usage{
21+
makeGPosFromDataFrame(df,
22+
keep.extra.columns=FALSE,
23+
ignore.strand=FALSE,
24+
seqinfo=NULL,
25+
seqnames.field=c("seqnames", "seqname",
26+
"chromosome", "chrom",
27+
"chr", "chromosome_name",
28+
"seqid"),
29+
start.field=c("start", "pos"),
30+
end.field=c("end", "stop", "pos"),
31+
strand.field="strand",
32+
starts.in.df.are.0based=FALSE)
33+
}
34+
35+
\arguments{
36+
\item{df}{
37+
A data.frame or \link[S4Vectors]{DataFrame} object. If not, then
38+
the function first tries to turn \code{df} into a data frame with
39+
\code{as.data.frame(df)}.
40+
}
41+
\item{keep.extra.columns}{
42+
\code{TRUE} or \code{FALSE} (the default).
43+
If \code{TRUE}, the columns in \code{df} that are not used to form
44+
the genomic positions of the returned \link{GPos} object are then
45+
returned as metadata columns on the object. Otherwise, they are ignored.
46+
If \code{df} has a \code{width} column, then it's always ignored.
47+
}
48+
\item{ignore.strand}{
49+
\code{TRUE} or \code{FALSE} (the default).
50+
If \code{TRUE}, then the strand of the returned \link{GPos} object
51+
is set to \code{"*"}.
52+
}
53+
\item{seqinfo}{
54+
Either \code{NULL}, or a \link[GenomeInfoDb]{Seqinfo} object,
55+
or a character vector of unique sequence names (a.k.a. \emph{seqlevels}),
56+
or a named numeric vector of sequence lengths.
57+
When not \code{NULL}, \code{seqinfo} must be compatible with the genomic
58+
ranges in \code{df}, that is, it must have one entry for each unique
59+
sequence name represented in \code{df}. Note that it can have additional
60+
entries i.e. entries for seqlevels not represented in \code{df}.
61+
}
62+
\item{seqnames.field}{
63+
A character vector of recognized names for the column in \code{df}
64+
that contains the chromosome name (a.k.a. sequence name) associated
65+
with each genomic range.
66+
Only the first name in \code{seqnames.field} that is found
67+
in \code{colnames(df)} is used.
68+
If none is found, then an error is raised.
69+
}
70+
\item{start.field}{
71+
A character vector of recognized names for the column in \code{df}
72+
that contains the start positions of the genomic ranges.
73+
Only the first name in \code{start.field} that is found
74+
in \code{colnames(df)} is used.
75+
If none is found, then an error is raised.
76+
}
77+
\item{end.field}{
78+
A character vector of recognized names for the column in \code{df}
79+
that contains the end positions of the genomic ranges.
80+
Only the first name in \code{end.field} that is found
81+
in \code{colnames(df)} is used.
82+
If none is found, then an error is raised.
83+
}
84+
\item{strand.field}{
85+
A character vector of recognized names for the column in \code{df}
86+
that contains the strand associated with each genomic range.
87+
Only the first name in \code{strand.field} that is found
88+
in \code{colnames(df)} is used.
89+
If none is found or if \code{ignore.strand} is \code{TRUE},
90+
then the strand of the returned \link{GPos} object is
91+
set to \code{"*"}.
92+
}
93+
\item{starts.in.df.are.0based}{
94+
\code{TRUE} or \code{FALSE} (the default).
95+
If \code{TRUE}, then the start positions of the genomic ranges in
96+
\code{df} are considered to be \emph{0-based} and are converted to
97+
\emph{1-based} in the returned \link{GPos} object.
98+
This feature is intended to make it more convenient to handle input
99+
that contains data obtained from resources using the "0-based
100+
start" convention. A notorious example of such a resource is the UCSC
101+
Table Browser (\url{http://genome.ucsc.edu/cgi-bin/hgTables}).
102+
}
103+
}
104+
105+
\value{
106+
A \link{GPos} object with one element per row in the input.
107+
108+
If the \code{seqinfo} argument was supplied, the returned object will
109+
have exactly the seqlevels specified in \code{seqinfo} and in the same
110+
order. Otherwise, the seqlevels are ordered according to the output of
111+
the \code{\link[GenomeInfoDb]{rankSeqlevels}} function (except if
112+
\code{df} contains the seqnames in the form of a factor-Rle, in which
113+
case the levels of the factor-Rle become the seqlevels of the returned
114+
object and with no re-ordering).
115+
116+
If \code{df} has non-automatic row names (i.e. \code{rownames(df)} is
117+
not \code{NULL} and is not \code{seq_len(nrow(df))}), then they will be
118+
used to set names on the returned \link{GPos} object.
119+
}
120+
121+
\note{
122+
Coercing data.frame or \link[S4Vectors]{DataFrame} \code{df} into
123+
a \link{GPos} object (with \code{as(df, "GPos")}), or
124+
calling \code{GPos(df)}, are both equivalent to calling
125+
\code{makeGPosFromDataFrame(df, keep.extra.columns=TRUE)}.
126+
}
127+
128+
\author{
129+
H. Pagès, based on a proposal by Kasper Daniel Hansen
130+
}
131+
132+
\seealso{
133+
\itemize{
134+
\item \link{GPos} objects.
135+
136+
\item \link[GenomeInfoDb]{Seqinfo} objects and the
137+
\code{\link[GenomeInfoDb]{rankSeqlevels}} function in the
138+
\pkg{GenomeInfoDb} package.
139+
140+
\item The \code{\link[rtracklayer]{getTable}} function in the
141+
\pkg{rtracklayer} package for an R interface to the UCSC
142+
Table Browser.
143+
144+
\item \link[S4Vectors]{DataFrame} objects in the \pkg{S4Vectors} package.
145+
}
146+
}
147+
148+
\examples{
149+
## ---------------------------------------------------------------------
150+
## BASIC EXAMPLES
151+
## ---------------------------------------------------------------------
152+
153+
df <- data.frame(chr="chr1", pos = 11:15, score=1:5)
154+
makeGPosFromDataFrame(df)
155+
156+
df <- data.frame(chr="chr1", start=11:15, end=11:15,
157+
strand=c("+","-","+","*","."), score=1:5)
158+
df
159+
makeGPosFromDataFrame(df) # strand value "." is replaced with "*"
160+
161+
## The strand column is optional:
162+
df <- data.frame(chr="chr1", start=11:15, end=11:15, score=1:5)
163+
makeGPosFromDataFrame(df)
164+
165+
gr <- makeGPosFromDataFrame(df, keep.extra.columns=TRUE)
166+
gr2 <- as(df, "GPos") # equivalent to the above
167+
stopifnot(identical(gr, gr2))
168+
gr2 <- GPos(df) # equivalent to the above
169+
stopifnot(identical(gr, gr2))
170+
171+
makeGPosFromDataFrame(df, ignore.strand=TRUE)
172+
makeGPosFromDataFrame(df, keep.extra.columns=TRUE,
173+
ignore.strand=TRUE)
174+
175+
makeGPosFromDataFrame(df, seqinfo=paste0("chr", 4:1))
176+
makeGPosFromDataFrame(df, seqinfo=c(chrM=NA, chr1=500, chrX=100))
177+
makeGPosFromDataFrame(df, seqinfo=Seqinfo(paste0("chr", 4:1)))
178+
179+
## ---------------------------------------------------------------------
180+
## ABOUT AUTOMATIC DETECTION OF THE seqnames/start/end/strand COLUMNS
181+
## ---------------------------------------------------------------------
182+
183+
## Automatic detection of the seqnames/start/end/strand columns is
184+
## case insensitive:
185+
df <- data.frame(ChRoM="chr1", StarT=11:15, stoP=11:15,
186+
STRAND=c("+","-","+","*","."), score=1:5)
187+
makeGPosFromDataFrame(df)
188+
189+
## It also ignores a common prefix between the start and end columns:
190+
df <- data.frame(seqnames="chr1", tx_start=11:15, tx_end=11:15,
191+
strand=c("+","-","+","*","."), score=1:5)
192+
makeGPosFromDataFrame(df)
193+
194+
## The common prefix between the start and end columns is used to
195+
## disambiguate between more than one seqnames column:
196+
df <- data.frame(chrom="chr1", tx_start=11:15, tx_end=11:15,
197+
tx_chr="chr2", score=1:5)
198+
makeGPosFromDataFrame(df)
199+
}
200+
201+
\keyword{manip}

0 commit comments

Comments
 (0)