Skip to content

Commit 295f057

Browse files
authored
Allow compressed data in R data files (#308)
Fixes #225 Fixes #32 xref JuliaStats/RDatasets.jl#117
1 parent 3db3315 commit 295f057

File tree

3 files changed

+42
-5
lines changed

3 files changed

+42
-5
lines changed

src/registry.jl

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,45 @@ add_format(format"GZIP", [0x1f, 0x8b], ".gz", [:Libz => UUID("2ec943e9-cfe8-584d
2222
add_format(format"BSON",(),".bson", [:BSON => UUID("fbb218c0-5317-5bc6-957e-2ee96dd4b1f0")])
2323
add_format(format"JLSO", (), ".jlso", [:JLSO => UUID("9da8a3cd-07a3-59c0-a743-3fdc52c30d11")])
2424

25+
"""
    detect_compressed(io, len=getlength(io); formats=["GZIP", "BZIP2", "XZ", "LZ4"])

Return `true` if the stream `io` begins with the magic bytes of one of the
compression formats listed in `formats`, and `false` otherwise.

# Arguments
- `io`: a seekable, readable stream. It is rewound with `seekstart` before any
  byte is read. It is *not* rewound afterwards: on return the position is just
  past the last byte that was inspected.
- `len`: number of bytes that may be read from `io`. Before each additional
  byte is read, `len` is checked so that the function returns `false` instead
  of reading past that limit. Defaults to `getlength(io)` (presumably the
  total stream length; that helper is defined outside this block).

# Keywords
- `formats`: collection of format names to test for. Recognised names are
  `"GZIP"`, `"BZIP2"`, `"LZ4"` and `"XZ"`; other entries are ignored.

# Signatures checked
- GZIP:  `1f 8b`
- BZIP2: `42 5a 68` (ASCII `"BZh"`)
- LZ4:   `04 22 4d 18`
- XZ:    `fd 37 7a 58 5a 00`
"""
function detect_compressed(io, len=getlength(io); formats=["GZIP", "BZIP2", "XZ", "LZ4"])
    seekstart(io)

    # Signatures are tested from shortest to longest so that only as many
    # bytes as are needed get read from the stream.
    len < 2 && return false
    b1 = read(io, UInt8)
    b2 = read(io, UInt8)
    if "GZIP" in formats
        b1 == 0x1f && b2 == 0x8b && return true
    end

    len < 3 && return false
    b3 = read(io, UInt8)
    if "BZIP2" in formats
        # The bzip2 signature is "BZh": the third byte is 'h' == 0x68.
        # (Decimal 68 would be 'D' and never match a real bzip2 stream.)
        b1 == 0x42 && b2 == 0x5A && b3 == 0x68 && return true
    end

    len < 4 && return false
    b4 = read(io, UInt8)
    if "LZ4" in formats
        b1 == 0x04 && b2 == 0x22 && b3 == 0x4D && b4 == 0x18 && return true
    end

    len < 5 && return false
    b5 = read(io, UInt8)
    len < 6 && return false
    b6 = read(io, UInt8)
    if "XZ" in formats
        b1 == 0xFD && b2 == 0x37 && b3 == 0x7A && b4 == 0x58 &&
            b5 == 0x5A && b6 == 0x00 && return true
    end

    return false
end
52+
2553
"""
    detect_rdata(io)

Test whether the stream `io` looks like R data input.

The stream is rewound and its first byte is read:

- If it is `'R'`, the remaining bytes must complete the `RD?n` magic sequence:
  `'D'`, one of `'A'`, `'B'`, `'X'`, one of `'2'`, `'3'`, and then a line
  ending (`"\\n"` or `"\\r\\n"`). The result says whether they do.
- Otherwise the result is that of `detect_compressed` restricted to the
  `"GZIP"`, `"BZIP2"` and `"XZ"` formats.

The stream is not rewound afterwards.
"""
function detect_rdata(io)
    seekstart(io)

    # Anything not starting with 'R' can only qualify as a compressed file.
    if read(io, UInt8) != UInt8('R')
        return detect_compressed(io; formats=["GZIP", "BZIP2", "XZ"])
    end

    # Uncompressed case: check the rest of the magic one byte at a time,
    # stopping at the first mismatch.
    read(io, UInt8) == UInt8('D') || return false
    read(io, UInt8) in (UInt8('A'), UInt8('B'), UInt8('X')) || return false
    read(io, UInt8) in (UInt8('2'), UInt8('3')) || return false

    # The header line ends with either LF or CRLF.
    eol = read(io, UInt8)
    eol == UInt8('\n') && return true
    return eol == UInt8('\r') && read(io, UInt8) == UInt8('\n')
end
3465

3566
add_format(format"RData", detect_rdata, [".rda", ".RData", ".rdata"], [idRData, LOAD])
@@ -38,6 +69,9 @@ function detect_rdata_single(io)
3869
seekstart(io)
3970
res = read(io, UInt8) in (UInt8('A'), UInt8('B'), UInt8('X')) &&
4071
(c = read(io, UInt8); c == UInt8('\n') || (c == UInt8('\r') && read(io, UInt8) == UInt8('\n')))
72+
if !res
73+
res = detect_compressed(io; formats=["GZIP", "BZIP2", "XZ"])
74+
end
4175
seekstart(io)
4276
return res
4377
end

test/files/iris.rda

1.07 KB
Binary file not shown.

test/query.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,9 @@ let file_dir = joinpath(@__DIR__, "files"), file_path = Path(file_dir)
384384
# 6 for \r\n and 5 for \n
385385
@test (position(io) in (5, 6))
386386
end
387+
# A GZipped file
388+
q = query(joinpath(file_dir, "iris.rda"))
389+
@test typeof(q) <: File{format"RData"}
387390
end
388391
@testset "RDS detection" begin
389392
q = query(joinpath(file_dir, "minimal_ascii.rds"))

0 commit comments

Comments
 (0)