Skip to content

Commit 0bb862d

Browse files
committed
Precompute string lengths sgkit-dev#12
1 parent 5ef6aac commit 0bb862d

File tree

4 files changed

+110
-83
lines changed

4 files changed

+110
-83
lines changed

sgkit_plink/pysnptools.py

+22-6
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
"""PLINK 1.9 reader implementation"""
22
from pathlib import Path
3-
from typing import Optional, Union
3+
from typing import Optional, Union, Mapping, Any
44

55
import dask.array as da
66
import dask.dataframe as dd
77
import numpy as np
88
from dask.dataframe import DataFrame
9+
from dask.array import Array
910
from pysnptools.snpreader import Bed
1011
from xarray import Dataset
1112

@@ -92,11 +93,26 @@ def close(self):
9293
self.bed._close_bed() # pragma: no cover
9394

9495

95-
def _to_dict(df, dtype=None):
96-
return {
97-
c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype)
98-
for c in df
99-
}
96+
def _max_str_len(arr: Array) -> Array:
    """Return the length of the longest string in ``arr`` as a lazy scalar.

    Every element is converted with ``astype(str)`` before it is measured, so
    non-string values are measured by the length of their string form.

    :param arr: dask array of values to measure.
    :return: dask array holding the maximum length over all elements; nothing
        is evaluated until the caller computes it.
    """

    def _block_str_len(block: np.ndarray) -> np.ndarray:
        # Cast so each block really has the dtype declared to dask below.
        return np.char.str_len(block.astype(str)).astype(np.int64)

    # Declared as int64 rather than int8: int8 cannot represent lengths above
    # 127 and does not match what np.char.str_len actually returns.
    return arr.map_blocks(_block_str_len, dtype=np.int64).max()
100+
101+
102+
def _to_dict(df: DataFrame, dtype: Optional[Mapping[str, Any]] = None):
    """Convert each column of ``df`` into a dask array keyed by column name.

    String columns (numpy kind ``U`` or ``S``) are cast to a fixed-width
    dtype whose width is the longest value found in that column.

    :param df: dask DataFrame whose columns are converted.
    :param dtype: optional mapping from column name to target dtype. When it
        is given and non-empty it must have an entry for every column;
        otherwise each column's own dtype is used.
    :return: dict mapping column name to the converted dask array.
    """
    arrs = {}
    for c in df:
        a = df[c].to_dask_array(lengths=True)
        dt = dtype[c] if dtype else df[c].dtype
        kind = np.dtype(dt).kind
        if kind in ("U", "S"):
            # Compute fixed-length string dtype for array; this evaluates the
            # column eagerly, once per string column.
            max_len = int(_max_str_len(a).compute())
            # A width of 0 is numpy's unsized flexible dtype, so a column of
            # empty strings would not get a fixed width; use at least 1.
            dt = f"{kind}{max(max_len, 1)}"
        arrs[c] = a.astype(dt)
    return arrs
100116

101117

102118
def read_fam(path: PathType, sep: str = " ") -> DataFrame:
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,100 @@
1-
1 1:1:A:C 0.0 1 C A
2-
1 1:2:A:C 0.0 2 C A
3-
1 1:3:A:C 0.0 3 C A
4-
1 1:4:A:C 0.0 4 C A
5-
1 1:5:A:C 0.0 5 C A
6-
1 1:6:A:C 0.0 6 C A
7-
1 1:7:A:C 0.0 7 C A
8-
1 1:8:A:C 0.0 8 C A
9-
1 1:9:A:C 0.0 9 C A
1+
1 1:1:G:CGCGCG 0.0 1 CGCGCG G
2+
1 1:2:ACT:G 0.0 2 G ACT
3+
1 1:3:ACT:G 0.0 3 G ACT
4+
1 1:4:G:CGCGCG 0.0 4 CGCGCG G
5+
1 1:5:G:CGCGCG 0.0 5 CGCGCG G
6+
1 1:6:ACT:G 0.0 6 G ACT
7+
1 1:7:G:CGCGCG 0.0 7 CGCGCG G
8+
1 1:8:T:GTGG 0.0 8 GTGG T
9+
1 1:9:T:GTGG 0.0 9 GTGG T
1010
1 1:10:A:C 0.0 10 C A
11-
1 1:11:A:C 0.0 11 C A
12-
1 1:12:A:C 0.0 12 C A
13-
1 1:13:A:C 0.0 13 C A
14-
1 1:14:A:C 0.0 14 C A
15-
1 1:15:A:C 0.0 15 C A
11+
1 1:11:ACT:G 0.0 11 G ACT
12+
1 1:12:G:CGCGCG 0.0 12 CGCGCG G
13+
1 1:13:G:CGCGCG 0.0 13 CGCGCG G
14+
1 1:14:T:GTGG 0.0 14 GTGG T
15+
1 1:15:ACT:G 0.0 15 G ACT
1616
1 1:16:A:C 0.0 16 C A
17-
1 1:17:A:C 0.0 17 C A
18-
1 1:18:A:C 0.0 18 C A
17+
1 1:17:ACT:G 0.0 17 G ACT
18+
1 1:18:T:GTGG 0.0 18 GTGG T
1919
1 1:19:A:C 0.0 19 C A
2020
1 1:20:A:C 0.0 20 C A
21-
1 1:21:A:C 0.0 21 C A
22-
1 1:22:A:C 0.0 22 C A
23-
1 1:23:A:C 0.0 23 C A
21+
1 1:21:T:GTGG 0.0 21 GTGG T
22+
1 1:22:G:CGCGCG 0.0 22 CGCGCG G
23+
1 1:23:T:GTGG 0.0 23 GTGG T
2424
1 1:24:A:C 0.0 24 C A
2525
1 1:25:A:C 0.0 25 C A
26-
1 1:26:A:C 0.0 26 C A
27-
1 1:27:A:C 0.0 27 C A
28-
1 1:28:A:C 0.0 28 C A
29-
1 1:29:A:C 0.0 29 C A
26+
1 1:26:ACT:G 0.0 26 G ACT
27+
1 1:27:G:CGCGCG 0.0 27 CGCGCG G
28+
1 1:28:ACT:G 0.0 28 G ACT
29+
1 1:29:T:GTGG 0.0 29 GTGG T
3030
1 1:30:A:C 0.0 30 C A
31-
1 1:31:A:C 0.0 31 C A
32-
1 1:32:A:C 0.0 32 C A
33-
1 1:33:A:C 0.0 33 C A
34-
1 1:34:A:C 0.0 34 C A
31+
1 1:31:T:GTGG 0.0 31 GTGG T
32+
1 1:32:G:CGCGCG 0.0 32 CGCGCG G
33+
1 1:33:ACT:G 0.0 33 G ACT
34+
1 1:34:G:CGCGCG 0.0 34 CGCGCG G
3535
1 1:35:A:C 0.0 35 C A
36-
1 1:36:A:C 0.0 36 C A
37-
1 1:37:A:C 0.0 37 C A
36+
1 1:36:G:CGCGCG 0.0 36 CGCGCG G
37+
1 1:37:T:GTGG 0.0 37 GTGG T
3838
1 1:38:A:C 0.0 38 C A
3939
1 1:39:A:C 0.0 39 C A
40-
1 1:40:A:C 0.0 40 C A
40+
1 1:40:T:GTGG 0.0 40 GTGG T
4141
1 1:41:A:C 0.0 41 C A
42-
1 1:42:A:C 0.0 42 C A
43-
1 1:43:A:C 0.0 43 C A
44-
1 1:44:A:C 0.0 44 C A
45-
1 1:45:A:C 0.0 45 C A
46-
1 1:46:A:C 0.0 46 C A
47-
1 1:47:A:C 0.0 47 C A
42+
1 1:42:G:CGCGCG 0.0 42 CGCGCG G
43+
1 1:43:T:GTGG 0.0 43 GTGG T
44+
1 1:44:ACT:G 0.0 44 G ACT
45+
1 1:45:G:CGCGCG 0.0 45 CGCGCG G
46+
1 1:46:ACT:G 0.0 46 G ACT
47+
1 1:47:G:CGCGCG 0.0 47 CGCGCG G
4848
1 1:48:A:C 0.0 48 C A
4949
1 1:49:A:C 0.0 49 C A
5050
1 1:50:A:C 0.0 50 C A
51-
1 1:51:A:C 0.0 51 C A
51+
1 1:51:G:CGCGCG 0.0 51 CGCGCG G
5252
1 1:52:A:C 0.0 52 C A
53-
1 1:53:A:C 0.0 53 C A
53+
1 1:53:ACT:G 0.0 53 G ACT
5454
1 1:54:A:C 0.0 54 C A
55-
1 1:55:A:C 0.0 55 C A
56-
1 1:56:A:C 0.0 56 C A
57-
1 1:57:A:C 0.0 57 C A
55+
1 1:55:G:CGCGCG 0.0 55 CGCGCG G
56+
1 1:56:T:GTGG 0.0 56 GTGG T
57+
1 1:57:G:CGCGCG 0.0 57 CGCGCG G
5858
1 1:58:A:C 0.0 58 C A
59-
1 1:59:A:C 0.0 59 C A
60-
1 1:60:A:C 0.0 60 C A
61-
1 1:61:A:C 0.0 61 C A
59+
1 1:59:T:GTGG 0.0 59 GTGG T
60+
1 1:60:G:CGCGCG 0.0 60 CGCGCG G
61+
1 1:61:ACT:G 0.0 61 G ACT
6262
1 1:62:A:C 0.0 62 C A
63-
1 1:63:A:C 0.0 63 C A
64-
1 1:64:A:C 0.0 64 C A
65-
1 1:65:A:C 0.0 65 C A
66-
1 1:66:A:C 0.0 66 C A
67-
1 1:67:A:C 0.0 67 C A
68-
1 1:68:A:C 0.0 68 C A
69-
1 1:69:A:C 0.0 69 C A
70-
1 1:70:A:C 0.0 70 C A
71-
1 1:71:A:C 0.0 71 C A
72-
1 1:72:A:C 0.0 72 C A
63+
1 1:63:G:CGCGCG 0.0 63 CGCGCG G
64+
1 1:64:T:GTGG 0.0 64 GTGG T
65+
1 1:65:T:GTGG 0.0 65 GTGG T
66+
1 1:66:ACT:G 0.0 66 G ACT
67+
1 1:67:T:GTGG 0.0 67 GTGG T
68+
1 1:68:ACT:G 0.0 68 G ACT
69+
1 1:69:G:CGCGCG 0.0 69 CGCGCG G
70+
1 1:70:G:CGCGCG 0.0 70 CGCGCG G
71+
1 1:71:ACT:G 0.0 71 G ACT
72+
1 1:72:G:CGCGCG 0.0 72 CGCGCG G
7373
1 1:73:A:C 0.0 73 C A
7474
1 1:74:A:C 0.0 74 C A
75-
1 1:75:A:C 0.0 75 C A
75+
1 1:75:T:GTGG 0.0 75 GTGG T
7676
1 1:76:A:C 0.0 76 C A
77-
1 1:77:A:C 0.0 77 C A
78-
1 1:78:A:C 0.0 78 C A
77+
1 1:77:ACT:G 0.0 77 G ACT
78+
1 1:78:ACT:G 0.0 78 G ACT
7979
1 1:79:A:C 0.0 79 C A
8080
1 1:80:A:C 0.0 80 C A
8181
1 1:81:A:C 0.0 81 C A
82-
1 1:82:A:C 0.0 82 C A
82+
1 1:82:T:GTGG 0.0 82 GTGG T
8383
1 1:83:A:C 0.0 83 C A
84-
1 1:84:A:C 0.0 84 C A
84+
1 1:84:ACT:G 0.0 84 G ACT
8585
1 1:85:A:C 0.0 85 C A
86-
1 1:86:A:C 0.0 86 C A
87-
1 1:87:A:C 0.0 87 C A
86+
1 1:86:G:CGCGCG 0.0 86 CGCGCG G
87+
1 1:87:ACT:G 0.0 87 G ACT
8888
1 1:88:A:C 0.0 88 C A
8989
1 1:89:A:C 0.0 89 C A
90-
1 1:90:A:C 0.0 90 C A
91-
1 1:91:A:C 0.0 91 C A
92-
1 1:92:A:C 0.0 92 C A
90+
1 1:90:T:GTGG 0.0 90 GTGG T
91+
1 1:91:T:GTGG 0.0 91 GTGG T
92+
1 1:92:T:GTGG 0.0 92 GTGG T
9393
1 1:93:A:C 0.0 93 C A
9494
1 1:94:A:C 0.0 94 C A
9595
1 1:95:A:C 0.0 95 C A
9696
1 1:96:A:C 0.0 96 C A
97-
1 1:97:A:C 0.0 97 C A
98-
1 1:98:A:C 0.0 98 C A
99-
1 1:99:A:C 0.0 99 C A
97+
1 1:97:T:GTGG 0.0 97 GTGG T
98+
1 1:98:ACT:G 0.0 98 G ACT
99+
1 1:99:T:GTGG 0.0 99 GTGG T
100100
1 1:100:A:C 0.0 100 C A
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
0 0 0 0 0 NA
2-
0 1 0 0 0 NA
3-
0 2 0 0 0 NA
4-
0 3 0 0 0 NA
5-
0 4 0 0 0 NA
6-
0 5 0 0 0 NA
7-
0 6 0 0 0 NA
8-
0 7 0 0 0 NA
9-
0 8 0 0 0 NA
10-
0 9 0 0 0 NA
1+
0 000 0 0 0 NA
2+
0 001 0 0 0 NA
3+
0 002 0 0 0 NA
4+
0 003 0 0 0 NA
5+
0 004 0 0 0 NA
6+
0 005 0 0 0 NA
7+
0 006 0 0 0 NA
8+
0 007 0 0 0 NA
9+
0 008 0 0 0 NA
10+
0 009 0 0 0 NA

sgkit_plink/tests/test_pysnptools.py

+11
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,17 @@ def test_raise_on_both_path_types():
3232
read_plink(path="x", bed_path="x")
3333

3434

35+
def test_fixlen_str_variable(ds1):
    """Check the dtype of each string variable in ``ds1``."""
    expected_dtypes = {
        "sample_id": "<U3",
        "variant_id": "<U13",
        "variant_allele": "|S6",
        "sample_family_id": "<U1",
        # TODO: Remove 'None' strings https://github.com/pystatgen/sgkit-plink/issues/16
        # which should make these <U1
        "sample_maternal_id": "<U4",
        "sample_paternal_id": "<U4",
    }
    for variable, code in expected_dtypes.items():
        assert ds1[variable].dtype == np.dtype(code)
44+
45+
3546
def test_read_slicing(ds1):
3647
gt = ds1["call_genotype"]
3748
shape = gt.shape

0 commit comments

Comments
 (0)