File tree 1 file changed +22
-6
lines changed
1 file changed +22
-6
lines changed Original file line number Diff line number Diff line change 1
1
"""PLINK 1.9 reader implementation"""
2
2
from pathlib import Path
3
- from typing import Optional , Union
3
+ from typing import Optional , Union , Mapping , Any
4
4
5
5
import dask .array as da
6
6
import dask .dataframe as dd
7
7
import numpy as np
8
8
from dask .dataframe import DataFrame
9
+ from dask .array import Array
9
10
from pysnptools .snpreader import Bed
10
11
from xarray import Dataset
11
12
@@ -92,11 +93,26 @@ def close(self):
92
93
self .bed ._close_bed () # pragma: no cover
93
94
94
95
95
- def _to_dict (df , dtype = None ):
96
- return {
97
- c : df [c ].to_dask_array (lengths = True ).astype (dtype [c ] if dtype else df [c ].dtype )
98
- for c in df
99
- }
96
def _max_str_len(arr: Array) -> Array:
    """Return the length of the longest string held in a dask array.

    Every block is cast to ``str`` and measured with ``np.char.str_len``;
    the per-block lengths are then reduced with ``max``.

    Parameters
    ----------
    arr
        Dask array whose elements can be cast to ``str``.

    Returns
    -------
    Lazy dask array holding the maximum string length; call ``.compute()``
    on it to obtain the concrete value.
    """

    def _block_lengths(block):
        # Cast explicitly so the computed chunks really carry the dtype that
        # is declared to dask below.
        return np.char.str_len(block.astype(str)).astype(np.int64)

    # int64 rather than int8: int8 cannot represent lengths above 127.
    return arr.map_blocks(_block_lengths, dtype=np.int64).max()
100
+
101
+
102
def _to_dict(df: DataFrame, dtype: Optional[Mapping[str, Any]] = None):
    """Convert each column of a dask DataFrame into a dask array.

    Parameters
    ----------
    df
        DataFrame whose columns are converted.
    dtype
        Optional mapping from column name to the dtype wanted for that
        column. When omitted (or empty) each column keeps its own dtype.
        When given, it is indexed with every column name in ``df``.

    Returns
    -------
    Dictionary mapping column name to a dask array with known chunk
    lengths. Columns whose target dtype is a string kind (``U`` or ``S``)
    are cast to a fixed-length string dtype wide enough for their longest
    value.

    Notes
    -----
    Sizing a string column calls ``.compute()`` on that column, so this
    function is not fully lazy for string columns.
    """
    arrs = {}
    for c in df:
        a = df[c].to_dask_array(lengths=True)
        # Prefer the caller-supplied dtype, otherwise keep the column's own.
        dt = dtype[c] if dtype else df[c].dtype
        kind = np.dtype(dt).kind
        if kind in ("U", "S"):
            # Compute fixed-length string dtype for array
            max_len = int(_max_str_len(a).compute())
            # A column of only empty strings yields width 0, which is not a
            # usable fixed-length dtype, so request at least one character.
            dt = f"{kind}{max(max_len, 1)}"
        arrs[c] = a.astype(dt)
    return arrs
100
116
101
117
102
118
def read_fam (path : PathType , sep : str = " " ) -> DataFrame :
You can’t perform that action at this time.
0 commit comments