
Commit 20095c9

Add new path expansion implementation
Users are allowed to pass either directories or globs as paths, and the desired behavior is to perform glob expansion and then recursively add the ROOT files found in any directories. Add a new, better implementation of this path-expansion function. This version accepts a list of paths, which lets us elide many per-file, synchronous round trips when files share a parent directory. This is important when the files are remote, so that we don't incur a per-file RTT.
1 parent c5ab2e9 commit 20095c9
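
For illustration, a minimal sketch of how the new entry point might be called. The resolvePathList method and its behavior are shown in the IOFactory.java diff below; the wrapper class, main method, and input strings here are illustrative only and not part of the commit.

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.fs.Path;

import edu.vanderbilt.accre.laurelin.root_proxy.IOFactory;

public class ResolveExample {
    public static void main(String[] args) throws IOException {
        // A mix of a glob, a directory, and a concrete file: globs are expanded
        // first, then directories are walked recursively for *.root files, and
        // plain files are passed through after an existence check.
        List<Path> inputs = IOFactory.resolvePathList(Arrays.asList(
                "testdata/globtest/{1,2}",            // glob, expands to two directories
                "testdata/globtest/3",                // directory, recursed for .root files
                "testdata/globtest/1/2/1_2_1.root")); // explicit file
        for (Path p : inputs) {
            System.out.println(p);
        }
    }
}

Because these inputs share only a few parent directories, the type check issues one listStatus call per unique parent rather than one stat per path; directories are then recursed with listFiles.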

File tree

31 files changed: +322 -1 lines changed


src/main/java/edu/vanderbilt/accre/laurelin/root_proxy/IOFactory.java

Lines changed: 169 additions & 1 deletion
@@ -5,11 +5,28 @@
 package edu.vanderbilt.accre.laurelin.root_proxy;

 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
 import java.util.regex.Pattern;

+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.spark.deploy.SparkHadoopUtil;
+import org.apache.spark.sql.SparkSession;
+
+import scala.collection.JavaConverters;
+import scala.collection.Seq;
+
 public class IOFactory {
-    static final String hadoopPattern = "^[a-zA-Z]+:/.*";
+    static final String hadoopPattern = "^[a-zA-Z]+:.*";

     public static FileInterface openForRead(String path) throws IOException {
         /**
@@ -36,6 +53,157 @@ public static FileInterface openForRead(String path) throws IOException {
         return ret;
     }

+    /**
+     * Perform glob-expansion on a list of paths, then recursively expand any
+     * directories listed in the list.
+     *
+     * @param paths Paths to be expanded
+     * @return Fully expanded list of ROOT file paths
+     * @throws IOException If any globs don't resolve or paths don't exist
+     */
+    public static List<Path> resolvePathList(List<String> paths) throws IOException {
+        Configuration hadoopConf;
+        try {
+            hadoopConf = SparkSession.active().sparkContext().hadoopConfiguration();
+        } catch (IllegalStateException e) {
+            hadoopConf = new Configuration();
+        }
+
+        List<Path> globResolved = new ArrayList<Path>(paths.size());
+        // First perform any globbing
+        for (String path: paths) {
+            if (isGlob(path)) {
+                globResolved.addAll(resolveGlob(path));
+            } else {
+                globResolved.add(new Path(path));
+            }
+        }
+
+        /*
+         * Now, with globs turned into concrete paths, we want to walk through
+         * the list and check the type of each file:
+         *
+         * 1) If a file, add that file directly to our list of input paths
+         * 2) If a directory, recursively add every file ending in .root
+         *
+         * There is a problem, however. Each file lookup is synchronous, and if
+         * the filesystem is remote (e.g. reading xrootd across the WAN), each
+         * stat() can take upwards of 100msec, which can take forever if the
+         * user passes in a list of 10k files they'd like to process.
+         *
+         * As an optimization, instead of requesting the status of each path
+         * directly, request the directory listing of each path's parent
+         * directory to discover the types of each entry. This way, the number
+         * of FS calls scales with the number of parent directories and not the
+         * number of paths.
+         *
+         * It should also be noted that the hadoop-xrootd connector unrolls
+         * the multi-arg form of listStatus to individual calls, so that doesn't
+         * help.
+         */

+        // Loop over all the paths and keep the unique parents of them all
+        // TODO: Is repeatedly instantiating FileSystem objects slow over WAN?
+        Map<Path, List<FileStatus>> parentDirectories = new HashMap<Path, List<FileStatus>>();
+        Map<Path, Path> childToParentMap = new HashMap<Path, Path>();
+        Map<Path, Path> qualifiedChildToParentMap = new HashMap<Path, Path>();
+        for (Path path: globResolved) {
+            Path parent = path.getParent();
+            parentDirectories.put(parent, null);
+            childToParentMap.put(path, parent);
+            FileSystem fs = parent.getFileSystem(hadoopConf);
+            Path qualifiedChild = path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
+            qualifiedChildToParentMap.put(qualifiedChild, parent);
+        }
+
+        // Retrieve the listing for all the parent dirs
+        Map<Path, List<FileStatus>> parentToStatusMap = new HashMap<Path, List<FileStatus>>();
+        Map<Path, FileStatus> qualifiedListingToStatusMap = new HashMap<Path, FileStatus>();
+        for (Path parent: parentDirectories.keySet()) {
+            FileSystem fs = parent.getFileSystem(hadoopConf);
+            FileStatus[] listing = fs.listStatus(parent);
+            parentToStatusMap.put(parent, Arrays.asList(listing));
+            for (FileStatus s: listing) {
+                assert qualifiedListingToStatusMap.containsKey(s.getPath()) == false;
+                qualifiedListingToStatusMap.put(s.getPath(), s);
+            }
+        }
+
+        assert qualifiedListingToStatusMap.size() >= globResolved.size(): "qualifiedlisting < globresolved";
+
+        /*
+         * At this point, we have a list of post-globbing URIs and lists of
+         * FileStatus for every parent of those URIs. Use this to make a map of
+         * Globbed path -> FileStatus
+         */
+        Map<Path, FileStatus> clientRequestedPathToStatusMap = new HashMap<Path, FileStatus>();
+        for (Entry<Path, Path> e: qualifiedChildToParentMap.entrySet()) {
+            if (!qualifiedListingToStatusMap.containsKey(e.getKey())) {
+                throw new IOException("Path not found: " + e.getKey());
+            }
+            FileStatus status = qualifiedListingToStatusMap.get(e.getKey());
+            clientRequestedPathToStatusMap.put(e.getKey(), status);
+        }
+
+        // Walk the statuses to sort between files and directories
+        List<Path> ret = new ArrayList<Path>(globResolved.size());
+        for (FileStatus status: clientRequestedPathToStatusMap.values()) {
+            Path path = status.getPath();
+            if (status.isDirectory()) {
+                // We were given a directory, add everything recursively
+                FileSystem fs = status.getPath().getFileSystem(hadoopConf);
+                RemoteIterator<LocatedFileStatus> fileList = fs.listFiles(status.getPath(), true);
+                while (fileList.hasNext()) {
+                    LocatedFileStatus file = fileList.next();
+                    if (file.isFile() && (file.getPath().getName().endsWith(".root"))) {
+                        ret.add(file.getPath());
+                    }
+                }
+            } else if (status.isFile()) {
+                ret.add(status.getPath());
+            } else {
+                throw new IOException("File '" + path + "' is an unknown type");
+            }
+        }
+
+        return ret;
+    }
+
+    /**
+     * Perform glob expansion on a path
+     * @param path Glob to expand
+     * @return List of paths that match the given glob
+     * @throws IOException Nothing matches the given glob
+     */
+    private static List<Path> resolveGlob(String path) throws IOException {
+        Configuration hadoopConf;
+        try {
+            hadoopConf = SparkSession.active().sparkContext().hadoopConfiguration();
+        } catch (IllegalStateException e) {
+            hadoopConf = new Configuration();
+        }
+
+        Path hdfsPath = new Path(path);
+        FileSystem fs = hdfsPath.getFileSystem(hadoopConf);
+        Path qualified = hdfsPath.makeQualified(fs.getUri(), fs.getWorkingDirectory());
+        Seq<Path> globPath = SparkHadoopUtil.get().globPathIfNecessary(fs, qualified);
+        if (globPath.isEmpty()) {
+            throw new IOException("Path does not exist: " + qualified);
+        }
+        // TODO: Is this stable across Scala versions?
+        List<Path> ret = JavaConverters.seqAsJavaListConverter(globPath).asJava();
+        return ret;
+    }
+
+    /**
+     * See if the given path has any glob metacharacters
+     * @param path Input path
+     * @return True if the path looks like a glob. False otherwise.
+     */
+    private static boolean isGlob(String path) {
+        return path.matches(".*[{}\\[\\]*?].*");
+    }
+
     public static List<String> expandPathToList(String path) throws IOException {
         if (Pattern.matches(hadoopPattern, path)) {
             return HadoopFile.expandPathToList(path);

src/test/java/edu/vanderbilt/accre/root_proxy/IOTest.java

Lines changed: 126 additions & 0 deletions
@@ -12,8 +12,13 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;

+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -26,6 +31,10 @@
 import edu.vanderbilt.accre.laurelin.root_proxy.ROOTFile;

 public class IOTest {
+
+    // Needed to instantiate hadoop filesystems
+    Configuration hadoopConf = new Configuration();
+
     /*
      * Basic meta tests
      */
@@ -171,4 +180,121 @@ public void searchDirectory_hadoop() throws IOException {
         assertEquals(3, files.size());
     }

+    // Save some keystrokes
+    private List<org.apache.hadoop.fs.Path> resolveHelper(String ...args) throws IOException {
+        List<org.apache.hadoop.fs.Path> ret = IOFactory.resolvePathList(new ArrayList<String>(Arrays.asList(args)));
+        Collections.sort(ret);
+        return ret;
+    }
+
+    // Just check the suffixes, since different devs will have different working
+    // dirs and it's kinda a PITA to jump through the hoops to convert it all
+    private void assertPathListsSame(String message, String[] expected, List<org.apache.hadoop.fs.Path> actual) throws IOException {
+        FileSystem fs = FileSystem.getLocal(hadoopConf);
+        for (int i = 0; i < actual.size(); i++) {
+            assertTrue(message, actual.get(i).toString().endsWith(expected[i]));
+        }
+        assertEquals(message + "Number of elements should match", expected.length, actual.size());
+    }
+
+    String[] allGlobFiles = new String[] { "testdata/globtest/1/1/1_1_1.root",
+            "testdata/globtest/1/1/1_1_2.root",
+            "testdata/globtest/1/1/1_1_3.root",
+            "testdata/globtest/1/2/1_2_1.root",
+            "testdata/globtest/1/2/1_2_2.root",
+            "testdata/globtest/1/2/1_2_3.root",
+            "testdata/globtest/1/3/1_3_1.root",
+            "testdata/globtest/1/3/1_3_2.root",
+            "testdata/globtest/1/3/1_3_3.root",
+            "testdata/globtest/2/1/2_1_1.root",
+            "testdata/globtest/2/1/2_1_2.root",
+            "testdata/globtest/2/1/2_1_3.root",
+            "testdata/globtest/2/2/2_2_1.root",
+            "testdata/globtest/2/2/2_2_2.root",
+            "testdata/globtest/2/2/2_2_3.root",
+            "testdata/globtest/2/3/2_3_1.root",
+            "testdata/globtest/2/3/2_3_2.root",
+            "testdata/globtest/2/3/2_3_3.root",
+            "testdata/globtest/3/1/3_1_1.root",
+            "testdata/globtest/3/1/3_1_2.root",
+            "testdata/globtest/3/1/3_1_3.root",
+            "testdata/globtest/3/2/3_2_1.root",
+            "testdata/globtest/3/2/3_2_2.root",
+            "testdata/globtest/3/2/3_2_3.root",
+            "testdata/globtest/3/3/3_3_1.root",
+            "testdata/globtest/3/3/3_3_2.root",
+            "testdata/globtest/3/3/3_3_3.root"};
+
+    @Test
+    public void resolvePathList_recursion_full() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest");
+        assertPathListsSame("recursion_full", allGlobFiles, paths);
+    }
+
+    @Test
+    public void resolvePathList_recursion_full_duplicated() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest", "testdata/globtest");
+        assertPathListsSame("recursion_full_dup", allGlobFiles, paths);
+    }
+
+    @Test
+    public void resolvePathList_recursion_globbed() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest/{1,2,3}");
+        assertPathListsSame("recursion_globbed", allGlobFiles, paths);
+    }
+
+    String[] someGlobFiles = new String[] {
+            "testdata/globtest/1/2/1_2_1.root",
+            "testdata/globtest/1/2/1_2_2.root",
+            "testdata/globtest/1/2/1_2_3.root",
+            "testdata/globtest/1/3/1_3_1.root",
+            "testdata/globtest/1/3/1_3_2.root",
+            "testdata/globtest/1/3/1_3_3.root",
+    };
+
+    @Test
+    public void resolvePathList_recursion_globbed2() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest/{1{/2,/3}}");
+        assertPathListsSame("recursion_globbed2", someGlobFiles, paths);
+    }
+
+    @Test
+    public void resolvePathList_recursion_notglobbed() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest/1/2", "testdata/globtest/1/3");
+        assertPathListsSame("recursion_notglobbed", someGlobFiles, paths);
+    }
+
+    @Test
+    public void resolvePathList_explicitlist() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper(someGlobFiles);
+        assertPathListsSame("explicit_list", someGlobFiles, paths);
+    }
+
+    @Test
+    public void resolvePathList_explicit_one() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest/1/2/1_2_1.root");
+        assertPathListsSame("explicit_one", new String[] { "testdata/globtest/1/2/1_2_1.root" }, paths);
+    }
+
+    @Test(expected = IOException.class)
+    public void resolvePathList_badglob() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest/nonexistent/*");
+    }
+
+    @Test(expected = IOException.class)
+    public void resolvePathList_badglob_withothers() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest/nonexistent/*", "testdata/globtest/1/3");
+    }
+
+    @Test(expected = IOException.class)
+    public void resolvePathList_badpath() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest/nonexistent/");
+    }
+
+    @Test(expected = IOException.class)
+    public void resolvePathList_badpath_withothers() throws IOException {
+        List<org.apache.hadoop.fs.Path> paths = resolveHelper("testdata/globtest/nonexistent/", "testdata/globtest/1/3");
+    }
+
+
 }

New test fixture files (whitespace-only changes):

testdata/globtest/1/1/1_1_1.root
testdata/globtest/1/1/1_1_2.root
testdata/globtest/1/1/1_1_3.root
testdata/globtest/1/2/1_2_1.root
testdata/globtest/1/2/1_2_2.root
testdata/globtest/1/2/1_2_3.root
testdata/globtest/1/3/1_3_1.root
testdata/globtest/1/3/1_3_2.root
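
The fixture listing above is truncated; the tests' allGlobFiles array enumerates 27 empty .root files laid out three levels deep under testdata/globtest. A minimal sketch of recreating an equivalent layout locally, assuming the 3x3x3 naming scheme the tests use (the class name and the programmatic approach are assumptions, not part of the commit):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class MakeGlobTestFixture {
    public static void main(String[] args) throws IOException {
        // Create testdata/globtest/<i>/<j>/<i>_<j>_<k>.root as empty files,
        // matching the paths listed in IOTest.allGlobFiles.
        for (int i = 1; i <= 3; i++) {
            for (int j = 1; j <= 3; j++) {
                Path dir = Paths.get("testdata", "globtest", String.valueOf(i), String.valueOf(j));
                Files.createDirectories(dir);
                for (int k = 1; k <= 3; k++) {
                    Path file = dir.resolve(i + "_" + j + "_" + k + ".root");
                    if (!Files.exists(file)) {
                        Files.createFile(file);
                    }
                }
            }
        }
    }
}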
