package edu.vanderbilt.accre.laurelin.root_proxy;

import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.Arrays;
+ import java.util.HashMap;
import java.util.List;
+ import java.util.Map;
+ import java.util.Map.Entry;
import java.util.regex.Pattern;

+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileStatus;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.LocatedFileStatus;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.fs.RemoteIterator;
+ import org.apache.spark.deploy.SparkHadoopUtil;
+ import org.apache.spark.sql.SparkSession;
+
+ import scala.collection.JavaConverters;
+ import scala.collection.Seq;
+
public class IOFactory {
-     static final String hadoopPattern = "^[a-zA-Z]+:/.*";
+     static final String hadoopPattern = "^[a-zA-Z]+:.*";

    public static FileInterface openForRead(String path) throws IOException {
        /**
@@ -36,6 +53,157 @@ public static FileInterface openForRead(String path) throws IOException {
        return ret;
    }

+     /**
+      * Perform glob expansion on a list of paths, then recursively expand any
+      * directories in the resulting list.
+      *
+      * @param paths Paths to be expanded
+      * @return Fully expanded list of ROOT file paths
+      * @throws IOException If any globs don't resolve or paths don't exist
+      */
+     public static List<Path> resolvePathList(List<String> paths) throws IOException {
+         Configuration hadoopConf;
+         try {
+             hadoopConf = SparkSession.active().sparkContext().hadoopConfiguration();
+         } catch (IllegalStateException e) {
+             hadoopConf = new Configuration();
+         }
+
+         List<Path> globResolved = new ArrayList<Path>(paths.size());
+         // First perform any globbing
+         for (String path : paths) {
+             if (isGlob(path)) {
+                 globResolved.addAll(resolveGlob(path));
+             } else {
+                 globResolved.add(new Path(path));
+             }
+         }
+
+         /*
+          * Now, with globs turned into concrete paths, we want to walk through
+          * the list and check the type of each file:
+          *
+          * 1) If a file, add that file directly to our list of input paths
+          * 2) If a directory, recursively add every file ending in .root
+          *
+          * There is a problem, however. Each file lookup is synchronous, and if
+          * the filesystem is remote (e.g. reading xrootd across the WAN), each
+          * stat() can take upwards of 100msec, which adds up quickly if the
+          * user passes in a list of 10k files to process.
+          *
+          * As an optimization, instead of requesting the status of each path
+          * directly, request the directory listing of each path's parent
+          * directory to discover the type of each entry. This way, the number
+          * of FS calls scales with the number of parent directories and not
+          * the number of paths.
+          *
+          * It should also be noted that the hadoop-xrootd connector unrolls
+          * the multi-arg form of listStatus into individual calls, so that
+          * form doesn't help here.
+          */
+
+         // Loop over all the paths and keep the unique parents of them all
+         // TODO: Is repeatedly instantiating FileSystem objects slow over WAN?
+         Map<Path, List<FileStatus>> parentDirectories = new HashMap<Path, List<FileStatus>>();
+         Map<Path, Path> childToParentMap = new HashMap<Path, Path>();
+         Map<Path, Path> qualifiedChildToParentMap = new HashMap<Path, Path>();
+         for (Path path : globResolved) {
+             Path parent = path.getParent();
+             parentDirectories.put(parent, null);
+             childToParentMap.put(path, parent);
+             FileSystem fs = parent.getFileSystem(hadoopConf);
+             Path qualifiedChild = path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
+             qualifiedChildToParentMap.put(qualifiedChild, parent);
+         }
+
+         // Retrieve the listing for all the parent dirs
+         Map<Path, List<FileStatus>> parentToStatusMap = new HashMap<Path, List<FileStatus>>();
+         Map<Path, FileStatus> qualifiedListingToStatusMap = new HashMap<Path, FileStatus>();
+         for (Path parent : parentDirectories.keySet()) {
+             FileSystem fs = parent.getFileSystem(hadoopConf);
+             FileStatus[] listing = fs.listStatus(parent);
+             parentToStatusMap.put(parent, Arrays.asList(listing));
+             for (FileStatus s : listing) {
+                 assert qualifiedListingToStatusMap.containsKey(s.getPath()) == false;
+                 qualifiedListingToStatusMap.put(s.getPath(), s);
+             }
+         }
+
+         assert qualifiedListingToStatusMap.size() >= globResolved.size() : "qualifiedlisting < globresolved";
+
+         /*
+          * At this point, we have a list of post-globbing URIs and lists of
+          * FileStatus for every parent of those URIs. Use this to make a map of
+          * globbed path -> FileStatus
+          */
+         Map<Path, FileStatus> clientRequestedPathToStatusMap = new HashMap<Path, FileStatus>();
+         for (Entry<Path, Path> e : qualifiedChildToParentMap.entrySet()) {
+             if (!qualifiedListingToStatusMap.containsKey(e.getKey())) {
+                 throw new IOException("Path not found: " + e.getKey());
+             }
+             FileStatus status = qualifiedListingToStatusMap.get(e.getKey());
+             clientRequestedPathToStatusMap.put(e.getKey(), status);
+         }
+
+         // Walk the statuses to sort between files and directories
+         List<Path> ret = new ArrayList<Path>(globResolved.size());
+         for (FileStatus status : clientRequestedPathToStatusMap.values()) {
+             Path path = status.getPath();
+             if (status.isDirectory()) {
+                 // We were given a directory, add everything recursively
+                 FileSystem fs = status.getPath().getFileSystem(hadoopConf);
+                 RemoteIterator<LocatedFileStatus> fileList = fs.listFiles(status.getPath(), true);
+                 while (fileList.hasNext()) {
+                     LocatedFileStatus file = fileList.next();
+                     if (file.isFile() && (file.getPath().getName().endsWith(".root"))) {
+                         ret.add(file.getPath());
+                     }
+                 }
+             } else if (status.isFile()) {
+                 ret.add(status.getPath());
+             } else {
+                 throw new IOException("File '" + path + "' is an unknown type");
+             }
+         }
+
+         return ret;
+     }
+
+     /**
+      * Perform glob expansion on a path
+      * @param path Glob to expand
+      * @return List of paths that match the given glob
+      * @throws IOException If nothing matches the given glob
+      */
+     private static List<Path> resolveGlob(String path) throws IOException {
+         Configuration hadoopConf;
+         try {
+             hadoopConf = SparkSession.active().sparkContext().hadoopConfiguration();
+         } catch (IllegalStateException e) {
+             hadoopConf = new Configuration();
+         }
+
+         Path hdfsPath = new Path(path);
+         FileSystem fs = hdfsPath.getFileSystem(hadoopConf);
+         Path qualified = hdfsPath.makeQualified(fs.getUri(), fs.getWorkingDirectory());
+         Seq<Path> globPath = SparkHadoopUtil.get().globPathIfNecessary(fs, qualified);
+         if (globPath.isEmpty()) {
+             throw new IOException("Path does not exist: " + qualified);
+         }
+         // TODO: Is this stable across Scala versions?
+         List<Path> ret = JavaConverters.seqAsJavaListConverter(globPath).asJava();
+         return ret;
+     }
+
+     /**
+      * See if the given path has any glob metacharacters
+      * @param path Input path
+      * @return True if the path looks like a glob. False otherwise.
+      */
+     private static boolean isGlob(String path) {
+         return path.matches(".*[{}\\[\\]*?].*");
+     }
+
    public static List<String> expandPathToList(String path) throws IOException {
        if (Pattern.matches(hadoopPattern, path)) {
            return HadoopFile.expandPathToList(path);
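
As a rough usage sketch (not part of the patch; the caller, input URIs, and downstream handling are hypothetical), the new resolvePathList() could be exercised like this, relying on either an active SparkSession or the default-Configuration fallback:

    // Hypothetical caller: expand user-supplied globs and directories into concrete ROOT files
    List<String> requested = Arrays.asList(
            "hdfs:///store/data/*.root",   // glob, expanded by resolveGlob()
            "file:///data/ntuples");       // directory, walked recursively for *.root files
    List<Path> files = IOFactory.resolvePathList(requested);
    for (Path p : files) {
        FileInterface file = IOFactory.openForRead(p.toString());
        // ... hand the FileInterface to the reader ...
    }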
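
If the TODO about JavaConverters stability across Scala versions ever becomes an issue, one possible alternative (a sketch only, not what this patch implements; the helper name is made up) is to stay on the Hadoop API and use FileSystem.globStatus(), which returns a plain FileStatus[]. Note the semantics differ slightly from globPathIfNecessary(): globStatus() always interprets glob syntax in the path.

    // Sketch: glob expansion without the scala.collection.Seq round-trip
    private static List<Path> resolveGlobHadoopOnly(String path, Configuration hadoopConf) throws IOException {
        Path hdfsPath = new Path(path);
        FileSystem fs = hdfsPath.getFileSystem(hadoopConf);
        Path qualified = hdfsPath.makeQualified(fs.getUri(), fs.getWorkingDirectory());
        FileStatus[] matches = fs.globStatus(qualified);  // null or empty when nothing matches
        if ((matches == null) || (matches.length == 0)) {
            throw new IOException("Path does not exist: " + qualified);
        }
        List<Path> ret = new ArrayList<Path>(matches.length);
        for (FileStatus match : matches) {
            ret.add(match.getPath());
        }
        return ret;
    }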