faster file list ingestion for flat lists #77

Open · wants to merge 4 commits into master
5 changes: 1 addition & 4 deletions src/main/java/edu/vanderbilt/accre/laurelin/Root.java
@@ -253,10 +253,7 @@ public TTreeDataSourceV2Reader(DataSourceOptions options, CacheFactory basketCac
         logger.trace("construct ttreedatasourcev2reader");
         this.sparkContext = sparkContext;
         try {
-            this.paths = new LinkedList<String>();
-            for (String path: options.paths()) {
-                this.paths.addAll(IOFactory.expandPathToList(path));
-            }
+            this.paths = (LinkedList<String>)IOFactory.expandPathsToList(options.paths());
             // FIXME - More than one file, please
             currFile = TFile.getFromFile(fileCache.getROOTFile(this.paths.get(0)));
             treeName = options.get("tree").orElse("Events");
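
The net effect in the reader: all input paths are expanded in one call instead of once per path. A minimal caller-side sketch of the old and new entry points, where paths is any String[] of inputs:

    // Before: one expansion (and, for Hadoop paths, a fresh Configuration
    // and FileSystem handle) per input path
    List<String> expandedOld = new LinkedList<String>();
    for (String p : paths) {
        expandedOld.addAll(IOFactory.expandPathToList(p));
    }

    // After: the whole array goes through a single batched call
    List<String> expandedNew = IOFactory.expandPathsToList(paths);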
HadoopFile.java
@@ -11,11 +11,13 @@
 import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Arrays;
 import java.util.concurrent.Future;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
@@ -71,6 +73,25 @@ public long getLimit() throws IOException {
         return limit;
     }
 
+    public static List<String> expandPathsToList(String[] paths) throws IOException {
+        LinkedList<String> out = new LinkedList<String>();
+        Configuration conf = new Configuration();
+        URI uri = URI.create(paths[0]);
+        FileSystem fileSystem = FileSystem.get(uri, conf);
+        Path[] hpaths = new Path[paths.length];
+        for (int i = 0; i < paths.length; ++i) { hpaths[i] = new Path(paths[i]); }
+        FileStatus[] statoos = fileSystem.listStatus(hpaths);
+        for (int i = 0; i < statoos.length; ++i) {
+            String strpath = statoos[i].getPath().toString();
+            if ((statoos[i].isFile() || statoos[i].isSymlink()) && strpath.endsWith(".root")) {
+                out.add(strpath);
+            } else if (statoos[i].isDirectory()) {
+                out.addAll(HadoopFile.expandPathToList(strpath));
+            }
+        }
+        return out;
+    }
+
     public static List<String> expandPathToList(String path) throws IOException {
         Configuration conf = new Configuration();
         URI uri = URI.create(path);
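
This is where the flat-list speedup comes from: the per-path expandPathToList below builds a new Configuration and FileSystem handle on every call, while the batched version builds them once (from the URI of paths[0], so every entry is assumed to live on the same filesystem) and hands the whole array to a single listStatus call. A self-contained sketch of the pattern; the hdfs:// URIs are hypothetical:

    import java.io.IOException;
    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ListStatusBatchDemo {
        public static void main(String[] args) throws IOException {
            // One Configuration/FileSystem for the whole batch, not one per path
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(URI.create("hdfs://namenode/store"), conf);
            Path[] batch = {
                new Path("hdfs://namenode/store/a.root"),   // hypothetical
                new Path("hdfs://namenode/store/b.root"),   // hypothetical
            };
            // One listStatus call over the whole array (whether this batches
            // RPCs depends on the concrete FileSystem implementation)
            FileStatus[] stats = fs.listStatus(batch);
            for (FileStatus st : stats) {
                System.out.println(st.getPath() + " isFile=" + st.isFile());
            }
        }
    }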
IOFactory.java
@@ -36,6 +36,14 @@ public static FileInterface openForRead(String path) throws IOException {
         return ret;
     }
 
+    public static List<String> expandPathsToList(String[] paths) throws IOException {
+        if (Pattern.matches(hadoopPattern, paths[0])) {
+            return HadoopFile.expandPathsToList(paths);
+        } else {
+            return NIOFile.expandPathsToList(paths);
+        }
+    }
+
     public static List<String> expandPathToList(String path) throws IOException {
         if (Pattern.matches(hadoopPattern, path)) {
             return HadoopFile.expandPathToList(path);
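
One subtlety: only paths[0] is matched against hadoopPattern, so the whole batch is routed to a single backend based on its first entry; mixing local paths and Hadoop URIs in one call is not supported. Usage is the same either way (the paths below are illustrative):

    // First entry does not match hadoopPattern -> the whole batch uses NIOFile
    String[] local = { "testdata/recursive", "testdata/extra.root" };
    List<String> localFiles = IOFactory.expandPathsToList(local);

    // First entry matches -> the whole batch uses HadoopFile
    String[] remote = { "hdfs://namenode/store/run1" };
    List<String> remoteFiles = IOFactory.expandPathsToList(remote);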
NIOFile.java
@@ -15,11 +15,13 @@
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.concurrent.Future;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
+
 public class NIOFile implements FileInterface {
     private RandomAccessFile fh;
     private FileChannel channel;
@@ -68,6 +70,14 @@ public long getLimit() throws IOException {
         return fh.length();
     }
 
+    public static List<String> expandPathsToList(String[] paths) throws IOException {
+        LinkedList<String> out = new LinkedList<String>();
+        for (int i = 0; i < paths.length; ++i) {
+            out.addAll(NIOFile.expandPathToList(paths[i]));
+        }
+        return out;
+    }
+
     public static List<String> expandPathToList(String path) throws IOException {
         File tmp = FileSystems.getDefault().getPath(path).toFile();
         if (!tmp.isDirectory()) {
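
The NIO side gains nothing from batching, so it simply folds the existing per-path expansion. A Stream-based equivalent would have to launder the checked IOException, which is presumably why the plain loop was used; for comparison, a sketch (NIOFile already imports Collectors and Stream; this would additionally need java.util.Arrays and java.io.UncheckedIOException):

    // Behaviorally equivalent to the loop above, at the cost of wrapping
    // the checked IOException thrown by expandPathToList
    List<String> out = Arrays.stream(paths)
        .flatMap(p -> {
            try {
                return NIOFile.expandPathToList(p).stream();
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        })
        .collect(Collectors.toList());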
19 changes: 19 additions & 0 deletions src/test/java/edu/vanderbilt/accre/root_proxy/IOTest.java
@@ -13,6 +13,7 @@
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.List;
+import java.util.LinkedList;
 
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -171,4 +172,22 @@ public void searchDirectory_hadoop() throws IOException {
         assertEquals(3, files.size());
     }
 
+    @Test
+    public void searchDirectory_nio_paths() throws IOException {
+        String[] paths = new String[1];
+        paths[0] = "testdata/recursive";
+        List<String> files = IOFactory.expandPathsToList(paths);
+        assertEquals(3, files.size());
+    }
+
+    @Test
+    public void searchDirectory_hadoop_paths() throws IOException {
+        Path currentRelativePath = Paths.get("");
+        String s = currentRelativePath.toAbsolutePath().toString();
+        String[] paths = new String[1];
+        paths[0] = "file:///" + s + "/" + "testdata/recursive";
+        List<String> files = IOFactory.expandPathsToList(paths);
+        assertEquals(3, files.size());
+    }
+
 }
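
Both new tests cover the directory case; given the PR title, a flat-list case would look like the sketch below. The file names are hypothetical: they assume concrete .root files exist under testdata/recursive, and that NIOFile.expandPathToList returns a plain file path unchanged:

    @Test
    public void expandFlatList_nio_paths() throws IOException {
        // Hypothetical flat list of individual .root files (names illustrative)
        String[] paths = new String[] {
            "testdata/recursive/a.root",
            "testdata/recursive/b.root",
        };
        List<String> files = IOFactory.expandPathsToList(paths);
        assertEquals(2, files.size());
    }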