diff --git a/NOTICE b/NOTICE
index a9b6c56de7..289b092800 100644
--- a/NOTICE
+++ b/NOTICE
@@ -54,3 +54,41 @@ its NOTICE file:
   This product includes software developed at
   The Apache Software Foundation (http://www.apache.org/).
 
+--------------------------------------------------------------------------------
+
+This project includes code from Kite, developed at Cloudera, Inc. with
+the following copyright notice:
+
+| Copyright 2013 Cloudera Inc.
+|
+| Licensed under the Apache License, Version 2.0 (the "License");
+| you may not use this file except in compliance with the License.
+| You may obtain a copy of the License at
+|
+|   http://www.apache.org/licenses/LICENSE-2.0
+|
+| Unless required by applicable law or agreed to in writing, software
+| distributed under the License is distributed on an "AS IS" BASIS,
+| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+| See the License for the specific language governing permissions and
+| limitations under the License.
+
+--------------------------------------------------------------------------------
+
+This project includes code from Netflix, Inc. with the following copyright
+notice:
+
+| Copyright 2016 Netflix, Inc.
+|
+| Licensed under the Apache License, Version 2.0 (the "License");
+| you may not use this file except in compliance with the License.
+| You may obtain a copy of the License at
+|
+|   http://www.apache.org/licenses/LICENSE-2.0
+|
+| Unless required by applicable law or agreed to in writing, software
+| distributed under the License is distributed on an "AS IS" BASIS,
+| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+| See the License for the specific language governing permissions and
+| limitations under the License.
+
diff --git a/parquet-cli/README.md b/parquet-cli/README.md
new file mode 100644
index 0000000000..d17d7199c5
--- /dev/null
+++ b/parquet-cli/README.md
@@ -0,0 +1,107 @@
+
+## Building
+
+You can build this project using Maven:
+
+```
+mvn clean install -DskipTests
+```
+
+
+## Running
+
+The build produces a shaded Jar that can be run using the `hadoop` command:
+
+```
+hadoop jar parquet-cli-1.9.1-runtime.jar org.apache.parquet.cli.Main
+```
+
+For a shorter command-line invocation, add an alias to your shell like this:
+
+```
+alias parquet="hadoop jar /path/to/parquet-cli-1.9.1-runtime.jar org.apache.parquet.cli.Main --dollar-zero parquet"
+```
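+
+With this alias in place, commands can be run directly as `parquet`. For
+example, to print a file's metadata (`sample.parquet` below is a placeholder
+for any Parquet file path):
+
+```
+parquet meta sample.parquet
+```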
+
+### Running without Hadoop
+
+To run from the target directory instead of using the `hadoop` command, first
+copy the dependencies to a folder:
+
+```
+mvn dependency:copy-dependencies
+```
+
+Then run the command-line tool, adding `target/dependency/*` to the classpath:
+
+```
+java -cp 'target/*:target/dependency/*' org.apache.parquet.cli.Main
+```
+
+
+### Help
+
+The `parquet` tool includes help output for its commands:
+
+```
+parquet help
+```
+```
+Usage: parquet [options] [command] [command options]
+
+  Options:
+
+    -v, --verbose, --debug
+        Print extra debugging information
+
+  Commands:
+
+    help
+        Retrieves details on the functions of other commands
+    meta
+        Print a Parquet file's metadata
+    pages
+        Print page summaries for a Parquet file
+    dictionary
+        Print dictionaries for a Parquet column
+    check-stats
+        Check Parquet files for corrupt page and column stats (PARQUET-251)
+    schema
+        Print the Avro schema for a file
+    csv-schema
+        Build a schema from a CSV data sample
+    convert-csv
+        Create a file from CSV data
+    convert
+        Create a Parquet file from a data file
+    to-avro
+        Create an Avro file from a data file
+    cat
+        Print the first N records from a file
+    head
+        Print the first N records from a file
+
+  Examples:
+
+    # print information for create
+    parquet help create
+
+  See 'parquet help <command>' for more information on a specific command.
+```
+
diff --git a/parquet-cli/pom.xml b/parquet-cli/pom.xml
new file mode 100644
index 0000000000..a9cd21bbea
--- /dev/null
+++ b/parquet-cli/pom.xml
@@ -0,0 +1,153 @@
+<?xml version="1.0"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <parent>
+    <groupId>org.apache.parquet</groupId>
+    <artifactId>parquet</artifactId>
+    <relativePath>../pom.xml</relativePath>
+    <version>1.9.1-SNAPSHOT</version>
+  </parent>
+
+  <modelVersion>4.0.0</modelVersion>
+
+  <artifactId>parquet-cli</artifactId>
+  <packaging>jar</packaging>
+
+  <name>Apache Parquet Command-line</name>
+  <url>https://parquet.apache.org</url>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-avro</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro</artifactId>
+      <version>${avro.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>${slf4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>net.sf.opencsv</groupId>
+      <artifactId>opencsv</artifactId>
+      <version>${opencsv.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+      <version>${jackson2.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.beust</groupId>
+      <artifactId>jcommander</artifactId>
+      <version>${jcommander.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <version>${slf4j.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <version>${guava.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>${commons-codec.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <version>${hadoop.version}</version>
+      <scope>provided</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <shadedArtifactAttached>true</shadedArtifactAttached>
+              <shadedClassifierName>runtime</shadedClassifierName>
+              <minimizeJar>false</minimizeJar>
+              <filters>
+                <filter>
+                  <artifact>org.xerial.snappy:*</artifact>
+                  <includes>
+                    <include>**/LICENSE</include>
+                  </includes>
+                </filter>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/LICENSE.txt</exclude>
+                    <exclude>META-INF/NOTICE.txt</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <artifactSet>
+                <includes>
+                  <include>*</include>
+                </includes>
+              </artifactSet>
+              <relocations>
+                <relocation>
+                  <pattern>org.apache.avro</pattern>
+                  <shadedPattern>${shade.prefix}.org.apache.avro</shadedPattern>
+                </relocation>
+              </relocations>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java
new file mode 100644
index 0000000000..4b471649a0
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/BaseCommand.java
@@ -0,0 +1,397 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + +package org.apache.parquet.cli; + +import com.beust.jcommander.internal.Lists; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.io.CharStreams; +import com.google.common.io.Resources; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.SeekableInput; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ChecksumFileSystem; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.avro.AvroReadSupport; +import org.apache.parquet.cli.json.AvroJsonReader; +import org.apache.parquet.cli.util.Formats; +import org.apache.parquet.cli.util.GetClassLoader; +import org.apache.parquet.cli.util.Schemas; +import org.apache.parquet.cli.util.SeekableFSDataInputStream; +import org.apache.parquet.hadoop.ParquetReader; +import org.slf4j.Logger; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.nio.charset.Charset; +import java.security.AccessController; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +public abstract class BaseCommand implements Command, Configurable { + + @VisibleForTesting + static final Charset UTF8 = Charset.forName("utf8"); + + private static final String RESOURCE_URI_SCHEME = "resource"; + private static final String STDIN_AS_SOURCE = "stdin"; + + protected final Logger console; + + private Configuration conf = null; + private LocalFileSystem localFS = null; + + public BaseCommand(Logger console) { + this.console = console; + } + + /** + * @return FileSystem to use when no file system scheme is present in a path + * @throws IOException + */ + public FileSystem defaultFS() throws IOException { + if (localFS == null) { + this.localFS = FileSystem.getLocal(getConf()); + } + return localFS; + } + + /** + * Output content to the console or a file. + * + * This will not produce checksum files. + * + * @param content String content to write + * @param console A {@link Logger} for writing to the console + * @param filename The destination {@link Path} as a String + * @throws IOException + */ + public void output(String content, Logger console, String filename) + throws IOException { + if (filename == null || "-".equals(filename)) { + console.info(content); + } else { + FSDataOutputStream outgoing = create(filename); + try { + outgoing.write(content.getBytes(UTF8)); + } finally { + outgoing.close(); + } + } + } + + /** + * Creates a file and returns an open {@link FSDataOutputStream}. + * + * If the file does not have a file system scheme, this uses the default FS. + * + * This will not produce checksum files and will overwrite a file that + * already exists. + * + * @param filename The filename to create + * @return An open FSDataOutputStream + * @throws IOException + */ + public FSDataOutputStream create(String filename) throws IOException { + return create(filename, true); + } + + /** + * Creates a file and returns an open {@link FSDataOutputStream}. 
+ * + * If the file does not have a file system scheme, this uses the default FS. + * + * This will produce checksum files and will overwrite a file that already + * exists. + * + * @param filename The filename to create + * @return An open FSDataOutputStream + * @throws IOException + */ + public FSDataOutputStream createWithChecksum(String filename) + throws IOException { + return create(filename, false); + } + + private FSDataOutputStream create(String filename, boolean noChecksum) + throws IOException { + Path filePath = qualifiedPath(filename); + // even though it was qualified using the default FS, it may not be in it + FileSystem fs = filePath.getFileSystem(getConf()); + if (noChecksum && fs instanceof ChecksumFileSystem) { + fs = ((ChecksumFileSystem) fs).getRawFileSystem(); + } + return fs.create(filePath, true /* overwrite */); + } + + /** + * Returns a qualified {@link Path} for the {@code filename}. + * + * If the file does not have a file system scheme, this uses the default FS. + * + * @param filename The filename to qualify + * @return A qualified Path for the filename + * @throws IOException + */ + public Path qualifiedPath(String filename) throws IOException { + Path cwd = defaultFS().makeQualified(new Path(".")); + return new Path(filename).makeQualified(defaultFS().getUri(), cwd); + } + + /** + * Returns a {@link URI} for the {@code filename} that is a qualified Path or + * a resource URI. + * + * If the file does not have a file system scheme, this uses the default FS. + * + * @param filename The filename to qualify + * @return A qualified URI for the filename + * @throws IOException + */ + public URI qualifiedURI(String filename) throws IOException { + URI fileURI = URI.create(filename); + if (RESOURCE_URI_SCHEME.equals(fileURI.getScheme())) { + return fileURI; + } else { + return qualifiedPath(filename).toUri(); + } + } + + /** + * Opens an existing file or resource. + * + * If the file does not have a file system scheme, this uses the default FS. + * + * @param filename The filename to open. + * @return An open InputStream with the file contents + * @throws IOException + * @throws IllegalArgumentException If the file does not exist + */ + public InputStream open(String filename) throws IOException { + if (STDIN_AS_SOURCE.equals(filename)) { + return System.in; + } + + URI uri = qualifiedURI(filename); + if (RESOURCE_URI_SCHEME.equals(uri.getScheme())) { + return Resources.getResource(uri.getRawSchemeSpecificPart()).openStream(); + } else { + Path filePath = new Path(uri); + // even though it was qualified using the default FS, it may not be in it + FileSystem fs = filePath.getFileSystem(getConf()); + return fs.open(filePath); + } + } + + public SeekableInput openSeekable(String filename) throws IOException { + Path path = qualifiedPath(filename); + // even though it was qualified using the default FS, it may not be in it + FileSystem fs = path.getFileSystem(getConf()); + return new SeekableFSDataInputStream(fs, path); + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + HadoopFileSystemURLStreamHandler.setDefaultConf(conf); + } + + @Override + public Configuration getConf() { + return conf; + } + + /** + * Returns a {@link ClassLoader} for a set of jars and directories. 
+ * + * @param jars A list of jar paths + * @param paths A list of directories containing .class files + * @throws MalformedURLException + */ + protected static ClassLoader loaderFor(List jars, List paths) + throws MalformedURLException { + return AccessController.doPrivileged(new GetClassLoader(urls(jars, paths))); + } + + /** + * Returns a {@link ClassLoader} for a set of jars. + * + * @param jars A list of jar paths + * @throws MalformedURLException + */ + protected static ClassLoader loaderForJars(List jars) + throws MalformedURLException { + return AccessController.doPrivileged(new GetClassLoader(urls(jars, null))); + } + + /** + * Returns a {@link ClassLoader} for a set of directories. + * + * @param paths A list of directories containing .class files + * @throws MalformedURLException + */ + protected static ClassLoader loaderForPaths(List paths) + throws MalformedURLException { + return AccessController.doPrivileged(new GetClassLoader(urls(null, paths))); + } + + private static List urls(List jars, List dirs) + throws MalformedURLException { + // check the additional jars and lib directories in the local FS + final List urls = Lists.newArrayList(); + if (dirs != null) { + for (String lib : dirs) { + // final URLs must end in '/' for URLClassLoader + File path = lib.endsWith("/") ? new File(lib) : new File(lib + "/"); + Preconditions.checkArgument(path.exists(), + "Lib directory does not exist: " + lib); + Preconditions.checkArgument(path.isDirectory(), + "Not a directory: " + lib); + Preconditions.checkArgument(path.canRead() && path.canExecute(), + "Insufficient permissions to access lib directory: " + lib); + urls.add(path.toURI().toURL()); + } + } + if (jars != null) { + for (String jar : jars) { + File path = new File(jar); + Preconditions.checkArgument(path.exists(), + "Jar files does not exist: " + jar); + Preconditions.checkArgument(path.isFile(), + "Not a file: " + jar); + Preconditions.checkArgument(path.canRead(), + "Cannot read jar file: " + jar); + urls.add(path.toURI().toURL()); + } + } + return urls; + } + + protected Iterable openDataFile(final String source, Schema projection) + throws IOException { + Formats.Format format = Formats.detectFormat(open(source)); + switch (format) { + case PARQUET: + Configuration conf = new Configuration(getConf()); + // TODO: add these to the reader builder + AvroReadSupport.setRequestedProjection(conf, projection); + AvroReadSupport.setAvroReadSchema(conf, projection); + final ParquetReader parquet = AvroParquetReader.builder(qualifiedPath(source)) + .disableCompatibility() + .withDataModel(GenericData.get()) + .withConf(conf) + .build(); + return new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + private boolean hasNext = false; + private D next = advance(); + + @Override + public boolean hasNext() { + return hasNext; + } + + @Override + public D next() { + if (!hasNext) { + throw new NoSuchElementException(); + } + D toReturn = next; + this.next = advance(); + return toReturn; + } + + private D advance() { + try { + D next = parquet.read(); + this.hasNext = (next != null); + return next; + } catch (IOException e) { + throw new RuntimeException( + "Failed while reading Parquet file: " + source, e); + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Remove is not supported"); + } + }; + } + }; + + case AVRO: + Iterable avroReader = (Iterable) DataFileReader.openReader( + openSeekable(source), new GenericDatumReader<>(projection)); + return avroReader; + + 
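+      // neither Parquet nor Avro: treat .json sources as Avro-style JSON,
+      // anything else as plain text lines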
default: + if (source.endsWith("json")) { + return new AvroJsonReader<>(open(source), projection); + } else { + Preconditions.checkArgument(projection == null, + "Cannot select columns from text files"); + Iterable text = CharStreams.readLines(new InputStreamReader(open(source))); + return text; + } + } + } + + protected Schema getAvroSchema(String source) throws IOException { + Formats.Format format; + try (SeekableInput in = openSeekable(source)) { + format = Formats.detectFormat((InputStream) in); + in.seek(0); + + switch (format) { + case PARQUET: + return Schemas.fromParquet(getConf(), qualifiedURI(source)); + case AVRO: + return Schemas.fromAvro(open(source)); + case TEXT: + if (source.endsWith("avsc")) { + return Schemas.fromAvsc(open(source)); + } else if (source.endsWith("json")) { + return Schemas.fromJSON("json", open(source)); + } + default: + } + + throw new IllegalArgumentException(String.format( + "Could not determine file format of %s.", source)); + } + } + +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Command.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Command.java new file mode 100644 index 0000000000..9c19143258 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Command.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli; + +import java.io.IOException; +import java.util.List; + +public interface Command { + /** + * Runs this {@code Command}. + * + * @return a return code for the process, 0 indicates success. + * @throws IOException + */ + int run() throws IOException; + + /** + * Returns a list of example uses. Lines starting with '#' will not have the + * executable name added when formatting. + * + * @return a list of String examples + */ + List getExamples(); +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/HadoopFileSystemURLStreamHandler.java b/parquet-cli/src/main/java/org/apache/parquet/cli/HadoopFileSystemURLStreamHandler.java new file mode 100644 index 0000000000..548544a0b7 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/HadoopFileSystemURLStreamHandler.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.net.URLConnection; +import java.net.URLStreamHandler; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + * A {@link URLStreamHandler} for handling Hadoop filesystem URLs, + * most commonly those with the hdfs scheme. + */ +public class HadoopFileSystemURLStreamHandler extends URLStreamHandler + implements Configurable { + + private static Configuration defaultConf = new Configuration(); + + public static Configuration getDefaultConf() { + return defaultConf; + } + + public static void setDefaultConf(Configuration defaultConf) { + HadoopFileSystemURLStreamHandler.defaultConf = defaultConf; + } + + private Configuration conf = defaultConf; + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + protected URLConnection openConnection(URL url) throws IOException { + return new HadoopFileSystemURLConnection(url); + } + + class HadoopFileSystemURLConnection extends URLConnection { + public HadoopFileSystemURLConnection(URL url) { + super(url); + } + @Override + public void connect() throws IOException { + } + @Override + public InputStream getInputStream() throws IOException { + Path path = new Path(url.toExternalForm()); + FileSystem fileSystem = path.getFileSystem(conf); + return fileSystem.open(path); + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Help.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Help.java new file mode 100644 index 0000000000..791d1693f5 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Help.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.cli; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.ParameterDescription; +import com.beust.jcommander.Parameters; +import com.google.common.collect.Lists; +import org.slf4j.Logger; +import java.util.List; + +@Parameters(commandDescription = "Retrieves details on the functions of other commands") +public class Help implements Command { + @Parameter(description = "") + List helpCommands = Lists.newArrayList(); + + private final JCommander jc; + private final Logger console; + private String programName; + + public Help(JCommander jc, Logger console) { + this.jc = jc; + this.console = console; + } + + public void setProgramName(String programName) { + this.programName = programName; + } + + @Override + public int run() { + if (helpCommands.isEmpty()) { + printGenericHelp(); + + } else { + for (String cmd : helpCommands) { + JCommander commander = jc.getCommands().get(cmd); + if (commander == null) { + console.error("\nUnknown command: {}\n", cmd); + printGenericHelp(); + return 1; + } + + boolean hasRequired = false; + console.info("\nUsage: {} [general options] {} {} [command options]", + new Object[] { + programName, cmd, + commander.getMainParameterDescription()}); + console.info("\n Description:"); + console.info("\n {}", jc.getCommandDescription(cmd)); + if (!commander.getParameters().isEmpty()) { + console.info("\n Command options:\n"); + for (ParameterDescription param : commander.getParameters()) { + hasRequired = printOption(console, param) || hasRequired; + } + if (hasRequired) { + console.info("\n * = required"); + } + } + List examples = ((Command) commander.getObjects().get(0)).getExamples(); + if (examples != null) { + console.info("\n Examples:"); + for (String example : examples) { + if (example.startsWith("#")) { + // comment + console.info("\n {}", example); + } else { + console.info(" {} {} {}", + new Object[] {programName, cmd, example}); + } + } + } + // add an extra newline in case there are more commands + console.info(""); + } + } + return 0; + } + + public void printGenericHelp() { + boolean hasRequired = false; + console.info( + "\nUsage: {} [options] [command] [command options]", + programName); + console.info("\n Options:\n"); + for (ParameterDescription param : jc.getParameters()) { + hasRequired = printOption(console, param) || hasRequired; + } + if (hasRequired) { + console.info("\n * = required"); + } + console.info("\n Commands:\n"); + for (String command : jc.getCommands().keySet()) { + console.info(" {}\n\t{}", + command, jc.getCommandDescription(command)); + } + console.info("\n Examples:"); + console.info("\n # print information for create\n {} help create", + programName); + console.info("\n See '{} help ' for more information on a " + + "specific command.", programName); + } + + private boolean printOption(Logger console, ParameterDescription param) { + boolean required = param.getParameter().required(); + if (!param.getParameter().hidden()) { + console.info(" {} {}\n\t{}{}", new Object[]{ + required ? "*" : " ", + param.getNames().trim(), + param.getDescription(), + formatDefault(param)}); + } + return required; + } + + private String formatDefault(ParameterDescription param) { + Object defaultValue = param.getDefault(); + if (defaultValue == null || param.getParameter().arity() < 1) { + return ""; + } + return " (default: " + ((defaultValue instanceof String) ? 
+ "\"" + defaultValue + "\"" : + defaultValue.toString()) + ")"; + } + + @Override + public List getExamples() { + return null; + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java new file mode 100644 index 0000000000..990193c731 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.cli; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.MissingCommandException; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.ParameterException; +import com.beust.jcommander.Parameters; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; +import org.apache.parquet.cli.commands.CSVSchemaCommand; +import org.apache.parquet.cli.commands.CatCommand; +import org.apache.parquet.cli.commands.CheckParquet251Command; +import org.apache.parquet.cli.commands.ConvertCSVCommand; +import org.apache.parquet.cli.commands.ConvertCommand; +import org.apache.parquet.cli.commands.ParquetMetadataCommand; +import org.apache.parquet.cli.commands.SchemaCommand; +import org.apache.parquet.cli.commands.ShowDictionaryCommand; +import org.apache.parquet.cli.commands.ShowPagesCommand; +import org.apache.parquet.cli.commands.ToAvroCommand; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.log4j.Level; +import org.apache.log4j.PropertyConfigurator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.Set; + +@Parameters(commandDescription = "Parquet file utils") +public class Main extends Configured implements Tool { + + @Parameter(names = {"-v", "--verbose", "--debug"}, + description = "Print extra debugging information") + private boolean debug = false; + + @VisibleForTesting + @Parameter(names="--dollar-zero", + description="A way for the runtime path to be passed in", hidden=true) + String programName = DEFAULT_PROGRAM_NAME; + + @VisibleForTesting + static final String DEFAULT_PROGRAM_NAME = "parquet"; + + private static Set HELP_ARGS = ImmutableSet.of("-h", "-help", "--help", "help"); + + private final Logger console; + private final Help help; + + @VisibleForTesting + final JCommander jc; + + Main(Logger console) { + this.console = console; + this.jc = new JCommander(this); + this.help = new Help(jc, console); + jc.setProgramName(DEFAULT_PROGRAM_NAME); + jc.addCommand("help", help, "-h", "-help", "--help"); + 
jc.addCommand("meta", new ParquetMetadataCommand(console)); + jc.addCommand("pages", new ShowPagesCommand(console)); + jc.addCommand("dictionary", new ShowDictionaryCommand(console)); + jc.addCommand("check-stats", new CheckParquet251Command(console)); + jc.addCommand("schema", new SchemaCommand(console)); + jc.addCommand("csv-schema", new CSVSchemaCommand(console)); + jc.addCommand("convert-csv", new ConvertCSVCommand(console)); + jc.addCommand("convert", new ConvertCommand(console)); + jc.addCommand("to-avro", new ToAvroCommand(console)); + jc.addCommand("cat", new CatCommand(console, 0)); + jc.addCommand("head", new CatCommand(console, 10)); + } + + @Override + public int run(String[] args) throws Exception { + try { + jc.parse(args); + } catch (MissingCommandException e) { + console.error(e.getMessage()); + return 1; + } catch (ParameterException e) { + help.setProgramName(programName); + String cmd = jc.getParsedCommand(); + if (args.length == 1) { // i.e., just the command (missing required arguments) + help.helpCommands.add(cmd); + help.run(); + return 1; + } else { // check for variants like 'cmd --help' etc. + for (String arg : args) { + if (HELP_ARGS.contains(arg)) { + help.helpCommands.add(cmd); + help.run(); + return 0; + } + } + } + console.error(e.getMessage()); + return 1; + } + + help.setProgramName(programName); + + // configure log4j + if (debug) { + org.apache.log4j.Logger console = org.apache.log4j.Logger.getLogger(Main.class); + console.setLevel(Level.DEBUG); + } + + String parsed = jc.getParsedCommand(); + if (parsed == null) { + help.run(); + return 1; + } else if ("help".equals(parsed)) { + return help.run(); + } + + Command command = (Command) jc.getCommands().get(parsed).getObjects().get(0); + if (command == null) { + help.run(); + return 1; + } + + try { + if (command instanceof Configurable) { + ((Configurable) command).setConf(getConf()); + } + return command.run(); + } catch (IllegalArgumentException e) { + if (debug) { + console.error("Argument error", e); + } else { + console.error("Argument error: {}", e.getMessage()); + } + return 1; + } catch (IllegalStateException e) { + if (debug) { + console.error("State error", e); + } else { + console.error("State error: {}", e.getMessage()); + } + return 1; + } catch (Exception e) { + console.error("Unknown error", e); + return 1; + } + } + + public static void main(String[] args) throws Exception { + // reconfigure logging with the kite CLI configuration + PropertyConfigurator.configure( + Main.class.getResource("/cli-logging.properties")); + Logger console = LoggerFactory.getLogger(Main.class); + // use Log4j for any libraries using commons-logging + LogFactory.getFactory().setAttribute( + "org.apache.commons.logging.Log", + "org.apache.commons.logging.impl.Log4JLogger"); + int rc = ToolRunner.run(new Configuration(), new Main(console), args); + System.exit(rc); + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java new file mode 100644 index 0000000000..860a218d99 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java @@ -0,0 +1,335 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import org.apache.commons.codec.binary.Hex; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.EncodingStats; +import org.apache.parquet.column.statistics.BinaryStatistics; +import org.apache.parquet.column.statistics.BooleanStatistics; +import org.apache.parquet.column.statistics.DoubleStatistics; +import org.apache.parquet.column.statistics.FloatStatistics; +import org.apache.parquet.column.statistics.IntStatistics; +import org.apache.parquet.column.statistics.LongStatistics; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import java.nio.charset.StandardCharsets; +import java.util.Locale; +import java.util.Set; + +import static org.apache.parquet.column.Encoding.BIT_PACKED; +import static org.apache.parquet.column.Encoding.DELTA_BINARY_PACKED; +import static org.apache.parquet.column.Encoding.DELTA_BYTE_ARRAY; +import static org.apache.parquet.column.Encoding.PLAIN; +import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY; +import static org.apache.parquet.column.Encoding.RLE; +import static org.apache.parquet.column.Encoding.RLE_DICTIONARY; +import static org.apache.parquet.format.Encoding.DELTA_LENGTH_BYTE_ARRAY; + + +public class Util { + + private static final long KB = 1024; + private static final long MB = 1024 * KB; + private static final long GB = 1024 * MB; + private static final long TB = 1024 * GB; + + public static String humanReadable(float bytes) { + if (bytes > TB) { + return String.format("%.03f TB", bytes / TB); + } else if (bytes > GB) { + return String.format("%.03f GB", bytes / GB); + } else if (bytes > MB) { + return String.format("%.03f MB", bytes / MB); + } else if (bytes > KB) { + return String.format("%.03f kB", bytes / KB); + } else { + return String.format("%.02f B", bytes); + } + } + + public static String humanReadable(long bytes) { + if (bytes > TB) { + return String.format("%.03f TB", ((float) bytes) / TB); + } else if (bytes > GB) { + return String.format("%.03f GB", ((float) bytes) / GB); + } else if (bytes > MB) { + return String.format("%.03f MB", ((float) bytes) / MB); + } else if (bytes > KB) { + return String.format("%.03f kB", ((float) bytes) / KB); + } else { + return String.format("%d B", bytes); + } + } + + public static String minMaxAsString(Statistics stats, OriginalType annotation) { + if (stats == null) { + return "no stats"; + } + if (!stats.hasNonNullValue()) { + return ""; + } + // TODO: use original types 
when showing decimal, timestamp, etc. + if (stats instanceof BooleanStatistics) { + return String.format("%s / %s", + ((BooleanStatistics) stats).getMin(), + ((BooleanStatistics) stats).getMax()); + } else if (stats instanceof IntStatistics) { + return String.format("%d / %d", + ((IntStatistics) stats).getMin(), + ((IntStatistics) stats).getMax()); + } else if (stats instanceof LongStatistics) { + return String.format("%d / %d", + ((LongStatistics) stats).getMin(), + ((LongStatistics) stats).getMax()); + } else if (stats instanceof FloatStatistics) { + return String.format("%f / %f", + ((FloatStatistics) stats).getMin(), + ((FloatStatistics) stats).getMax()); + } else if (stats instanceof DoubleStatistics) { + return String.format("%f / %f", + ((DoubleStatistics) stats).getMin(), + ((DoubleStatistics) stats).getMax()); + } else if (stats instanceof BinaryStatistics) { + byte[] minBytes = stats.getMinBytes(); + byte[] maxBytes = stats.getMaxBytes(); + return String.format("%s / %s", + printable(minBytes, annotation == OriginalType.UTF8, 30), + printable(maxBytes, annotation == OriginalType.UTF8, 30)); + } else { + throw new RuntimeException("Unknown stats type: " + stats); + } + } + + public static String toString(Statistics stats, long count, OriginalType annotation) { + if (stats == null) { + return "no stats"; + } + // TODO: use original types when showing decimal, timestamp, etc. + if (stats instanceof BooleanStatistics) { + return String.format("nulls: %d/%d", stats.getNumNulls(), count); + } else if (stats instanceof IntStatistics) { + return String.format("min: %d max: %d nulls: %d/%d", + ((IntStatistics) stats).getMin(), ((IntStatistics) stats).getMax(), + stats.getNumNulls(), count); + } else if (stats instanceof LongStatistics) { + return String.format("min: %d max: %d nulls: %d/%d", + ((LongStatistics) stats).getMin(), ((LongStatistics) stats).getMax(), + stats.getNumNulls(), count); + } else if (stats instanceof FloatStatistics) { + return String.format("min: %f max: %f nulls: %d/%d", + ((FloatStatistics) stats).getMin(), + ((FloatStatistics) stats).getMax(), + stats.getNumNulls(), count); + } else if (stats instanceof DoubleStatistics) { + return String.format("min: %f max: %f nulls: %d/%d", + ((DoubleStatistics) stats).getMin(), + ((DoubleStatistics) stats).getMax(), + stats.getNumNulls(), count); + } else if (stats instanceof BinaryStatistics) { + byte[] minBytes = stats.getMinBytes(); + byte[] maxBytes = stats.getMaxBytes(); + return String.format("min: %s max: %s nulls: %d/%d", + printable(minBytes, annotation == OriginalType.UTF8, 30), + printable(maxBytes, annotation == OriginalType.UTF8, 30), + stats.getNumNulls(), count); + } else { + throw new RuntimeException("Unknown stats type: " + stats); + } + } + + private static String printable(byte[] bytes, boolean isUtf8, int len) { + if (bytes == null) { + return "null"; + } else if (isUtf8) { + return humanReadable(new String(bytes, StandardCharsets.UTF_8), len); + } else { + return humanReadable(bytes, len); + } + } + + public static String humanReadable(String str, int len) { + if (str == null) { + return "null"; + } + + StringBuilder sb = new StringBuilder(); + sb.append("\""); + if (str.length() > len - 2) { + sb.append(str.substring(0, len - 5)).append("..."); + } else { + sb.append(str); + } + sb.append("\""); + + return sb.toString(); + } + + public static String humanReadable(byte[] bytes, int len) { + if (bytes == null || bytes.length == 0) { + return "null"; + } + + StringBuilder sb = new StringBuilder(); + String 
asString = Hex.encodeHexString(bytes); + sb.append("0x"); + if (asString.length() > len - 2) { + sb.append(asString.substring(0, (len - 5) / 2)).append("..."); + } else { + sb.append(asString); + } + + return sb.toString(); + } + + public static String shortCodec(CompressionCodecName codec) { + switch (codec) { + case UNCOMPRESSED: + return "_"; + case SNAPPY: + return "S"; + case GZIP: + return "G"; + case LZO: + return "L"; + default: + return "?"; + } + } + + public static String encodingAsString(Encoding encoding, boolean isDict) { + switch (encoding) { + case PLAIN: + return "_"; + case PLAIN_DICTIONARY: + // data pages use RLE, dictionary pages use plain + return isDict ? "_" : "R"; + case RLE_DICTIONARY: + return "R"; + case DELTA_BINARY_PACKED: + case DELTA_LENGTH_BYTE_ARRAY: + case DELTA_BYTE_ARRAY: + return "D"; + default: + return "?"; + } + } + + public static String encodingStatsAsString(EncodingStats encodingStats) { + StringBuilder sb = new StringBuilder(); + if (encodingStats.hasDictionaryPages()) { + for (Encoding encoding: encodingStats.getDictionaryEncodings()) { + sb.append(encodingAsString(encoding, true)); + } + sb.append(" "); + } else { + sb.append(" "); + } + + Set encodings = encodingStats.getDataEncodings(); + if (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY)) { + sb.append("R"); + } + if (encodings.contains(PLAIN)) { + sb.append("_"); + } + if (encodings.contains(DELTA_BYTE_ARRAY) || + encodings.contains(DELTA_BINARY_PACKED) || + encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) { + sb.append("D"); + } + + // Check for fallback and add a flag + if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) { + sb.append(" F"); + } + + return sb.toString(); + } + + public static String encodingsAsString(Set encodings, ColumnDescriptor desc) { + StringBuilder sb = new StringBuilder(); + if (encodings.contains(RLE) || encodings.contains(BIT_PACKED)) { + sb.append(desc.getMaxDefinitionLevel() == 0 ? "B" : "R"); + sb.append(desc.getMaxRepetitionLevel() == 0 ? 
"B" : "R"); + if (encodings.contains(PLAIN_DICTIONARY)) { + sb.append("R"); + } + if (encodings.contains(PLAIN)) { + sb.append("_"); + } + } else { + sb.append("RR"); + if (encodings.contains(RLE_DICTIONARY)) { + sb.append("R"); + } + if (encodings.contains(PLAIN)) { + sb.append("_"); + } + if (encodings.contains(DELTA_BYTE_ARRAY) || + encodings.contains(DELTA_BINARY_PACKED) || + encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) { + sb.append("D"); + } + } + return sb.toString(); + } + + private static final Splitter DOT = Splitter.on('.'); + + public static ColumnDescriptor descriptor(String column, MessageType schema) { + String[] path = Iterables.toArray(DOT.split(column), String.class); + Preconditions.checkArgument(schema.containsPath(path), + "Schema doesn't have column: " + column); + return schema.getColumnDescription(path); + } + + public static String columnName(ColumnDescriptor desc) { + return Joiner.on('.').join(desc.getPath()); + } + + public static PrimitiveType primitive(MessageType schema, String[] path) { + Type current = schema; + for (String part : path) { + current = current.asGroupType().getType(part); + if (current.isPrimitive()) { + return current.asPrimitiveType(); + } + } + return null; + } + + public static PrimitiveType primitive(String column, MessageType schema) { + String[] path = Iterables.toArray(DOT.split(column), String.class); + Preconditions.checkArgument(schema.containsPath(path), + "Schema doesn't have column: " + column); + return primitive(schema, path); + } + +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CSVSchemaCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CSVSchemaCommand.java new file mode 100644 index 0000000000..4fbfb9b3db --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CSVSchemaCommand.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.beust.jcommander.internal.Lists; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.cli.csv.CSVProperties; +import org.apache.parquet.cli.csv.AvroCSV; +import org.slf4j.Logger; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.List; +import java.util.Set; + +@Parameters(commandDescription="Build a schema from a CSV data sample") +public class CSVSchemaCommand extends BaseCommand { + + public CSVSchemaCommand(Logger console) { + super(console); + } + + @Parameter(description="") + List samplePaths; + + @Parameter(names={"-o", "--output"}, description="Save schema avsc to path") + String outputPath = null; + + @Parameter(names={"--class", "--record-name"}, required = true, + description="A name or class for the result schema") + String recordName = null; + + @Parameter(names="--minimize", + description="Minimize schema file size by eliminating white space") + boolean minimize=false; + + @Parameter(names="--delimiter", description="Delimiter character") + String delimiter = ","; + + @Parameter(names="--escape", description="Escape character") + String escape = "\\"; + + @Parameter(names="--quote", description="Quote character") + String quote = "\""; + + @Parameter(names="--no-header", description="Don't use first line as CSV header") + boolean noHeader = false; + + @Parameter(names="--skip-lines", description="Lines to skip before CSV start") + int linesToSkip = 0; + + @Parameter(names="--charset", description="Character set name", hidden = true) + String charsetName = Charset.defaultCharset().displayName(); + + @Parameter(names="--header", + description="Line to use as a header. 
Must match the CSV settings.") + String header; + + @Parameter(names="--require", + description="Do not allow null values for the given field") + List requiredFields; + + @Override + public int run() throws IOException { + Preconditions.checkArgument(samplePaths != null && !samplePaths.isEmpty(), + "Sample CSV path is required"); + Preconditions.checkArgument(samplePaths.size() == 1, + "Only one CSV sample can be given"); + + if (header != null) { + // if a header is given on the command line, do assume one is in the file + noHeader = true; + } + + CSVProperties props = new CSVProperties.Builder() + .delimiter(delimiter) + .escape(escape) + .quote(quote) + .header(header) + .hasHeader(!noHeader) + .linesToSkip(linesToSkip) + .charset(charsetName) + .build(); + + Set required = ImmutableSet.of(); + if (requiredFields != null) { + required = ImmutableSet.copyOf(requiredFields); + } + + // assume fields are nullable by default, users can easily change this + String sampleSchema = AvroCSV + .inferNullableSchema( + recordName, open(samplePaths.get(0)), props, required) + .toString(!minimize); + + output(sampleSchema, console, outputPath); + + return 0; + } + + @Override + public List getExamples() { + return Lists.newArrayList( + "# Print the schema for samples.csv to standard out:", + "samples.csv --record-name Sample", + "# Write schema to sample.avsc:", + "samples.csv -o sample.avsc --record-name Sample" + ); + } + +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CatCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CatCommand.java new file mode 100644 index 0000000000..7703e88ca0 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CatCommand.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.io.Closeables; +import org.apache.avro.Schema; +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.cli.util.Expressions; +import org.slf4j.Logger; +import java.io.Closeable; +import java.io.IOException; +import java.util.List; + +import static org.apache.parquet.cli.util.Expressions.select; + +@Parameters(commandDescription = "Print the first N records from a file") +public class CatCommand extends BaseCommand { + + @Parameter(description = "") + List sourceFiles; + + @Parameter(names={"-n", "--num-records"}, + description="The number of records to print") + long numRecords; + + @Parameter( + names = {"-c", "--column", "--columns"}, + description = "List of columns") + List columns; + + public CatCommand(Logger console, long defaultNumRecords) { + super(console); + this.numRecords = defaultNumRecords; + } + + @Override + public int run() throws IOException { + Preconditions.checkArgument( + sourceFiles != null && !sourceFiles.isEmpty(), + "Missing file name"); + Preconditions.checkArgument(sourceFiles.size() == 1, + "Only one file can be given"); + + final String source = sourceFiles.get(0); + + Schema schema = getAvroSchema(source); + Schema projection = Expressions.filterSchema(schema, columns); + + Iterable reader = openDataFile(source, projection); + boolean threw = true; + long count = 0; + try { + for (Object record : reader) { + if (numRecords > 0 && count >= numRecords) { + break; + } + if (columns == null || columns.size() != 1) { + console.info(String.valueOf(record)); + } else { + console.info(String.valueOf(select(projection, record, columns.get(0)))); + } + count += 1; + } + threw = false; + } catch (RuntimeException e) { + throw new RuntimeException("Failed on record " + count, e); + } finally { + if (reader instanceof Closeable) { + Closeables.close((Closeable) reader, threw); + } + } + + return 0; + } + + @Override + public List getExamples() { + return Lists.newArrayList( + "# Show the first 10 records in file \"data.avro\":", + "data.avro", + "# Show the first 50 records in file \"data.parquet\":", + "data.parquet -n 50" + ); + } +} + diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java new file mode 100644 index 0000000000..8f6082122b --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.beust.jcommander.internal.Lists; +import com.google.common.base.Predicate; +import com.google.common.collect.Iterables; +import org.apache.parquet.cli.BaseCommand; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.CorruptStatistics; +import org.apache.parquet.Version; +import org.apache.parquet.VersionParser; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.util.DynConstructors; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.ColumnReader; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.io.api.PrimitiveConverter; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeNameConverter; +import org.slf4j.Logger; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; + +@Parameters(commandDescription = "Check Parquet files for corrupt page and column stats (PARQUET-251)") +public class CheckParquet251Command extends BaseCommand { + + public CheckParquet251Command(Logger console) { + super(console); + } + + @Parameter(description = "", required = true) + List files; + + @Override + public int run() throws IOException { + boolean badFiles = false; + for (String file : files) { + String problem = check(file); + if (problem != null) { + badFiles = true; + console.info("{} has corrupt stats: {}", file, problem); + } else { + console.info("{} has no corrupt stats", file); + } + } + + return badFiles ? 
1 : 0; + } + + private String check(String file) throws IOException { + Path path = qualifiedPath(file); + ParquetMetadata footer = ParquetFileReader.readFooter( + getConf(), path, ParquetMetadataConverter.NO_FILTER); + + FileMetaData meta = footer.getFileMetaData(); + String createdBy = meta.getCreatedBy(); + if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) { + // create fake metadata that will read corrupt stats and return them + FileMetaData fakeMeta = new FileMetaData( + meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION); + + // get just the binary columns + List columns = Lists.newArrayList(); + Iterables.addAll(columns, Iterables.filter( + meta.getSchema().getColumns(), + new Predicate() { + @Override + public boolean apply(@Nullable ColumnDescriptor input) { + return input != null && input.getType() == BINARY; + } + })); + + // now check to see if the data is actually corrupt + ParquetFileReader reader = new ParquetFileReader(getConf(), + fakeMeta, path, footer.getBlocks(), columns); + + try { + PageStatsValidator validator = new PageStatsValidator(); + for (PageReadStore pages = reader.readNextRowGroup(); pages != null; + pages = reader.readNextRowGroup()) { + validator.validate(columns, pages); + } + } catch (BadStatsException e) { + return e.getMessage(); + } + } + + return null; + } + + @Override + public List getExamples() { + return Arrays.asList( + "# Check file1.parquet for corrupt page and column stats", + "file1.parquet"); + } + + + public static class BadStatsException extends RuntimeException { + public BadStatsException(String message) { + super(message); + } + } + + public class SingletonPageReader implements PageReader { + private final DictionaryPage dict; + private final DataPage data; + + public SingletonPageReader(DictionaryPage dict, DataPage data) { + this.dict = dict; + this.data = data; + } + + @Override + public DictionaryPage readDictionaryPage() { + return dict; + } + + @Override + public long getTotalValueCount() { + return data.getValueCount(); + } + + @Override + public DataPage readPage() { + return data; + } + } + + private static > + Statistics getStatisticsFromPageHeader(DataPage page) { + return page.accept(new DataPage.Visitor>() { + @Override + @SuppressWarnings("unchecked") + public Statistics visit(DataPageV1 dataPageV1) { + return (Statistics) dataPageV1.getStatistics(); + } + + @Override + @SuppressWarnings("unchecked") + public Statistics visit(DataPageV2 dataPageV2) { + return (Statistics) dataPageV2.getStatistics(); + } + }); + } + + private class StatsValidator> { + private final boolean hasNonNull; + private final T min; + private final T max; + + public StatsValidator(DataPage page) { + Statistics stats = getStatisticsFromPageHeader(page); + this.hasNonNull = stats.hasNonNullValue(); + if (hasNonNull) { + this.min = stats.genericGetMin(); + this.max = stats.genericGetMax(); + } else { + this.min = null; + this.max = null; + } + } + + public void validate(T value) { + if (hasNonNull) { + if (min.compareTo(value) > 0) { + throw new BadStatsException("Min should be <= all values."); + } + if (max.compareTo(value) < 0) { + throw new BadStatsException("Max should be >= all values."); + } + } + } + } + + private PrimitiveConverter getValidatingConverter( + final DataPage page, PrimitiveTypeName type) { + return type.convert(new PrimitiveTypeNameConverter() { + @Override + public PrimitiveConverter convertFLOAT(PrimitiveTypeName primitiveTypeName) { + final StatsValidator validator = new StatsValidator(page); + 
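+        // wrap the validator in a converter so each float read from the page
+        // is checked against the page's min/max stats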
+        return new PrimitiveConverter() {
+          @Override
+          public void addFloat(float value) {
+            validator.validate(value);
+          }
+        };
+      }
+
+      @Override
+      public PrimitiveConverter convertDOUBLE(PrimitiveTypeName primitiveTypeName) {
+        final StatsValidator<Double> validator = new StatsValidator<>(page);
+        return new PrimitiveConverter() {
+          @Override
+          public void addDouble(double value) {
+            validator.validate(value);
+          }
+        };
+      }
+
+      @Override
+      public PrimitiveConverter convertINT32(PrimitiveTypeName primitiveTypeName) {
+        final StatsValidator<Integer> validator = new StatsValidator<>(page);
+        return new PrimitiveConverter() {
+          @Override
+          public void addInt(int value) {
+            validator.validate(value);
+          }
+        };
+      }
+
+      @Override
+      public PrimitiveConverter convertINT64(PrimitiveTypeName primitiveTypeName) {
+        final StatsValidator<Long> validator = new StatsValidator<>(page);
+        return new PrimitiveConverter() {
+          @Override
+          public void addLong(long value) {
+            validator.validate(value);
+          }
+        };
+      }
+
+      @Override
+      public PrimitiveConverter convertBOOLEAN(PrimitiveTypeName primitiveTypeName) {
+        final StatsValidator<Boolean> validator = new StatsValidator<>(page);
+        return new PrimitiveConverter() {
+          @Override
+          public void addBoolean(boolean value) {
+            validator.validate(value);
+          }
+        };
+      }
+
+      @Override
+      public PrimitiveConverter convertINT96(PrimitiveTypeName primitiveTypeName) {
+        return convertBINARY(primitiveTypeName);
+      }
+
+      @Override
+      public PrimitiveConverter convertFIXED_LEN_BYTE_ARRAY(PrimitiveTypeName primitiveTypeName) {
+        return convertBINARY(primitiveTypeName);
+      }
+
+      @Override
+      public PrimitiveConverter convertBINARY(PrimitiveTypeName primitiveTypeName) {
+        final StatsValidator<Binary> validator = new StatsValidator<>(page);
+        return new PrimitiveConverter() {
+          @Override
+          public void addBinary(Binary value) {
+            validator.validate(value);
+          }
+        };
+      }
+    });
+  }
+
+  private static final DynConstructors.Ctor<ColumnReader> COL_READER_CTOR =
+      new DynConstructors.Builder(ColumnReader.class)
+          .hiddenImpl("org.apache.parquet.column.impl.ColumnReaderImpl",
+              ColumnDescriptor.class, PageReader.class,
+              PrimitiveConverter.class, VersionParser.ParsedVersion.class)
+          .build();
+
+  public class PageStatsValidator {
+    public void validate(List<ColumnDescriptor> columns, PageReadStore store) {
+      for (ColumnDescriptor desc : columns) {
+        PageReader reader = store.getPageReader(desc);
+        DictionaryPage dict = reader.readDictionaryPage();
+        DictionaryPage reusableDict = null;
+        if (dict != null) {
+          try {
+            reusableDict = new DictionaryPage(
+                BytesInput.from(dict.getBytes().toByteArray()),
+                dict.getDictionarySize(), dict.getEncoding());
+          } catch (IOException e) {
+            throw new ParquetDecodingException("Cannot read dictionary", e);
+          }
+        }
+        DataPage page;
+        while ((page = reader.readPage()) != null) {
+          validateStatsForPage(page, reusableDict, desc);
+        }
+      }
+    }
+
+    private void validateStatsForPage(DataPage page, DictionaryPage dict,
+                                      ColumnDescriptor desc) {
+      SingletonPageReader reader = new SingletonPageReader(dict, page);
+      PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
+      Statistics<?> stats = getStatisticsFromPageHeader(page);
+
+      long numNulls = 0;
+
+      ColumnReader column = COL_READER_CTOR.newInstance(desc, reader, converter, null);
+      for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
+        if (column.getCurrentDefinitionLevel() >= desc.getMaxDefinitionLevel()) {
+          column.writeCurrentValueToConverter();
+        } else {
+          numNulls += 1;
+        }
+        column.consume();
+      }
+
+      if (numNulls != stats.getNumNulls()) {
+        throw new BadStatsException("Number of nulls doesn't match.");
+      }
+
+      console.debug(String.format(
+          "Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
+          String.valueOf(stats.genericGetMin()),
+          String.valueOf(stats.genericGetMax()), stats.getNumNulls(), page,
+          Arrays.toString(desc.getPath())));
+    }
+  }
+}
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java
new file mode 100644
index 0000000000..624ba91bf7
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCSVCommand.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.cli.commands;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.cli.csv.AvroCSVReader;
+import org.apache.parquet.cli.csv.CSVProperties;
+import org.apache.parquet.cli.csv.AvroCSV;
+import org.apache.parquet.cli.util.Schemas;
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.parquet.avro.AvroParquetWriter;
+import org.apache.parquet.cli.util.Codecs;
+import org.apache.parquet.hadoop.ParquetFileWriter;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.slf4j.Logger;
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.List;
+import java.util.Set;
+
+import static org.apache.avro.generic.GenericData.Record;
+import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
+import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0;
+
+@Parameters(commandDescription="Create a file from CSV data")
+public class ConvertCSVCommand extends BaseCommand {
+
+  public ConvertCSVCommand(Logger console) {
+    super(console);
+  }
+
+  @Parameter(description="")
+  List<String> targets;
+
+  @Parameter(
+      names={"-o", "--output"},
+      description="Output file path",
+      required=true)
+  String outputPath = null;
+
+  @Parameter(
+      names={"-2", "--format-version-2", "--writer-version-2"},
+      description="Use Parquet format version 2",
+      hidden = true)
+  boolean v2 = false;
+
+  @Parameter(names="--delimiter", description="Delimiter character")
+  String delimiter = ",";
+
+  @Parameter(names="--escape", description="Escape character")
+  String escape = "\\";
+
+  @Parameter(names="--quote", description="Quote character")
+  String quote = "\"";
+
+  @Parameter(names="--no-header", description="Don't use first line as CSV header")
boolean noHeader = false; + + @Parameter(names="--skip-lines", description="Lines to skip before CSV start") + int linesToSkip = 0; + + @Parameter(names="--charset", description="Character set name", hidden = true) + String charsetName = Charset.defaultCharset().displayName(); + + @Parameter(names="--header", + description="Line to use as a header. Must match the CSV settings.") + String header; + + @Parameter(names="--require", + description="Do not allow null values for the given field") + List requiredFields; + + @Parameter(names = {"-s", "--schema"}, + description = "The file containing the Avro schema.") + String avroSchemaFile; + + @Parameter(names = {"--compression-codec"}, + description = "A compression codec name.") + String compressionCodecName = "GZIP"; + + @Parameter(names="--row-group-size", description="Target row group size") + int rowGroupSize = ParquetWriter.DEFAULT_BLOCK_SIZE; + + @Parameter(names="--page-size", description="Target page size") + int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + + @Parameter(names="--dictionary-size", description="Max dictionary page size") + int dictionaryPageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + + @Parameter( + names={"--overwrite"}, + description="Remove any data already in the target view or dataset") + boolean overwrite = false; + + @Override + @SuppressWarnings("unchecked") + public int run() throws IOException { + Preconditions.checkArgument(targets != null && targets.size() == 1, + "CSV path is required."); + + if (header != null) { + // if a header is given on the command line, don't assume one is in the file + noHeader = true; + } + + CSVProperties props = new CSVProperties.Builder() + .delimiter(delimiter) + .escape(escape) + .quote(quote) + .header(header) + .hasHeader(!noHeader) + .linesToSkip(linesToSkip) + .charset(charsetName) + .build(); + + String source = targets.get(0); + + Schema csvSchema; + if (avroSchemaFile != null) { + csvSchema = Schemas.fromAvsc(open(avroSchemaFile)); + } else { + Set required = ImmutableSet.of(); + if (requiredFields != null) { + required = ImmutableSet.copyOf(requiredFields); + } + + String filename = new File(source).getName(); + String recordName; + if (filename.contains(".")) { + recordName = filename.substring(0, filename.indexOf(".")); + } else { + recordName = filename; + } + + csvSchema = AvroCSV.inferNullableSchema( + recordName, open(source), props, required); + } + + long count = 0; + try (AvroCSVReader reader = new AvroCSVReader<>( + open(source), props, csvSchema, Record.class, true)) { + CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName); + try (ParquetWriter writer = AvroParquetWriter + .builder(qualifiedPath(outputPath)) + .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0) + .withWriteMode(overwrite ? 
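+              // Illustrative note (not in the original patch): Mode.OVERWRITE replaces
+              // an existing output file, while the default Mode.CREATE fails fast when
+              // the target already exists, so --overwrite is needed to replace output.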
+ ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE) + .withCompressionCodec(codec) + .withDictionaryEncoding(true) + .withDictionaryPageSize(dictionaryPageSize) + .withPageSize(pageSize) + .withRowGroupSize(rowGroupSize) + .withDataModel(GenericData.get()) + .withConf(getConf()) + .withSchema(csvSchema) + .build()) { + for (Record record : reader) { + writer.write(record); + } + } catch (RuntimeException e) { + throw new RuntimeException("Failed on record " + count, e); + } + } + + return 0; + } + + @Override + public List getExamples() { + return Lists.newArrayList( + "# Create a Parquet file from a CSV file", + "sample.csv sample.parquet --schema schema.avsc", + "# Create a Parquet file in HDFS from local CSV", + "path/to/sample.csv hdfs:/user/me/sample.parquet --schema schema.avsc", + "# Create an Avro file from CSV data in S3", + "s3:/data/path/sample.csv sample.avro --format avro --schema s3:/schemas/schema.avsc" + ); + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java new file mode 100644 index 0000000000..7f828748e6 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ConvertCommand.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.io.Closeables; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.cli.util.Codecs; +import org.apache.parquet.cli.util.Schemas; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.slf4j.Logger; +import java.io.Closeable; +import java.io.IOException; +import java.util.List; + +import static org.apache.avro.generic.GenericData.Record; +import static org.apache.parquet.cli.util.Expressions.filterSchema; +import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0; +import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; + +@Parameters(commandDescription="Create a Parquet file from a data file") +public class ConvertCommand extends BaseCommand { + + public ConvertCommand(Logger console) { + super(console); + } + + @Parameter(description = "") + List targets; + + @Parameter( + names={"-o", "--output"}, + description="Output file path", + required=true) + String outputPath = null; + + @Parameter(names = {"-s", "--schema"}, + description = "The file containing the Avro schema.") + String avroSchemaFile; + + @Parameter( + names = {"-c", "--column", "--columns"}, + description = "List of columns") + List columns; + + @Parameter(names = {"--compression-codec"}, + description = "A compression codec name.") + String compressionCodecName = "GZIP"; + + @Parameter( + names={"--overwrite"}, + description="Overwrite the output file if it exists") + boolean overwrite = false; + + @Parameter( + names={"-2", "--format-version-2", "--writer-version-2"}, + description="Use Parquet format version 2", + hidden = true) + boolean v2 = false; + + @Parameter(names="--row-group-size", description="Target row group size") + int rowGroupSize = ParquetWriter.DEFAULT_BLOCK_SIZE; + + @Parameter(names="--page-size", description="Target page size") + int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + + @Parameter(names="--dictionary-size", description="Max dictionary page size") + int dictionaryPageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + + @Override + @SuppressWarnings("unchecked") + public int run() throws IOException { + Preconditions.checkArgument(targets != null && targets.size() == 1, + "A data file is required."); + + String source = targets.get(0); + + CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName); + + Schema schema; + if (avroSchemaFile != null) { + schema = Schemas.fromAvsc(open(avroSchemaFile)); + } else { + schema = getAvroSchema(source); + } + Schema projection = filterSchema(schema, columns); + + Path outPath = qualifiedPath(outputPath); + FileSystem outFS = outPath.getFileSystem(getConf()); + if (overwrite && outFS.exists(outPath)) { + console.debug("Deleting output file {} (already exists)", outPath); + outFS.delete(outPath); + } + + Iterable reader = openDataFile(source, projection); + boolean threw = true; + long count = 0; + try { + try (ParquetWriter writer = AvroParquetWriter + .builder(qualifiedPath(outputPath)) + .withWriterVersion(v2 ? 
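+            // Illustrative note (assumption about intent): the hidden
+            // -2/--format-version-2 flag selects the Parquet v2 writer format;
+            // the default stays PARQUET_1_0, the safer choice for older readers.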
PARQUET_2_0 : PARQUET_1_0) + .withConf(getConf()) + .withCompressionCodec(codec) + .withRowGroupSize(rowGroupSize) + .withDictionaryPageSize(dictionaryPageSize < 64 ? 64 : dictionaryPageSize) + .withDictionaryEncoding(dictionaryPageSize != 0) + .withPageSize(pageSize) + .withDataModel(GenericData.get()) + .withSchema(projection) + .build()) { + for (Record record : reader) { + writer.write(record); + count += 1; + } + } + threw = false; + } catch (RuntimeException e) { + throw new RuntimeException("Failed on record " + count, e); + } finally { + if (reader instanceof Closeable) { + Closeables.close((Closeable) reader, threw); + } + } + + return 0; + } + + @Override + public List getExamples() { + return Lists.newArrayList( + "# Create a Parquet file from an Avro file", + "sample.avro -o sample.parquet", + "# Create a Parquet file in S3 from a local Avro file", + "path/to/sample.avro -o s3:/user/me/sample.parquet", + "# Create a Parquet file from Avro data in S3", + "s3:/data/path/sample.avro -o sample.parquet" + ); + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java new file mode 100644 index 0000000000..0bd77a30a8 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Function; +import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import org.apache.parquet.cli.BaseCommand; +import org.apache.commons.lang.StringUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.EncodingStats; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.slf4j.Logger; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.parquet.cli.Util.encodingStatsAsString; +import static org.apache.parquet.cli.Util.encodingsAsString; +import static org.apache.parquet.cli.Util.humanReadable; +import static org.apache.parquet.cli.Util.minMaxAsString; +import static org.apache.parquet.cli.Util.primitive; +import static org.apache.parquet.cli.Util.shortCodec; + +@Parameters(commandDescription="Print a Parquet file's metadata") +public class ParquetMetadataCommand extends BaseCommand { + + public ParquetMetadataCommand(Logger console) { + super(console); + } + + @Parameter(description = "") + List targets; + + @Override + @SuppressWarnings("unchecked") + public int run() throws IOException { + Preconditions.checkArgument(targets != null && targets.size() >= 1, + "A Parquet file is required."); + Preconditions.checkArgument(targets.size() == 1, + "Cannot process multiple Parquet files."); + + String source = targets.get(0); + ParquetMetadata footer = ParquetFileReader.readFooter( + getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER); + + console.info("\nFile path: {}", source); + console.info("Created by: {}", footer.getFileMetaData().getCreatedBy()); + + Map kv = footer.getFileMetaData().getKeyValueMetaData(); + if (kv != null && !kv.isEmpty()) { + console.info("Properties:"); + String format = " %" + maxSize(kv.keySet()) + "s: %s"; + for (Map.Entry entry : kv.entrySet()) { + console.info(String.format(format, entry.getKey(), entry.getValue())); + } + } else { + console.info("Properties: (none)"); + } + + MessageType schema = footer.getFileMetaData().getSchema(); + console.info("Schema:\n{}", schema); + + List rowGroups = footer.getBlocks(); + for (int index = 0, n = rowGroups.size(); index < n; index += 1) { + printRowGroup(console, index, rowGroups.get(index), schema); + } + + console.info(""); + + return 0; + } + + @Override + public List getExamples() { + return Lists.newArrayList( + ); + } + + private int maxSize(Iterable strings) { + int size = 0; + for (String s : strings) { + size = Math.max(size, s.length()); + } + return size; + } + + private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) { + long start = rowGroup.getStartingPos(); + long rowCount = rowGroup.getRowCount(); + long compressedSize = rowGroup.getCompressedSize(); + 
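+    // Worked example (hypothetical numbers): a row group of 1,000,000 records
+    // compressed to ~12 MB prints a summary roughly like
+    //   Row group 0: count: 1000000 12.58 B records start: 4 total: 12.000 MB
+    // since the per-record figure is compressedSize / rowCount via humanReadable().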
long uncompressedSize = rowGroup.getTotalByteSize(); + String filePath = rowGroup.getPath(); + + console.info(String.format("\nRow group %d: count: %d %s records start: %d total: %s%s\n%s", + index, rowCount, + humanReadable(((float) compressedSize) / rowCount), + start, humanReadable(compressedSize), + filePath != null ? " path: " + filePath : "", + StringUtils.leftPad("", 80, '-'))); + + int size = maxSize(Iterables.transform(rowGroup.getColumns(), + new Function() { + @Override + public String apply(@Nullable ColumnChunkMetaData input) { + return input == null ? "" : input.getPath().toDotString(); + } + })); + + console.info(String.format("%-" + size + "s %-9s %-9s %-9s %-10s %-7s %s", + "", "type", "encodings", "count", "avg size", "nulls", "min / max")); + for (ColumnChunkMetaData column : rowGroup.getColumns()) { + printColumnChunk(console, size, column, schema); + } + } + + private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) { + String[] path = column.getPath().toArray(); + PrimitiveType type = primitive(schema, path); + Preconditions.checkNotNull(type); + + ColumnDescriptor desc = schema.getColumnDescription(path); + long size = column.getTotalSize(); + long count = column.getValueCount(); + float perValue = ((float) size) / count; + CompressionCodecName codec = column.getCodec(); + Set encodings = column.getEncodings(); + EncodingStats encodingStats = column.getEncodingStats(); + String encodingSummary = encodingStats == null ? + encodingsAsString(encodings, desc) : + encodingStatsAsString(encodingStats); + Statistics stats = column.getStatistics(); + + String name = column.getPath().toDotString(); + + PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName(); + if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) { + console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s", + name, type.getTypeLength(), shortCodec(codec), encodingSummary, count, + humanReadable(perValue), stats == null ? "" : String.valueOf(stats.getNumNulls()), + minMaxAsString(stats, type.getOriginalType()))); + } else { + console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s", + name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue), + stats == null ? "" : String.valueOf(stats.getNumNulls()), + minMaxAsString(stats, type.getOriginalType()))); + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/SchemaCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/SchemaCommand.java new file mode 100644 index 0000000000..ea2306f01a --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/SchemaCommand.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.parquet.cli.commands;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.cli.util.Formats;
+import org.apache.avro.file.SeekableInput;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.format.converter.ParquetMetadataConverter;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.slf4j.Logger;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+@Parameters(commandDescription="Print the Avro schema for a file")
+public class SchemaCommand extends BaseCommand {
+
+  public SchemaCommand(Logger console) {
+    super(console);
+  }
+
+  @Parameter(description = "")
+  List<String> targets;
+
+  @Parameter(
+      names={"-o", "--output"},
+      description="Output file path")
+  String outputPath = null;
+
+  @Parameter(
+      names={"--overwrite"},
+      description="Overwrite the output file if it exists")
+  boolean overwrite = false;
+
+  @Parameter(
+      names={"--parquet"},
+      description="Print a Parquet schema, without converting to Avro",
+      hidden=true)
+  boolean parquetSchema = false;
+
+  @Override
+  @SuppressWarnings("unchecked")
+  public int run() throws IOException {
+    // require at least one target; the multi-target branch below handles the rest
+    Preconditions.checkArgument(targets != null && targets.size() >= 1,
+        "A data file is required.");
+
+    if (targets.size() > 1) {
+      Preconditions.checkArgument(outputPath == null,
+          "Cannot output multiple schemas to file " + outputPath);
+      for (String source : targets) {
+        console.info("{}: {}", source, getSchema(source));
+      }
+
+    } else {
+      String source = targets.get(0);
+
+      if (outputPath != null) {
+        Path outPath = qualifiedPath(outputPath);
+        FileSystem outFS = outPath.getFileSystem(getConf());
+        if (overwrite && outFS.exists(outPath)) {
+          console.debug("Deleting output file {} (already exists)", outPath);
+          outFS.delete(outPath);
+        }
+
+        try (OutputStream out = create(outputPath)) {
+          out.write(getSchema(source).getBytes(StandardCharsets.UTF_8));
+        }
+      } else {
+        console.info(getSchema(source));
+      }
+    }
+
+    return 0;
+  }
+
+  @Override
+  public List<String> getExamples() {
+    return Lists.newArrayList(
+        "# Print the Avro schema for a Parquet file",
+        "sample.parquet",
+        "# Print the Avro schema for an Avro file",
+        "sample.avro",
+        "# Print the Avro schema for a JSON file",
+        "sample.json"
+    );
+  }
+
+  private String getSchema(String source) throws IOException {
+    if (parquetSchema) {
+      return getParquetSchema(source);
+    } else {
+      return getAvroSchema(source).toString(true);
+    }
+  }
+
+  private String getParquetSchema(String source) throws IOException {
+    Formats.Format format;
+    try (SeekableInput in = openSeekable(source)) {
+      format = Formats.detectFormat((InputStream) in);
+      in.seek(0);
+
+      switch (format) {
+        case PARQUET:
+          return new ParquetFileReader(
+              getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
+              .getFileMetaData().getSchema().toString();
+        default:
+          throw new IllegalArgumentException(String.format(
+              "Could not get a Parquet schema for format %s: %s", format, source));
+      }
+    }
+  }
+}
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowDictionaryCommand.java
b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowDictionaryCommand.java new file mode 100644 index 0000000000..db427c9c74 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowDictionaryCommand.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.cli.Util; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Dictionary; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.DictionaryPageReadStore; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.slf4j.Logger; +import java.io.IOException; +import java.util.List; + +// TODO: show dictionary size in values and in bytes +@Parameters(commandDescription="Print dictionaries for a Parquet column") +public class ShowDictionaryCommand extends BaseCommand { + + public ShowDictionaryCommand(Logger console) { + super(console); + } + + @Parameter(description = "") + List targets; + + @Parameter( + names = {"-c", "--column"}, + description = "Column path", + required = true) + String column; + + @Override + @SuppressWarnings("unchecked") + public int run() throws IOException { + Preconditions.checkArgument(targets != null && targets.size() >= 1, + "A Parquet file is required."); + Preconditions.checkArgument(targets.size() == 1, + "Cannot process multiple Parquet files."); + + String source = targets.get(0); + + ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source)); + MessageType schema = reader.getFileMetaData().getSchema(); + ColumnDescriptor descriptor = Util.descriptor(column, schema); + PrimitiveType type = Util.primitive(column, schema); + Preconditions.checkNotNull(type); + + DictionaryPageReadStore dictionaryReader; + int rowGroup = 0; + while ((dictionaryReader = reader.getNextDictionaryReader()) != null) { + DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor); + + Dictionary dict = page.getEncoding().initDictionary(descriptor, page); + + console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column, page.getCompressedSize()); + for (int i = 0; i <= dict.getMaxId(); i += 1) { + switch(type.getPrimitiveTypeName()) { + case BINARY: + if (type.getOriginalType() == OriginalType.UTF8) { + console.info("{}: {}", String.format("%6d", i), + 
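+              // (sketch) UTF8-annotated BINARY is decoded to a readable string; the
+              // 70 passed to humanReadable() is assumed to cap the rendered width.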
Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70)); + } else { + console.info("{}: {}", String.format("%6d", i), + Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70)); + } + break; + case INT32: + console.info("{}: {}", String.format("%6d", i), + dict.decodeToInt(i)); + break; + case INT64: + console.info("{}: {}", String.format("%6d", i), + dict.decodeToLong(i)); + break; + case FLOAT: + console.info("{}: {}", String.format("%6d", i), + dict.decodeToFloat(i)); + break; + case DOUBLE: + console.info("{}: {}", String.format("%6d", i), + dict.decodeToDouble(i)); + break; + default: + throw new IllegalArgumentException( + "Unknown dictionary type: " + type.getPrimitiveTypeName()); + } + } + + reader.skipNextRowGroup(); + + rowGroup += 1; + } + + console.info(""); + + return 0; + } + + @Override + public List getExamples() { + return Lists.newArrayList( + "# Show the dictionary for column 'col' from a Parquet file", + "-c col sample.parquet" + ); + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java new file mode 100644 index 0000000000..beda4529ed --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import org.apache.parquet.cli.BaseCommand; +import org.apache.commons.lang.StringUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.page.Page; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.slf4j.Logger; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.apache.parquet.cli.Util.columnName; +import static org.apache.parquet.cli.Util.descriptor; +import static org.apache.parquet.cli.Util.encodingAsString; +import static org.apache.parquet.cli.Util.humanReadable; +import static org.apache.parquet.cli.Util.minMaxAsString; +import static org.apache.parquet.cli.Util.primitive; +import static org.apache.parquet.cli.Util.shortCodec; + +@Parameters(commandDescription="Print page summaries for a Parquet file") +public class ShowPagesCommand extends BaseCommand { + + public ShowPagesCommand(Logger console) { + super(console); + } + + @Parameter(description = "") + List targets; + + @Parameter( + names = {"-c", "--column", "--columns"}, + description = "List of columns") + List columns; + + @Override + @SuppressWarnings("unchecked") + public int run() throws IOException { + Preconditions.checkArgument(targets != null && targets.size() >= 1, + "A Parquet file is required."); + Preconditions.checkArgument(targets.size() == 1, + "Cannot process multiple Parquet files."); + + String source = targets.get(0); + ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source)); + + MessageType schema = reader.getFileMetaData().getSchema(); + Map columns = Maps.newLinkedHashMap(); + if (this.columns == null || this.columns.isEmpty()) { + for (ColumnDescriptor descriptor : schema.getColumns()) { + columns.put(descriptor, primitive(schema, descriptor.getPath())); + } + } else { + for (String column : this.columns) { + columns.put(descriptor(column, schema), primitive(column, schema)); + } + } + + CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec(); + // accumulate formatted lines to print by column + Map> formatted = Maps.newLinkedHashMap(); + PageFormatter formatter = new PageFormatter(); + PageReadStore pageStore; + int rowGroupNum = 0; + while ((pageStore = reader.readNextRowGroup()) != null) { + for (ColumnDescriptor descriptor : columns.keySet()) { + List lines = formatted.get(columnName(descriptor)); + if (lines == null) { + lines = Lists.newArrayList(); + formatted.put(columnName(descriptor), lines); + } + + formatter.setContext(rowGroupNum, columns.get(descriptor), codec); + PageReader pages = pageStore.getPageReader(descriptor); + + DictionaryPage dict = pages.readDictionaryPage(); + if (dict != null) { + lines.add(formatter.format(dict)); + } + DataPage page; + while ((page = pages.readPage()) != null) { + 
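+          // A column contributes at most one dictionary page per row group
+          // (formatted above), then its data pages; PageFormatter labels them
+          // "0-D", "0-1", "0-2", ... within row group 0 (see printDictionaryPage).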
lines.add(formatter.format(page)); + } + } + rowGroupNum += 1; + } + + // TODO: Show total column size and overall size per value in the column summary line + for (String columnName : formatted.keySet()) { + console.info(String.format("\nColumn: %s\n%s", columnName, StringUtils.leftPad("", 80, '-'))); + console.info(formatter.getHeader()); + for (String line : formatted.get(columnName)) { + console.info(line); + } + console.info(""); + } + + return 0; + } + + @Override + public List getExamples() { + return Lists.newArrayList( + "# Show pages for column 'col' from a Parquet file", + "-c col sample.parquet" + ); + } + + private class PageFormatter implements DataPage.Visitor { + private int rowGroupNum; + private int pageNum; + private PrimitiveType type; + private String shortCodec; + + String getHeader() { + return String.format(" %-6s %-5s %-4s %-7s %-10s %-10s %-8s %-7s %s", + "page", "type", "enc", "count", "avg size", "size", "rows", "nulls", "min / max"); + } + + void setContext(int rowGroupNum, PrimitiveType type, CompressionCodecName codec) { + this.rowGroupNum = rowGroupNum; + this.pageNum = 0; + this.type = type; + this.shortCodec = shortCodec(codec); + } + + String format(Page page) { + String formatted = ""; + if (page instanceof DictionaryPage) { + formatted = printDictionaryPage((DictionaryPage) page); + } else if (page instanceof DataPage) { + formatted = ((DataPage) page).accept(this); + } + pageNum += 1; + return formatted; + } + + private String printDictionaryPage(DictionaryPage dict) { + // TODO: the compressed size of a dictionary page is lost in Parquet + dict.getUncompressedSize(); + long totalSize = dict.getCompressedSize(); + int count = dict.getDictionarySize(); + float perValue = ((float) totalSize) / count; + String enc = encodingAsString(dict.getEncoding(), true); + if (pageNum == 0) { + return String.format("%3d-D %-5s %s %-2s %-7d %-10s %-10s", + rowGroupNum, "dict", shortCodec, enc, count, humanReadable(perValue), + humanReadable(totalSize)); + } else { + return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s", + rowGroupNum, pageNum, "dict", shortCodec, enc, count, humanReadable(perValue), + humanReadable(totalSize)); + } + } + + @Override + public String visit(DataPageV1 page) { + String enc = encodingAsString(page.getValueEncoding(), false); + long totalSize = page.getCompressedSize(); + int count = page.getValueCount(); + long numNulls = page.getStatistics().getNumNulls(); + float perValue = ((float) totalSize) / count; + String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType()); + return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8s %-7s %s", + rowGroupNum, pageNum, "data", shortCodec, enc, count, humanReadable(perValue), + humanReadable(totalSize), "", numNulls, minMax); + } + + @Override + public String visit(DataPageV2 page) { + String enc = encodingAsString(page.getDataEncoding(), false); + long totalSize = page.getCompressedSize(); + int count = page.getValueCount(); + int numRows = page.getRowCount(); + int numNulls = page.getNullCount(); + float perValue = ((float) totalSize) / count; + String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType()); + String compression = (page.isCompressed() ? 
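+      // DataPageV2 allows individual pages to stay uncompressed even when the
+      // column chunk declares a codec, so such pages are marked "_" below.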
shortCodec : "_"); + return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8d %-7s %s", + rowGroupNum, pageNum, "data", compression, enc, count, humanReadable(perValue), + humanReadable(totalSize), numRows, numNulls, minMax); + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ToAvroCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ToAvroCommand.java new file mode 100644 index 0000000000..ceb11cf70c --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ToAvroCommand.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.commands; + +import com.beust.jcommander.Parameter; +import com.beust.jcommander.Parameters; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.io.Closeables; +import org.apache.avro.Schema; +import org.apache.avro.file.CodecFactory; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.io.DatumWriter; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.cli.BaseCommand; +import org.apache.parquet.cli.util.Codecs; +import org.apache.parquet.cli.util.Schemas; +import org.slf4j.Logger; +import java.io.Closeable; +import java.io.IOException; +import java.util.List; + +import static org.apache.avro.generic.GenericData.Record; +import static org.apache.parquet.cli.util.Expressions.filterSchema; + +@Parameters(commandDescription="Create an Avro file from a data file") +public class ToAvroCommand extends BaseCommand { + + public ToAvroCommand(Logger console) { + super(console); + } + + @Parameter(description = "") + List targets; + + @Parameter( + names={"-o", "--output"}, + description="Output file path", + required=true) + String outputPath = null; + + @Parameter(names = {"-s", "--schema"}, + description = "The file containing an Avro schema for the output file") + String avroSchemaFile; + + @Parameter( + names = {"-c", "--column", "--columns"}, + description = "List of columns") + List columns; + + @Parameter(names = {"--compression-codec"}, + description = "A compression codec name.") + String compressionCodecName = "GZIP"; + + @Parameter( + names={"--overwrite"}, + description="Overwrite the output file if it exists") + boolean overwrite = false; + + @Override + @SuppressWarnings("unchecked") + public int run() throws IOException { + Preconditions.checkArgument(targets != null && targets.size() == 1, + "A data file is required."); + + String source = targets.get(0); + + CodecFactory codecFactory = Codecs.avroCodec(compressionCodecName); + + Schema schema; + if (avroSchemaFile != null) { + schema = 
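+        // An explicit --schema file takes precedence; otherwise the schema is
+        // read from the source itself via getAvroSchema() (e.g. the schema
+        // embedded in an Avro data file).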
Schemas.fromAvsc(open(avroSchemaFile)); + } else { + schema = getAvroSchema(source); + } + Schema projection = filterSchema(schema, columns); + + Path outPath = qualifiedPath(outputPath); + FileSystem outFS = outPath.getFileSystem(getConf()); + if (overwrite && outFS.exists(outPath)) { + console.debug("Deleting output file {} (already exists)", outPath); + outFS.delete(outPath); + } + + Iterable reader = openDataFile(source, projection); + boolean threw = true; + long count = 0; + try { + DatumWriter datumWriter = new GenericDatumWriter<>(schema); + DataFileWriter w = new DataFileWriter<>(datumWriter); + w.setCodec(codecFactory); + + try (DataFileWriter writer = w.create(projection, create(outputPath))) { + for (Record record : reader) { + writer.append(record); + count += 1; + } + } + threw = false; + } catch (RuntimeException e) { + throw new RuntimeException("Failed on record " + count, e); + } finally { + if (reader instanceof Closeable) { + Closeables.close((Closeable) reader, threw); + } + } + + return 0; + } + + @Override + public List getExamples() { + return Lists.newArrayList( + "# Create an Avro file from a Parquet file", + "sample.parquet sample.avro", + "# Create an Avro file in HDFS from a local JSON file", + "path/to/sample.json hdfs:/user/me/sample.parquet", + "# Create an Avro file from data in S3", + "s3:/data/path/sample.parquet sample.avro" + ); + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/csv/AvroCSV.java b/parquet-cli/src/main/java/org/apache/parquet/cli/csv/AvroCSV.java new file mode 100644 index 0000000000..47cd665ff0 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/csv/AvroCSV.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.cli.csv; + +import au.com.bytecode.opencsv.CSVParser; +import au.com.bytecode.opencsv.CSVReader; +import com.google.common.base.CharMatcher; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.Set; +import java.util.regex.Pattern; + +import static java.lang.Math.min; + +public class AvroCSV { + + private static final Pattern LONG = Pattern.compile("\\d+"); + private static final Pattern DOUBLE = Pattern.compile("\\d*\\.\\d*[dD]?"); + private static final Pattern FLOAT = Pattern.compile("\\d*\\.\\d*[fF]?"); + private static final int DEFAULT_INFER_LINES = 25; + private static final Set NO_REQUIRED_FIELDS = ImmutableSet.of(); + //As per the Avro specs mentioned here -http://avro.apache.org/docs/1.7.5/spec.html + // It should start with [A-Za-z_] and subsequently contain only [A-Za-z0-9_] + private static final Pattern AVRO_COMPATIBLE = Pattern. + compile("^[A-Za-z_][A-Za-z\\d_]*$"); + + static CSVReader newReader(InputStream incoming, CSVProperties props) { + return new CSVReader( + new InputStreamReader(incoming, Charset.forName(props.charset)), + props.delimiter.charAt(0), props.quote.charAt(0), + props.escape.charAt(0), props.linesToSkip, + false /* strict quotes off: don't ignore unquoted strings */, + true /* ignore leading white-space */ ); + } + + static CSVParser newParser(CSVProperties props) { + return new CSVParser( + props.delimiter.charAt(0), props.quote.charAt(0), + props.escape.charAt(0), + false /* strict quotes off: don't ignore unquoted strings */, + true /* ignore leading white-space */ ); + } + + public static Schema inferNullableSchema(String name, InputStream incoming, + CSVProperties props) + throws IOException { + return inferSchemaInternal(name, incoming, props, NO_REQUIRED_FIELDS, true); + } + + public static Schema inferNullableSchema(String name, InputStream incoming, + CSVProperties props, + Set requiredFields) + throws IOException { + return inferSchemaInternal(name, incoming, props, requiredFields, true); + } + + public static Schema inferSchema(String name, InputStream incoming, + CSVProperties props) + throws IOException { + return inferSchemaInternal(name, incoming, props, NO_REQUIRED_FIELDS, false); + } + + public static Schema inferSchema(String name, InputStream incoming, + CSVProperties props, + Set requiredFields) + throws IOException { + return inferSchemaInternal(name, incoming, props, requiredFields, false); + } + + private static Schema inferSchemaInternal(String name, InputStream incoming, + CSVProperties props, + Set requiredFields, + boolean makeNullable) + throws IOException { + CSVReader reader = newReader(incoming, props); + + String[] header; + String[] line; + if (props.useHeader) { + // read the header and then the first line + header = reader.readNext(); + line = reader.readNext(); + Preconditions.checkNotNull(line, "No content to infer schema"); + + } else if (props.header != null) { + header = newParser(props).parseLine(props.header); + line = reader.readNext(); + Preconditions.checkNotNull(line, "No content to infer schema"); + + } else { + // use the first line to create a header + line = reader.readNext(); + Preconditions.checkNotNull(line, "No content to infer schema"); + header = new String[line.length]; 
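+      // Without a header line or a --header option, names are synthesized by
+      // position: a 3-column file gets field_0, field_1, field_2 (loop below).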
+ for (int i = 0; i < line.length; i += 1) { + header[i] = "field_" + String.valueOf(i); + } + } + + Schema.Type[] types = new Schema.Type[header.length]; + String[] values = new String[header.length]; + boolean[] nullable = new boolean[header.length]; + boolean[] empty = new boolean[header.length]; + + for (int processed = 0; processed < DEFAULT_INFER_LINES; processed += 1) { + if (line == null) { + break; + } + + for (int i = 0; i < header.length; i += 1) { + if (i < line.length) { + if (types[i] == null) { + types[i] = inferFieldType(line[i]); + if (types[i] != null) { + // keep track of the value used + values[i] = line[i]; + } + } + + if (line[i] == null) { + nullable[i] = true; + } else if (line[i].isEmpty()) { + empty[i] = true; + } + } else { + // no value results in null + nullable[i] = true; + } + } + + line = reader.readNext(); + } + + SchemaBuilder.FieldAssembler fieldAssembler = SchemaBuilder.record(name).fields(); + + // types may be missing, but fieldSchema will return a nullable string + for (int i = 0; i < header.length; i += 1) { + if (header[i] == null) { + throw new RuntimeException("Bad header for field " + i + ": null"); + } + + String fieldName = header[i].trim(); + + if (fieldName.isEmpty()) { + throw new RuntimeException( + "Bad header for field " + i + ": \"" + fieldName + "\""); + } else if(!isAvroCompatibleName(fieldName)) { + throw new RuntimeException( + "Bad header for field, should start with a character " + + "or _ and can contain only alphanumerics and _ " + + i + ": \"" + fieldName + "\""); + } + + // the empty string is not considered null for string fields + boolean foundNull = (nullable[i] || + (empty[i] && types[i] != Schema.Type.STRING)); + + if (requiredFields.contains(fieldName)) { + if (foundNull) { + throw new RuntimeException("Found null value for required field: " + + fieldName + " (" + types[i] + ")"); + } + fieldAssembler = fieldAssembler.name(fieldName) + .doc("Type inferred from '" + sample(values[i]) + "'") + .type(schema(types[i], false)).noDefault(); + } else { + SchemaBuilder.GenericDefault defaultBuilder = fieldAssembler.name(fieldName) + .doc("Type inferred from '" + sample(values[i]) + "'") + .type(schema(types[i], makeNullable || foundNull)); + if (makeNullable || foundNull) { + fieldAssembler = defaultBuilder.withDefault(null); + } else { + fieldAssembler = defaultBuilder.noDefault(); + } + } + } + return fieldAssembler.endRecord(); + } + + private static final CharMatcher NON_PRINTABLE = CharMatcher + .inRange('\u0020', '\u007e').negate(); + + private static String sample(String value) { + if (value != null) { + return NON_PRINTABLE.replaceFrom( + value.subSequence(0, min(50, value.length())), '.'); + } else { + return "null"; + } + } + + /** + * Create a {@link Schema} for the given type. If the type is null, + * the schema will be a nullable String. If isNullable is true, the returned + * schema will be nullable. + * + * @param type a {@link Schema.Type} compatible with {@code Schema.create} + * @param makeNullable If {@code true}, the return type will be nullable + * @return a {@code Schema} for the given {@code Schema.Type} + * @see Schema#create(org.apache.avro.Schema.Type) + */ + private static Schema schema(Schema.Type type, boolean makeNullable) { + Schema schema = Schema.create(type == null ? 
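+    // A column whose sampled values were all empty never receives a type, so it
+    // falls back to STRING here and the NULL union below makes it nullable,
+    // i.e. ["null", "string"] in Avro JSON.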
Schema.Type.STRING : type);
+    if (makeNullable || type == null) {
+      schema = Schema.createUnion(Lists.newArrayList(
+          Schema.create(Schema.Type.NULL), schema));
+    }
+    return schema;
+  }
+
+  private static Schema.Type inferFieldType(String example) {
+    if (example == null || example.isEmpty()) {
+      return null; // not enough information
+    } else if (LONG.matcher(example).matches()) {
+      return Schema.Type.LONG;
+    } else if (DOUBLE.matcher(example).matches()) {
+      return Schema.Type.DOUBLE;
+    } else if (FLOAT.matcher(example).matches()) {
+      return Schema.Type.FLOAT;
+    }
+    return Schema.Type.STRING;
+  }
+
+  /**
+   * Returns true if the name does not contain characters that are known to be
+   * incompatible with the Avro specification.
+   *
+   * @param name a String field name to check
+   * @return true if the name is Avro-compatible, false if not
+   */
+  private static boolean isAvroCompatibleName(String name) {
+    return AVRO_COMPATIBLE.matcher(name).matches();
+  }
+}
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/csv/AvroCSVReader.java b/parquet-cli/src/main/java/org/apache/parquet/cli/csv/AvroCSVReader.java
new file mode 100644
index 0000000000..8d5e835cb2
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/csv/AvroCSVReader.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.cli.csv;
+
+import au.com.bytecode.opencsv.CSVReader;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.parquet.cli.util.RuntimeIOException;
+import org.apache.avro.Schema;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+public class AvroCSVReader<E> implements Iterator<E>, Iterable<E>, Closeable {
+
+  private final boolean reuseRecords;
+  private final CSVReader reader;
+  private final RecordBuilder<E> builder;
+  private boolean hasNext = false;
+  private String[] next = null;
+  private E record = null;
+
+  public AvroCSVReader(InputStream stream, CSVProperties props,
+                       Schema schema, Class<E> type, boolean reuseRecords) {
+    this.reader = AvroCSV.newReader(stream, props);
+    this.reuseRecords = reuseRecords;
+
+    Preconditions.checkArgument(Schema.Type.RECORD.equals(schema.getType()),
+        "Schemas for CSV files must be records of primitive types");
+
+    List<String> header = null;
+    if (props.useHeader) {
+      this.hasNext = advance();
+      header = Lists.newArrayList(next);
+    } else if (props.header != null) {
+      try {
+        header = Lists.newArrayList(
+            AvroCSV.newParser(props).parseLine(props.header));
+      } catch (IOException e) {
+        throw new RuntimeIOException(
+            "Failed to parse header from properties: " + props.header, e);
+      }
+    }
+
+    this.builder = new RecordBuilder<>(schema, type, header);
+
+    // initialize by reading the first record
+    this.hasNext = advance();
+  }
+
+  @Override
+  public boolean hasNext() {
+    return hasNext;
+  }
+
+  @Override
+  public E next() {
+    if (!hasNext) {
+      throw new NoSuchElementException();
+    }
+
+    try {
+      if (reuseRecords) {
+        this.record = builder.makeRecord(next, record);
+        return record;
+      } else {
+        return builder.makeRecord(next, null);
+      }
+    } finally {
+      this.hasNext = advance();
+    }
+  }
+
+  private boolean advance() {
+    try {
+      next = reader.readNext();
+    } catch (IOException ex) {
+      throw new RuntimeIOException("Could not read record", ex);
+    }
+    return (next != null);
+  }
+
+  @Override
+  public void close() {
+    try {
+      reader.close();
+    } catch (IOException e) {
+      throw new RuntimeIOException("Cannot close reader", e);
+    }
+  }
+
+  @Override
+  public void remove() {
+    throw new UnsupportedOperationException("Remove is not implemented.");
+  }
+
+  @Override
+  public Iterator<E> iterator() {
+    return this;
+  }
+}
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/csv/CSVProperties.java b/parquet-cli/src/main/java/org/apache/parquet/cli/csv/CSVProperties.java
new file mode 100644
index 0000000000..bd4ba064d4
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/csv/CSVProperties.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.csv; + +import javax.annotation.concurrent.Immutable; +import org.apache.commons.lang.StringEscapeUtils; + +@Immutable +public class CSVProperties { + + public static final String DEFAULT_CHARSET = "utf8"; + public static final String DEFAULT_DELIMITER = ","; + public static final String DEFAULT_QUOTE = "\""; + public static final String DEFAULT_ESCAPE = "\\"; + public static final String DEFAULT_HAS_HEADER = "false"; + public static final int DEFAULT_LINES_TO_SKIP = 0; + + // configuration + public final String charset; + public final String delimiter; + public final String quote; + public final String escape; + public final String header; + public final boolean useHeader; + public final int linesToSkip; + + private CSVProperties(String charset, String delimiter, String quote, + String escape, String header, boolean useHeader, + int linesToSkip) { + this.charset = charset; + this.delimiter = delimiter; + this.quote = quote; + this.escape = escape; + this.header = header; + this.useHeader = useHeader; + this.linesToSkip = linesToSkip; + } + + public static class Builder { + private String charset = DEFAULT_CHARSET; + private String delimiter = DEFAULT_DELIMITER; + private String quote = DEFAULT_QUOTE; + private String escape = DEFAULT_ESCAPE; + private boolean useHeader = Boolean.valueOf(DEFAULT_HAS_HEADER); + private int linesToSkip = DEFAULT_LINES_TO_SKIP; + private String header = null; + + public Builder charset(String charset) { + this.charset = charset; + return this; + } + + public Builder delimiter(String delimiter) { + this.delimiter = StringEscapeUtils.unescapeJava(delimiter); + return this; + } + + public Builder quote(String quote) { + this.quote = StringEscapeUtils.unescapeJava(quote); + return this; + } + + public Builder escape(String escape) { + this.escape = StringEscapeUtils.unescapeJava(escape); + return this; + } + + public Builder header(String header) { + this.header = header; + return this; + } + + public Builder hasHeader() { + this.useHeader = true; + return this; + } + + public Builder hasHeader(boolean hasHeader) { + this.useHeader = hasHeader; + return this; + } + + public Builder linesToSkip(int linesToSkip) { + this.linesToSkip = linesToSkip; + return this; + } + + public CSVProperties build() { + return new CSVProperties( + charset, delimiter, quote, escape, + header, useHeader, linesToSkip); + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/csv/RecordBuilder.java b/parquet-cli/src/main/java/org/apache/parquet/cli/csv/RecordBuilder.java new file mode 100644 index 0000000000..9adf22e00b --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/csv/RecordBuilder.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.csv; + +import org.apache.parquet.cli.util.RecordException; +import org.apache.parquet.cli.util.Schemas; +import org.apache.avro.AvroRuntimeException; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.reflect.ReflectData; +import java.util.List; + +class RecordBuilder { + private final Schema schema; + private final Class recordClass; + private final Schema.Field[] fields; + private final int[] indexes; // Record position to CSV field position + + public RecordBuilder(Schema schema, Class recordClass, List header) { + this.schema = schema; + this.recordClass = recordClass; + + // initialize the index and field arrays + fields = schema.getFields().toArray(new Schema.Field[schema.getFields().size()]); + indexes = new int[fields.length]; + + if (header != null) { + for (int i = 0; i < fields.length; i += 1) { + fields[i] = schema.getFields().get(i); + indexes[i] = Integer.MAX_VALUE; // never present in the row + } + + // there's a header in next + for (int i = 0; i < header.size(); i += 1) { + Schema.Field field = schema.getField(header.get(i)); + if (field != null) { + indexes[field.pos()] = i; + } + } + + } else { + // without a header, map to fields by position + for (int i = 0; i < fields.length; i += 1) { + fields[i] = schema.getFields().get(i); + indexes[i] = i; + } + } + } + + public E makeRecord(String[] fields, E reuse) { + E record = reuse; + if (record == null) { + record = newRecordInstance(); + } + + if (record instanceof IndexedRecord) { + fillIndexed((IndexedRecord) record, fields); + } else { + fillReflect(record, fields); + } + + return record; + } + + @SuppressWarnings("unchecked") + private E newRecordInstance() { + if (recordClass != GenericData.Record.class && !recordClass.isInterface()) { + E record = (E) ReflectData.newInstance(recordClass, schema); + if (record != null) { + return record; + } + } + return (E) new GenericData.Record(schema); + } + + private void fillIndexed(IndexedRecord record, String[] data) { + for (int i = 0; i < indexes.length; i += 1) { + int index = indexes[i]; + record.put(i, + makeValue(index < data.length ? data[index] : null, fields[i])); + } + } + + private void fillReflect(Object record, String[] data) { + for (int i = 0; i < indexes.length; i += 1) { + Schema.Field field = fields[i]; + int index = indexes[i]; + Object value = makeValue(index < data.length ? 
data[index] : null, field); + ReflectData.get().setField(record, field.name(), i, value); + } + } + + private static Object makeValue(String string, Schema.Field field) { + try { + Object value = makeValue(string, field.schema()); + if (value != null || Schemas.nullOk(field.schema())) { + return value; + } else { + // this will fail if there is no default value + return ReflectData.get().getDefaultValue(field); + } + } catch (RecordException e) { + // add the field name to the error message + throw new RecordException(String.format( + "Cannot convert field %s", field.name()), e); + } catch (NumberFormatException e) { + throw new RecordException(String.format( + "Field %s: value not a %s: '%s'", + field.name(), field.schema(), string), e); + } catch (AvroRuntimeException e) { + throw new RecordException(String.format( + "Field %s: cannot make %s value: '%s'", + field.name(), field.schema(), string), e); + } + } + + /** + * Returns a the value as the first matching schema type or null. + * + * Note that if the value may be null even if the schema does not allow the + * value to be null. + * + * @param string a String representation of the value + * @param schema a Schema + * @return the string coerced to the correct type from the schema or null + */ + private static Object makeValue(String string, Schema schema) { + if (string == null) { + return null; + } + + try { + switch (schema.getType()) { + case BOOLEAN: + return Boolean.valueOf(string); + case STRING: + return string; + case FLOAT: + return Float.valueOf(string); + case DOUBLE: + return Double.valueOf(string); + case INT: + return Integer.valueOf(string); + case LONG: + return Long.valueOf(string); + case ENUM: + // TODO: translate to enum class + if (schema.hasEnumSymbol(string)) { + return string; + } else { + try { + return schema.getEnumSymbols().get(Integer.parseInt(string)); + } catch (IndexOutOfBoundsException ex) { + return null; + } + } + case UNION: + Object value = null; + for (Schema possible : schema.getTypes()) { + value = makeValue(string, possible); + if (value != null) { + return value; + } + } + return null; + case NULL: + return null; + default: + // FIXED, BYTES, MAP, ARRAY, RECORD are not supported + throw new RecordException( + "Unsupported field type:" + schema.getType()); + } + } catch (NumberFormatException e) { + // empty string is considered null for numeric types + if (string.isEmpty()) { + return null; + } else { + throw e; + } + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/json/AvroJson.java b/parquet-cli/src/main/java/org/apache/parquet/cli/json/AvroJson.java new file mode 100644 index 0000000000..f17ee83486 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/json/AvroJson.java @@ -0,0 +1,636 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.json; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.BinaryNode; +import com.fasterxml.jackson.databind.node.BooleanNode; +import com.fasterxml.jackson.databind.node.MissingNode; +import com.fasterxml.jackson.databind.node.NullNode; +import com.fasterxml.jackson.databind.node.NumericNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.fasterxml.jackson.databind.node.TextNode; +import com.google.common.base.Function; +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import org.apache.parquet.cli.util.RecordException; +import org.apache.parquet.cli.util.RuntimeIOException; +import org.apache.parquet.cli.util.Schemas; +import org.apache.avro.AvroRuntimeException; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +public class AvroJson { + + private static final JsonFactory FACTORY = new JsonFactory(); + + public static Iterator parser(final InputStream stream) { + try { + JsonParser parser = FACTORY.createParser(stream); + parser.setCodec(new ObjectMapper()); + return parser.readValuesAs(JsonNode.class); + } catch (IOException e) { + throw new RuntimeIOException("Cannot read from stream", e); + } + } + + public static JsonNode parse(String json) { + return parse(json, JsonNode.class); + } + + public static T parse(String json, Class returnType) { + ObjectMapper mapper = new ObjectMapper(); + try { + return mapper.readValue(json, returnType); + } catch (JsonParseException e) { + throw new IllegalArgumentException("Invalid JSON", e); + } catch (JsonMappingException e) { + throw new IllegalArgumentException("Invalid JSON", e); + } catch (IOException e) { + throw new RuntimeIOException("Cannot initialize JSON parser", e); + } + } + + public static JsonNode parse(InputStream json) { + return parse(json, JsonNode.class); + } + + public static T parse(InputStream json, Class returnType) { + ObjectMapper mapper = new ObjectMapper(); + try { + return mapper.readValue(json, returnType); + } catch (JsonParseException e) { + throw new IllegalArgumentException("Invalid JSON stream", e); + } catch (JsonMappingException e) { + throw new IllegalArgumentException("Invalid JSON stream", e); + } catch (IOException e) { + throw new RuntimeIOException("Cannot initialize JSON parser", e); + } + } + + public static Object convertToAvro(GenericData model, JsonNode datum, + Schema schema) { + if (datum == null) { + return null; + } + switch (schema.getType()) { + case RECORD: + RecordException.check(datum.isObject(), + "Cannot convert 
non-object to record: %s", datum); + Object record = model.newRecord(null, schema); + for (Schema.Field field : schema.getFields()) { + model.setField(record, field.name(), field.pos(), + convertField(model, datum.get(field.name()), field)); + } + return record; + + case MAP: + RecordException.check(datum.isObject(), + "Cannot convert non-object to map: %s", datum); + Map map = Maps.newLinkedHashMap(); + Iterator> iter = datum.fields(); + while (iter.hasNext()) { + Map.Entry entry = iter.next(); + map.put(entry.getKey(), convertToAvro( + model, entry.getValue(), schema.getValueType())); + } + return map; + + case ARRAY: + RecordException.check(datum.isArray(), + "Cannot convert to array: %s", datum); + List list = Lists.newArrayListWithExpectedSize(datum.size()); + for (JsonNode element : datum) { + list.add(convertToAvro(model, element, schema.getElementType())); + } + return list; + + case UNION: + return convertToAvro(model, datum, + resolveUnion(datum, schema.getTypes())); + + case BOOLEAN: + RecordException.check(datum.isBoolean(), + "Cannot convert to boolean: %s", datum); + return datum.booleanValue(); + + case FLOAT: + RecordException.check(datum.isFloat() || datum.isInt(), + "Cannot convert to float: %s", datum); + return datum.floatValue(); + + case DOUBLE: + RecordException.check( + datum.isDouble() || datum.isFloat() || + datum.isLong() || datum.isInt(), + "Cannot convert to double: %s", datum); + return datum.doubleValue(); + + case INT: + RecordException.check(datum.isInt(), + "Cannot convert to int: %s", datum); + return datum.intValue(); + + case LONG: + RecordException.check(datum.isLong() || datum.isInt(), + "Cannot convert to long: %s", datum); + return datum.longValue(); + + case STRING: + RecordException.check(datum.isTextual(), + "Cannot convert to string: %s", datum); + return datum.textValue(); + + case ENUM: + RecordException.check(datum.isTextual(), + "Cannot convert to string: %s", datum); + return model.createEnum(datum.textValue(), schema); + + case BYTES: + RecordException.check(datum.isBinary(), + "Cannot convert to binary: %s", datum); + try { + return ByteBuffer.wrap(datum.binaryValue()); + } catch (IOException e) { + throw new RecordException("Failed to read JSON binary", e); + } + + case FIXED: + RecordException.check(datum.isBinary(), + "Cannot convert to fixed: %s", datum); + byte[] bytes; + try { + bytes = datum.binaryValue(); + } catch (IOException e) { + throw new RecordException("Failed to read JSON binary", e); + } + RecordException.check(bytes.length < schema.getFixedSize(), + "Binary data is too short: %s bytes for %s", bytes.length, schema); + return model.createFixed(null, bytes, schema); + + case NULL: + return null; + + default: + // don't use DatasetRecordException because this is a Schema problem + throw new IllegalArgumentException("Unknown schema type: " + schema); + } + } + + private static Object convertField(GenericData model, JsonNode datum, + Schema.Field field) { + try { + Object value = convertToAvro(model, datum, field.schema()); + if (value != null || Schemas.nullOk(field.schema())) { + return value; + } else { + return model.getDefaultValue(field); + } + } catch (RecordException e) { + // add the field name to the error message + throw new RecordException(String.format( + "Cannot convert field %s", field.name()), e); + } catch (AvroRuntimeException e) { + throw new RecordException(String.format( + "Field %s: cannot make %s value: '%s'", + field.name(), field.schema(), String.valueOf(datum)), e); + } + } + + private static Schema 
resolveUnion(JsonNode datum, Collection schemas) { + Set primitives = Sets.newHashSet(); + List others = Lists.newArrayList(); + for (Schema schema : schemas) { + if (PRIMITIVES.containsKey(schema.getType())) { + primitives.add(schema.getType()); + } else { + others.add(schema); + } + } + + // Try to identify specific primitive types + Schema primitiveSchema = null; + if (datum == null || datum.isNull()) { + primitiveSchema = closestPrimitive(primitives, Schema.Type.NULL); + } else if (datum.isShort() || datum.isInt()) { + primitiveSchema = closestPrimitive(primitives, + Schema.Type.INT, Schema.Type.LONG, + Schema.Type.FLOAT, Schema.Type.DOUBLE); + } else if (datum.isLong()) { + primitiveSchema = closestPrimitive(primitives, + Schema.Type.LONG, Schema.Type.DOUBLE); + } else if (datum.isFloat()) { + primitiveSchema = closestPrimitive(primitives, + Schema.Type.FLOAT, Schema.Type.DOUBLE); + } else if (datum.isDouble()) { + primitiveSchema = closestPrimitive(primitives, Schema.Type.DOUBLE); + } else if (datum.isBoolean()) { + primitiveSchema = closestPrimitive(primitives, Schema.Type.BOOLEAN); + } + + if (primitiveSchema != null) { + return primitiveSchema; + } + + // otherwise, select the first schema that matches the datum + for (Schema schema : others) { + if (matches(datum, schema)) { + return schema; + } + } + + throw new RecordException(String.format( + "Cannot resolve union: %s not in %s", datum, schemas)); + } + + // this does not contain string, bytes, or fixed because the datum type + // doesn't necessarily determine the schema. + private static ImmutableMap PRIMITIVES = ImmutableMap + .builder() + .put(Schema.Type.NULL, Schema.create(Schema.Type.NULL)) + .put(Schema.Type.BOOLEAN, Schema.create(Schema.Type.BOOLEAN)) + .put(Schema.Type.INT, Schema.create(Schema.Type.INT)) + .put(Schema.Type.LONG, Schema.create(Schema.Type.LONG)) + .put(Schema.Type.FLOAT, Schema.create(Schema.Type.FLOAT)) + .put(Schema.Type.DOUBLE, Schema.create(Schema.Type.DOUBLE)) + .build(); + + private static Schema closestPrimitive(Set possible, Schema.Type... 
types) { + for (Schema.Type type : types) { + if (possible.contains(type) && PRIMITIVES.containsKey(type)) { + return PRIMITIVES.get(type); + } + } + return null; + } + + private static boolean matches(JsonNode datum, Schema schema) { + switch (schema.getType()) { + case RECORD: + if (datum.isObject()) { + // check that each field is present or has a default + boolean missingField = false; + for (Schema.Field field : schema.getFields()) { + if (!datum.has(field.name()) && field.defaultValue() == null) { + missingField = true; + break; + } + } + if (!missingField) { + return true; + } + } + break; + case UNION: + if (resolveUnion(datum, schema.getTypes()) != null) { + return true; + } + break; + case MAP: + if (datum.isObject()) { + return true; + } + break; + case ARRAY: + if (datum.isArray()) { + return true; + } + break; + case BOOLEAN: + if (datum.isBoolean()) { + return true; + } + break; + case FLOAT: + if (datum.isFloat() || datum.isInt()) { + return true; + } + break; + case DOUBLE: + if (datum.isDouble() || datum.isFloat() || + datum.isLong() || datum.isInt()) { + return true; + } + break; + case INT: + if (datum.isInt()) { + return true; + } + break; + case LONG: + if (datum.isLong() || datum.isInt()) { + return true; + } + break; + case STRING: + if (datum.isTextual()) { + return true; + } + break; + case ENUM: + if (datum.isTextual() && schema.hasEnumSymbol(datum.textValue())) { + return true; + } + break; + case BYTES: + case FIXED: + if (datum.isBinary()) { + return true; + } + break; + case NULL: + if (datum == null || datum.isNull()) { + return true; + } + break; + default: // UNION or unknown + throw new IllegalArgumentException("Unsupported schema: " + schema); + } + return false; + } + + public static Schema inferSchema(InputStream incoming, final String name, + int numRecords) { + Iterator schemas = Iterators.transform(parser(incoming), + new Function() { + @Override + public Schema apply(JsonNode node) { + return inferSchema(node, name); + } + }); + + if (!schemas.hasNext()) { + return null; + } + + Schema result = schemas.next(); + for (int i = 1; schemas.hasNext() && i < numRecords; i += 1) { + result = Schemas.merge(result, schemas.next()); + } + + return result; + } + + public static Schema inferSchema(JsonNode node, String name) { + return visit(node, new JsonSchemaVisitor(name)); + } + + public static Schema inferSchemaWithMaps(JsonNode node, String name) { + return visit(node, new JsonSchemaVisitor(name).useMaps()); + } + + private static class JsonSchemaVisitor extends JsonTreeVisitor { + + private static final Joiner DOT = Joiner.on('.'); + private final String name; + private boolean objectsToRecords = true; + + public JsonSchemaVisitor(String name) { + this.name = name; + } + + public JsonSchemaVisitor useMaps() { + this.objectsToRecords = false; + return this; + } + + @Override + public Schema object(ObjectNode object, Map fields) { + if (objectsToRecords || recordLevels.size() < 1) { + List recordFields = Lists.newArrayListWithExpectedSize( + fields.size()); + + for (Map.Entry entry : fields.entrySet()) { + recordFields.add(new Schema.Field( + entry.getKey(), entry.getValue(), + "Type inferred from '" + object.get(entry.getKey()) + "'", + null)); + } + + Schema recordSchema; + if (recordLevels.size() < 1) { + recordSchema = Schema.createRecord(name, null, null, false); + } else { + recordSchema = Schema.createRecord( + DOT.join(recordLevels), null, null, false); + } + + recordSchema.setFields(recordFields); + + return recordSchema; + + } else { + // 
translate to a map; use LinkedHashSet to preserve schema order + switch (fields.size()) { + case 0: + return Schema.createMap(Schema.create(Schema.Type.NULL)); + case 1: + return Schema.createMap(Iterables.getOnlyElement(fields.values())); + default: + return Schema.createMap(Schemas.mergeOrUnion(fields.values())); + } + } + } + + @Override + public Schema array(ArrayNode ignored, List elementSchemas) { + // use LinkedHashSet to preserve schema order + switch (elementSchemas.size()) { + case 0: + return Schema.createArray(Schema.create(Schema.Type.NULL)); + case 1: + return Schema.createArray(Iterables.getOnlyElement(elementSchemas)); + default: + return Schema.createArray(Schemas.mergeOrUnion(elementSchemas)); + } + } + + @Override + public Schema binary(BinaryNode ignored) { + return Schema.create(Schema.Type.BYTES); + } + + @Override + public Schema text(TextNode ignored) { + return Schema.create(Schema.Type.STRING); + } + + @Override + public Schema number(NumericNode number) { + if (number.isInt()) { + return Schema.create(Schema.Type.INT); + } else if (number.isLong()) { + return Schema.create(Schema.Type.LONG); + } else if (number.isFloat()) { + return Schema.create(Schema.Type.FLOAT); + } else if (number.isDouble()) { + return Schema.create(Schema.Type.DOUBLE); + } else { + throw new UnsupportedOperationException( + number.getClass().getName() + " is not supported"); + } + } + + @Override + public Schema bool(BooleanNode ignored) { + return Schema.create(Schema.Type.BOOLEAN); + } + + @Override + public Schema nullNode(NullNode ignored) { + return Schema.create(Schema.Type.NULL); + } + + @Override + public Schema missing(MissingNode ignored) { + throw new UnsupportedOperationException("MissingNode is not supported."); + } + } + + private static T visit(JsonNode node, JsonTreeVisitor visitor) { + switch (node.getNodeType()) { + case OBJECT: + Preconditions.checkArgument(node instanceof ObjectNode, + "Expected instance of ObjectNode: " + node); + + // use LinkedHashMap to preserve field order + Map fields = Maps.newLinkedHashMap(); + + Iterator> iter = node.fields(); + while (iter.hasNext()) { + Map.Entry entry = iter.next(); + + visitor.recordLevels.push(entry.getKey()); + fields.put(entry.getKey(), visit(entry.getValue(), visitor)); + visitor.recordLevels.pop(); + } + + return visitor.object((ObjectNode) node, fields); + + case ARRAY: + Preconditions.checkArgument(node instanceof ArrayNode, + "Expected instance of ArrayNode: " + node); + + List elements = Lists.newArrayListWithExpectedSize(node.size()); + + for (JsonNode element : node) { + elements.add(visit(element, visitor)); + } + + return visitor.array((ArrayNode) node, elements); + + case BINARY: + Preconditions.checkArgument(node instanceof BinaryNode, + "Expected instance of BinaryNode: " + node); + return visitor.binary((BinaryNode) node); + + case STRING: + Preconditions.checkArgument(node instanceof TextNode, + "Expected instance of TextNode: " + node); + + return visitor.text((TextNode) node); + + case NUMBER: + Preconditions.checkArgument(node instanceof NumericNode, + "Expected instance of NumericNode: " + node); + + return visitor.number((NumericNode) node); + + case BOOLEAN: + Preconditions.checkArgument(node instanceof BooleanNode, + "Expected instance of BooleanNode: " + node); + + return visitor.bool((BooleanNode) node); + + case MISSING: + Preconditions.checkArgument(node instanceof MissingNode, + "Expected instance of MissingNode: " + node); + + return visitor.missing((MissingNode) node); + + case NULL: + 
Preconditions.checkArgument(node instanceof NullNode, + "Expected instance of NullNode: " + node); + + return visitor.nullNode((NullNode) node); + + default: + throw new IllegalArgumentException( + "Unknown node type: " + node.getNodeType() + ": " + node); + } + } + + private abstract static class JsonTreeVisitor { + protected LinkedList recordLevels = Lists.newLinkedList(); + + public T object(ObjectNode object, Map fields) { + return null; + } + + public T array(ArrayNode array, List elements) { + return null; + } + + public T binary(BinaryNode binary) { + return null; + } + + public T text(TextNode text) { + return null; + } + + public T number(NumericNode number) { + return null; + } + + public T bool(BooleanNode bool) { + return null; + } + + public T missing(MissingNode missing) { + return null; + } + + public T nullNode(NullNode nullNode) { + return null; + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/json/AvroJsonReader.java b/parquet-cli/src/main/java/org/apache/parquet/cli/json/AvroJsonReader.java new file mode 100644 index 0000000000..a3b067dae5 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/json/AvroJsonReader.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
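A usage sketch for the inference entry point above: `inferSchema(InputStream, String, int)` merges the per-record schemas across a sample, so a field that is sometimes null becomes a union with null (the record name and sample data here are hypothetical):

```
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import org.apache.avro.Schema;
import org.apache.parquet.cli.json.AvroJson;

public class InferJsonSchema {
  public static void main(String[] args) {
    String sample = "{\"id\": 1, \"name\": \"a\"}\n{\"id\": 2, \"name\": null}";
    // Inspect up to 20 records and merge the per-record schemas
    Schema schema = AvroJson.inferSchema(
        new ByteArrayInputStream(sample.getBytes(StandardCharsets.UTF_8)),
        "sample", 20);
    System.out.println(schema.toString(true));
  }
}
```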
+ */ + +package org.apache.parquet.cli.json; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.base.Function; +import com.google.common.collect.Iterators; +import org.apache.parquet.cli.util.RuntimeIOException; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import javax.annotation.Nullable; +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; + +public class AvroJsonReader implements Iterator, Iterable, Closeable { + + private final GenericData model; + private final Schema schema; + private final InputStream stream; + private Iterator iterator; + + public AvroJsonReader(InputStream stream, Schema schema) { + this.stream = stream; + this.schema = schema; + this.model = GenericData.get(); + this.iterator = Iterators.transform(AvroJson.parser(stream), + new Function() { + @Override + @SuppressWarnings("unchecked") + public E apply(@Nullable JsonNode node) { + return (E) AvroJson.convertToAvro( + model, node, AvroJsonReader.this.schema); + } + }); + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public E next() { + return iterator.next(); + } + + @Override + public void close() { + iterator = null; + try { + stream.close(); + } catch (IOException e) { + throw new RuntimeIOException("Cannot close reader", e); + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Remove is not implemented."); + } + + @Override + public Iterator iterator() { + return this; + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Codecs.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Codecs.java new file mode 100644 index 0000000000..06f12fd279 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Codecs.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
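Since `AvroJsonReader` implements `Iterable`, `Iterator`, and `Closeable`, it drops straight into a for-each loop and try-with-resources; a sketch, with the file name and schema as assumptions:

```
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.parquet.cli.json.AvroJsonReader;

public class ReadJson {
  public static void main(String[] args) throws IOException {
    Schema schema = new Schema.Parser().parse(
        "{\"type\": \"record\", \"name\": \"r\", \"fields\": " +
        "[{\"name\": \"id\", \"type\": \"long\"}]}");
    try (AvroJsonReader<GenericRecord> reader =
        new AvroJsonReader<>(new FileInputStream("records.json"), schema)) {
      for (GenericRecord record : reader) {
        System.out.println(record.get("id"));
      }
    }
  }
}
```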
+ */ + +package org.apache.parquet.cli.util; + +import org.apache.avro.file.CodecFactory; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +import java.util.Locale; + +public class Codecs { + public static CompressionCodecName parquetCodec(String codec) { + try { + return CompressionCodecName.valueOf(codec.toUpperCase(Locale.ENGLISH)); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Unknown compression codec: " + codec); + } + } + + public static CodecFactory avroCodec(String codec) { + CompressionCodecName parquetCodec = parquetCodec(codec); + switch (parquetCodec) { + case UNCOMPRESSED: + return CodecFactory.nullCodec(); + case SNAPPY: + return CodecFactory.snappyCodec(); + case GZIP: + return CodecFactory.deflateCodec(9); + default: + throw new IllegalArgumentException( + "Codec incompatible with Avro: " + codec); + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java new file mode 100644 index 0000000000..61f632aab8 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Expressions.java @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
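A sketch of the mapping above: a single codec name configures both output formats, with gzip translated to Avro's deflate at maximum level (9), and names outside the switch rejected:

```
import org.apache.avro.file.CodecFactory;
import org.apache.parquet.cli.util.Codecs;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecDemo {
  public static void main(String[] args) {
    CompressionCodecName forParquet = Codecs.parquetCodec("gzip"); // GZIP
    CodecFactory forAvro = Codecs.avroCodec("gzip");               // deflate(9)
    System.out.println(forParquet + " / " + forAvro);
    // Codecs.avroCodec("lzo") throws IllegalArgumentException: LZO is a valid
    // Parquet codec name but has no mapping in the Avro switch above.
  }
}
```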
+ */ + +package org.apache.parquet.cli.util; + +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.util.Utf8; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + + +public class Expressions { + private static final Pattern NUMERIC_RE = Pattern.compile("^\\d+$"); + + public static Object select(Schema schema, Object datum, String path) { + return select(schema, datum, Lists.newArrayList(parse(path))); + } + + @SuppressWarnings("unchecked") + private static Object select(Schema schema, Object datum, List tokens) { + if (tokens.isEmpty()) { + return datum; + } + + Preconditions.checkArgument(tokens.size() == 1, "Cannot return multiple values"); + PathExpr token = tokens.get(0); + + switch (schema.getType()) { + case RECORD: + if (!(datum instanceof GenericRecord) && "json".equals(schema.getName())) { + // skip the placeholder record schema + return select(schema.getField("value").schema(), datum, tokens); + } + Preconditions.checkArgument(token.type == PathExpr.Type.FIELD, + "Cannot dereference records"); + Preconditions.checkArgument(datum instanceof GenericRecord, + "Not a record: %s", datum); + GenericRecord record = (GenericRecord) datum; + Schema.Field field = schema.getField(token.value); + Preconditions.checkArgument(field != null, + "No such field '%s' in schema: %s", token.value, schema); + return select(field.schema(), record.get(token.value), token.children); + + case MAP: + Preconditions.checkArgument(datum instanceof Map, + "Not a map: %s", datum); + Map map = (Map) datum; + Object value = map.get(token.value); + if (value == null) { + // try with a Utf8 + value = map.get(new Utf8(token.value)); + } + return select(schema.getValueType(), value, token.children); + + case ARRAY: + Preconditions.checkArgument(token.type == PathExpr.Type.DEREF, + "Cannot access fields of an array"); + Preconditions.checkArgument(datum instanceof Collection, + "Not an array: %s", datum); + Preconditions.checkArgument(NUMERIC_RE.matcher(token.value).matches(), + "Not an array index: %s", token.value); + List list = (List) datum; + return select(schema.getElementType(), list.get(Integer.parseInt(token.value)), + token.children); + + case UNION: + int branch = GenericData.get().resolveUnion(schema, datum); + return select(schema.getTypes().get(branch), datum, tokens); + + default: + throw new IllegalArgumentException("Cannot access child of primitive value: " + datum); + } + } + + /** + * a.2.b[3]["key"] + * * optional (union with null) should be ignored + * * unions should match by position number or short name (e.g. 2, user) + * * fields should match by name + * * arrays are dereferenced by position [n] => schema is the element schema + * * maps are dereferenced by key => schema is the value schema + */ + public static Schema filterSchema(Schema schema, String... 
fieldPaths) { + return filterSchema(schema, Lists.newArrayList(fieldPaths)); + } + + public static Schema filterSchema(Schema schema, List fieldPaths) { + if (fieldPaths == null) { + return schema; + } + List paths = merge(Lists.newArrayList(fieldPaths)); + return filter(schema, paths); + } + + private static PathExpr parse(String path) { + PathExpr expr = null; + PathExpr last = null; + boolean inDeref = false; + boolean afterDeref = false; + int valueStart = 0; + for (int i = 0; i < path.length(); i += 1) { + switch (path.charAt(i)) { + case '.': + Preconditions.checkState(valueStart != i || afterDeref, "Empty reference: ''"); + if (!inDeref) { + if (valueStart != i) { + PathExpr current = PathExpr.field(path.substring(valueStart, i)); + if (last != null) { + last.children.add(current); + } else { + expr = current; + } + last = current; + } + valueStart = i + 1; + afterDeref = false; + } + break; + case '[': + Preconditions.checkState(!inDeref, "Cannot nest [ within []"); + Preconditions.checkState(valueStart != i || afterDeref, "Empty reference: ''"); + if (valueStart != i) { + PathExpr current = PathExpr.field(path.substring(valueStart, i)); + if (last != null) { + last.children.add(current); + } else { + expr = current; + } + last = current; + } + valueStart = i + 1; + inDeref = true; + afterDeref = false; + break; + case ']': + Preconditions.checkState(inDeref, "Cannot use ] without a starting ["); + Preconditions.checkState(valueStart != i, "Empty reference: ''"); + PathExpr current = PathExpr.deref(path.substring(valueStart, i)); + if (last != null) { + last.children.add(current); + } else { + expr = current; + } + last = current; + valueStart = i + 1; + inDeref = false; + afterDeref = true; + break; + default: + Preconditions.checkState(!afterDeref, "Fields after [] must start with ."); + } + } + Preconditions.checkState(!inDeref, "Fields after [ must end with ]"); + if (valueStart < path.length()) { + PathExpr current = PathExpr.field(path.substring(valueStart, path.length())); + if (last != null) { + last.children.add(current); + } else { + expr = current; + } + } + return expr; + } + + private static List merge(List fields) { + List paths = Lists.newArrayList(); + for (String field : fields) { + merge(paths, parse(field)); + } + return paths; + } + + private static List merge(List tokens, PathExpr toAdd) { + boolean merged = false; + for (PathExpr token : tokens) { + if ((token.type == toAdd.type) && + (token.type == PathExpr.Type.DEREF || token.value.equals(toAdd.value))) { + for (PathExpr child : toAdd.children) { + merge(token.children, child); + } + merged = true; + } + } + if (!merged) { + tokens.add(toAdd); + } + return tokens; + } + + private static Schema filter(Schema schema, List exprs) { + if (exprs.isEmpty()) { + return schema; + } + + switch (schema.getType()) { + case RECORD: + List fields = Lists.newArrayList(); + for (PathExpr expr : exprs) { + Schema.Field field = schema.getField(expr.value); + Preconditions.checkArgument(field != null, + "Cannot find field '%s' in schema: %s", expr.value, schema); + fields.add(new Schema.Field(expr.value, filter(field.schema(), expr.children), + field.doc(), field.defaultVal(), field.order())); + } + return Schema.createRecord(schema.getName(), + schema.getDoc(), schema.getNamespace(), schema.isError(), fields); + + case UNION: + // Ignore schemas that are a union with null because there is another token + if (schema.getTypes().size() == 2) { + if (schema.getTypes().get(0).getType() == Schema.Type.NULL) { + return 
filter(schema.getTypes().get(1), exprs); + } else if (schema.getTypes().get(1).getType() == Schema.Type.NULL) { + return filter(schema.getTypes().get(0), exprs); + } + } + + List schemas = Lists.newArrayList(); + for (PathExpr expr : exprs) { + schemas.add(filter(schema, expr)); + } + + if (schemas.size() > 1) { + return Schema.createUnion(schemas); + } else { + return schemas.get(0); + } + + case MAP: + Preconditions.checkArgument(exprs.size() == 1, + "Cannot find multiple children of map schema: %s", schema); + return filter(schema, exprs.get(0)); + + case ARRAY: + Preconditions.checkArgument(exprs.size() == 1, + "Cannot find multiple children of array schema: %s", schema); + return filter(schema, exprs.get(0)); + + default: + throw new IllegalArgumentException(String.format( + "Cannot find child of primitive schema: %s", schema)); + } + } + + private static Schema filter(Schema schema, PathExpr expr) { + if (expr == null) { + return schema; + } + + switch (schema.getType()) { + case RECORD: + Preconditions.checkArgument(expr.type == PathExpr.Type.FIELD, + "Cannot index a record: [%s]", expr.value); + Schema.Field field = schema.getField(expr.value); + if (field != null) { + return filter(field.schema(), expr.children); + } else { + throw new IllegalArgumentException(String.format( + "Cannot find field '%s' in schema: %s", expr.value, schema.toString(true))); + } + + case MAP: + return Schema.createMap(filter(schema.getValueType(), expr.children)); + + case ARRAY: + Preconditions.checkArgument(expr.type == PathExpr.Type.DEREF, + "Cannot find field '%s' in an array", expr.value); + Preconditions.checkArgument(NUMERIC_RE.matcher(expr.value).matches(), + "Cannot index array by non-numeric value '%s'", expr.value); + return Schema.createArray(filter(schema.getElementType(), expr.children)); + + case UNION: + // TODO: this should only return something if the type can match rather than explicitly + // accessing parts of a union. when selecting data, unions are ignored. 
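+ // Illustrative, with assumed schemas: a union branch is selected with the
+ // dereference syntax, by position or by branch name, e.g.
+ // filterSchema(schema, "col[0]") or filterSchema(schema, "col[user]") for a
+ // union containing a record named "user"; see the grammar documented on
+ // filterSchema above.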
+ Preconditions.checkArgument(expr.type == PathExpr.Type.DEREF, + "Cannot find field '%s' in a union", expr.value); + List options = schema.getTypes(); + if (NUMERIC_RE.matcher(expr.value).matches()) { + // look up the option by position + int i = Integer.parseInt(expr.value); + if (i < options.size()) { + return filter(options.get(i), expr.children); + } + } else { + // look up the option by name + for (Schema option : options) { + if (expr.value.equalsIgnoreCase(option.getName())) { + return filter(option, expr.children); + } + } + } + throw new IllegalArgumentException(String.format( + "Invalid union index '%s' for schema: %s", expr.value, schema)); + + default: + throw new IllegalArgumentException(String.format( + "Cannot find '%s' in primitive schema: %s", expr.value, schema)); + } + } + + private static class PathExpr { + enum Type { + DEREF, + FIELD + } + + static PathExpr deref(String value) { + return new PathExpr(Type.DEREF, value); + } + + static PathExpr deref(String value, PathExpr child) { + return new PathExpr(Type.DEREF, value, Lists.newArrayList(child)); + } + + static PathExpr field(String value) { + return new PathExpr(Type.FIELD, value); + } + + static PathExpr field(String value, PathExpr child) { + return new PathExpr(Type.FIELD, value, Lists.newArrayList(child)); + } + + private final Type type; + private final String value; + private final List children; + + PathExpr(Type type, String value) { + this.type = type; + this.value = value; + this.children = Lists.newArrayList(); + } + + PathExpr(Type type, String value, List children) { + this.type = type; + this.value = value; + this.children = children; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + PathExpr pathExpr = (PathExpr) o; + + if (type != pathExpr.type) return false; + if (value != null ? !value.equals(pathExpr.value) : pathExpr.value != null) return false; + return children != null ? children.equals(pathExpr.children) : pathExpr.children == null; + } + + @Override + public int hashCode() { + int result = type != null ? type.hashCode() : 0; + result = 31 * result + (value != null ? value.hashCode() : 0); + result = 31 * result + (children != null ? children.hashCode() : 0); + return result; + } + + @Override + public String toString() { + return Objects.toStringHelper(this) + .add("type", type) + .add("value", value) + .add("children", children) + .toString(); + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Formats.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Formats.java new file mode 100644 index 0000000000..68951826df --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Formats.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.util; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +public class Formats { + public enum Format { + PARQUET, + AVRO, + SEQUENCE, + TEXT + } + + public static Format detectFormat(InputStream stream) throws IOException { + byte[] first3 = new byte[3]; + stream.read(first3); + if (Arrays.equals(first3, new byte[]{'P', 'A', 'R'})) { + return Format.PARQUET; + } else if (Arrays.equals(first3, new byte[]{'O', 'b', 'j'})) { + return Format.AVRO; + } else if (Arrays.equals(first3, new byte[]{'S', 'E', 'Q'})) { + return Format.SEQUENCE; + } else { + return Format.TEXT; + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/GetClassLoader.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/GetClassLoader.java new file mode 100644 index 0000000000..1cacbd5e8a --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/GetClassLoader.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.util; + +import java.net.URL; +import java.net.URLClassLoader; +import java.security.PrivilegedAction; +import java.util.List; + +public class GetClassLoader implements PrivilegedAction { + private final URL[] urls; + + public GetClassLoader(List urls) { + this.urls = urls.toArray(new URL[urls.size()]); + } + + @Override + public ClassLoader run() { + return new URLClassLoader( + urls, Thread.currentThread().getContextClassLoader()); + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/RecordException.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/RecordException.java new file mode 100644 index 0000000000..f7e7b6cfe2 --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/RecordException.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
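Detection relies on the file signatures: Parquet's magic is `PAR1`, Avro object container files start with `Obj` plus a version byte, and Hadoop SequenceFiles with `SEQ`, so the first three bytes are enough to tell them apart. A usage sketch (path hypothetical):

```
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.parquet.cli.util.Formats;

public class DetectDemo {
  public static void main(String[] args) throws IOException {
    try (FileInputStream in = new FileInputStream("data.unknown")) {
      // Reads (and consumes) the first three bytes of the stream
      System.out.println(Formats.detectFormat(in)); // e.g. PARQUET
    }
  }
}
```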
+ */ + +package org.apache.parquet.cli.util; + +/** + * Exception to signal that a record could not be read or written. + */ +public class RecordException extends RuntimeException { + public RecordException(String message) { + super(message); + } + + public RecordException(String message, Throwable cause) { + super(message, cause); + } + + /** + * Precondition-style validation that throws a {@link RecordException}. + * + * @param isValid + * {@code true} if valid, {@code false} if an exception should be + * thrown + * @param message + * A String message for the exception. + */ + public static void check(boolean isValid, String message, Object... args) { + if (!isValid) { + String[] argStrings = new String[args.length]; + for (int i = 0; i < args.length; i += 1) { + argStrings[i] = String.valueOf(args[i]); + } + throw new RecordException( + String.format(String.valueOf(message), (Object[]) argStrings)); + } + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/RuntimeIOException.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/RuntimeIOException.java new file mode 100644 index 0000000000..e7233191fe --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/RuntimeIOException.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.cli.util; + +import java.io.IOException; + +/** + * RuntimeException wrapper for IOExceptions + */ +public class RuntimeIOException extends RuntimeException { + public RuntimeIOException(String message, IOException cause) { + super(message, cause); + } +} diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/Schemas.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Schemas.java new file mode 100644 index 0000000000..877c7cc86f --- /dev/null +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/Schemas.java @@ -0,0 +1,498 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
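A sketch of the `check` helper above; because every argument is converted with `String.valueOf` before formatting, `%s` is effectively the only useful placeholder:

```
import org.apache.parquet.cli.util.RecordException;

public class CheckDemo {
  public static void main(String[] args) {
    int fieldCount = 3;
    // Fails with RecordException("Expected 4 fields but found: 3")
    RecordException.check(fieldCount == 4,
        "Expected 4 fields but found: %s", fieldCount);
  }
}
```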
+ */ + +package org.apache.parquet.cli.util; + +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import com.google.common.io.Closeables; +import org.apache.parquet.cli.json.AvroJson; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.codehaus.jackson.node.NullNode; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +public class Schemas { + + public static Schema fromAvsc(InputStream in) throws IOException { + // the parser has state, so use a new one each time + return new Schema.Parser().parse(in); + } + + public static Schema fromAvro(InputStream in) throws IOException { + GenericDatumReader datumReader = + new GenericDatumReader(); + DataFileStream stream = null; + boolean threw = true; + + try { + stream = new DataFileStream<>(in, datumReader); + Schema schema = stream.getSchema(); + threw = false; + return schema; + } finally { + Closeables.close(stream, threw); + } + } + + public static Schema fromParquet(Configuration conf, URI location) throws IOException { + Path path = new Path(location); + FileSystem fs = path.getFileSystem(conf); + + ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path); + + String schemaString = footer.getFileMetaData() + .getKeyValueMetaData().get("parquet.avro.schema"); + if (schemaString == null) { + // try the older property + schemaString = footer.getFileMetaData() + .getKeyValueMetaData().get("avro.schema"); + } + + if (schemaString != null) { + return new Schema.Parser().parse(schemaString); + } else { + return new AvroSchemaConverter() + .convert(footer.getFileMetaData().getSchema()); + } + } + + public static Schema fromJSON(String name, InputStream in) throws IOException { + return AvroJson.inferSchema(in, name, 20); + } + + /** + * Returns whether null is allowed by the schema. + * + * @param schema a Schema + * @return true if schema allows the value to be null + */ + public static boolean nullOk(Schema schema) { + if (Schema.Type.NULL == schema.getType()) { + return true; + } else if (Schema.Type.UNION == schema.getType()) { + for (Schema possible : schema.getTypes()) { + if (nullOk(possible)) { + return true; + } + } + } + return false; + } + + /** + * Merges {@link Schema} instances if they are compatible. + *

+ * <p>
+ * Schemas are incompatible if:
+ * <ul>
+ *   <li>The {@link Schema.Type} does not match.</li>
+ *   <li>For record schemas, the record name does not match</li>
+ *   <li>For enum schemas, the enum name does not match</li>
+ * </ul>
+ * <p>
+ * Map value, array element, and record field types will use unions if
+ * necessary, and union schemas are merged recursively.
+ *
+ * @param schemas a set of {@code Schema} instances to merge
+ * @return a merged {@code Schema}
+ * @throws IllegalStateException if the schemas are not compatible
+ */
+ public static Schema merge(Iterable<Schema> schemas) {
+ Iterator<Schema> iter = schemas.iterator();
+ if (!iter.hasNext()) {
+ return null;
+ }
+ Schema result = iter.next();
+ while (iter.hasNext()) {
+ result = merge(result, iter.next());
+ }
+ return result;
+ }
+
+ /**
+ * Merges {@link Schema} instances and creates a union of schemas if any are
+ * incompatible.
+ *

+ * <p>
+ * Schemas are incompatible if:
+ * <ul>
+ *   <li>The {@link Schema.Type} does not match.</li>
+ *   <li>For record schemas, the record name does not match</li>
+ *   <li>For enum schemas, the enum name does not match</li>
+ * </ul>
+ * <p>
+ * Map value, array element, and record field types will use unions if
+ * necessary, and union schemas are merged recursively.
+ *
+ * @param schemas a set of {@code Schema} instances to merge
+ * @return a combined {@code Schema}
+ */
+ public static Schema mergeOrUnion(Iterable<Schema> schemas) {
+ Iterator<Schema> iter = schemas.iterator();
+ if (!iter.hasNext()) {
+ return null;
+ }
+ Schema result = iter.next();
+ while (iter.hasNext()) {
+ result = mergeOrUnion(result, iter.next());
+ }
+ return result;
+ }
+
+ /**
+ * Merges two {@link Schema} instances if they are compatible.
+ *

+ * <p>
+ * Two schemas are incompatible if:
+ * <ul>
+ *   <li>The {@link Schema.Type} does not match.</li>
+ *   <li>For record schemas, the record name does not match</li>
+ *   <li>For enum schemas, the enum name does not match</li>
+ * </ul>
+ * <p>
+ * Map value and array element types will use unions if necessary, and union + * schemas are merged recursively. + * + * @param left a {@code Schema} + * @param right a {@code Schema} + * @return a merged {@code Schema} + * @throws IllegalStateException if the schemas are not compatible + */ + public static Schema merge(Schema left, Schema right) { + Schema merged = mergeOnly(left, right); + Preconditions.checkState(merged != null, + "Cannot merge %s and %s", left, right); + return merged; + } + + /** + * Merges two {@link Schema} instances or returns {@code null}. + *
+   * The two schemas are merged if they are the same type. Records are merged
+   * if the two records have the same name or have no names but have a
+   * significant number of shared fields.
+   * <p>
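+   * For example (illustrative): an {@code int} schema and a {@code string}
+   * schema cannot be merged, so the result is a union of the two.
+   * <p>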
+ * @see {@link #mergeOrUnion} to return a union when a merge is not possible. + * + * @param left a {@code Schema} + * @param right a {@code Schema} + * @return a {@code Schema} for both types + */ + private static Schema mergeOrUnion(Schema left, Schema right) { + Schema merged = mergeOnly(left, right); + if (merged != null) { + return merged; + } + return union(left, right); + } + + /** + * Creates a union of two {@link Schema} instances. + *
+   * If either {@code Schema} is a union, this will attempt to merge the other
+   * schema with the types contained in that union before adding more types to
+   * the union that is produced.
+   * <p>
+   * If both schemas are not unions, no merge is attempted.
+   *
+   * @param left a {@code Schema}
+   * @param right a {@code Schema}
+   * @return a UNION schema of the two {@code Schema} instances
+   */
+  private static Schema union(Schema left, Schema right) {
+    if (left.getType() == Schema.Type.UNION) {
+      if (right.getType() == Schema.Type.UNION) {
+        // combine the unions by adding each type in right individually
+        Schema combined = left;
+        for (Schema type : right.getTypes()) {
+          combined = union(combined, type);
+        }
+        return combined;
+
+      } else {
+        boolean notMerged = true;
+        // combine a union with a non-union by checking if each type will merge
+        List<Schema> types = Lists.newArrayList();
+        Iterator<Schema> schemas = left.getTypes().iterator();
+        // try to merge each type and stop when one succeeds
+        while (schemas.hasNext()) {
+          Schema next = schemas.next();
+          Schema merged = mergeOnly(next, right);
+          if (merged != null) {
+            types.add(merged);
+            notMerged = false;
+            break;
+          } else {
+            // merge didn't work, add the type
+            types.add(next);
+          }
+        }
+        // add the remaining types from the left union
+        while (schemas.hasNext()) {
+          types.add(schemas.next());
+        }
+
+        if (notMerged) {
+          types.add(right);
+        }
+
+        return Schema.createUnion(types);
+      }
+    } else if (right.getType() == Schema.Type.UNION) {
+      return union(right, left);
+    }
+
+    return Schema.createUnion(ImmutableList.of(left, right));
+  }
+
+  /**
+   * Merges two {@link Schema} instances or returns {@code null}.
+   * <p>
+   * The two schemas are merged if they are the same type. Records are merged
+   * if the two records have the same name or have no names but have a
+   * significant number of shared fields.
+   * <p>
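+   * For example (illustrative, from the primitive promotion this method
+   * performs): {@code int} merged with {@code long} yields {@code long}, and
+   * {@code float} merged with {@code double} yields {@code double}; {@code int}
+   * and {@code string} do not merge, so this method returns {@code null}.
+   * <p>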
+ * @see {@link #mergeOrUnion} to return a union when a merge is not possible. + * + * @param left a {@code Schema} + * @param right a {@code Schema} + * @return a merged {@code Schema} or {@code null} if merging is not possible + */ + private static Schema mergeOnly(Schema left, Schema right) { + if (Objects.equal(left, right)) { + return left; + } + + // handle primitive type promotion; doesn't promote integers to floats + switch (left.getType()) { + case INT: + if (right.getType() == Schema.Type.LONG) { + return right; + } + break; + case LONG: + if (right.getType() == Schema.Type.INT) { + return left; + } + break; + case FLOAT: + if (right.getType() == Schema.Type.DOUBLE) { + return right; + } + break; + case DOUBLE: + if (right.getType() == Schema.Type.FLOAT) { + return left; + } + } + + // any other cases where the types don't match must be combined by a union + if (left.getType() != right.getType()) { + return null; + } + + switch (left.getType()) { + case UNION: + return union(left, right); + case RECORD: + if (left.getName() == null && right.getName() == null && + fieldSimilarity(left, right) < SIMILARITY_THRESH) { + return null; + } else if (!Objects.equal(left.getName(), right.getName())) { + return null; + } + + Schema combinedRecord = Schema.createRecord( + coalesce(left.getName(), right.getName()), + coalesce(left.getDoc(), right.getDoc()), + coalesce(left.getNamespace(), right.getNamespace()), + false + ); + combinedRecord.setFields(mergeFields(left, right)); + + return combinedRecord; + + case MAP: + return Schema.createMap( + mergeOrUnion(left.getValueType(), right.getValueType())); + + case ARRAY: + return Schema.createArray( + mergeOrUnion(left.getElementType(), right.getElementType())); + + case ENUM: + if (!Objects.equal(left.getName(), right.getName())) { + return null; + } + Set symbols = Sets.newLinkedHashSet(); + symbols.addAll(left.getEnumSymbols()); + symbols.addAll(right.getEnumSymbols()); + return Schema.createEnum( + left.getName(), + coalesce(left.getDoc(), right.getDoc()), + coalesce(left.getNamespace(), right.getNamespace()), + ImmutableList.copyOf(symbols) + ); + + default: + // all primitives are handled before the switch by the equality check. + // schemas that reach this point are not primitives and also not any of + // the above known types. + throw new UnsupportedOperationException( + "Unknown schema type: " + left.getType()); + } + } + + private static final Schema NULL = Schema.create(Schema.Type.NULL); + private static final NullNode NULL_DEFAULT = NullNode.getInstance(); + + /** + * Returns a union {@link Schema} of NULL and the given {@code schema}. + *
+   * <p>
+ * A NULL schema is always the first type in the union so that a null default + * value can be set. + * + * @param schema a {@code Schema} + * @return a union of null and the given schema + */ + private static Schema nullableForDefault(Schema schema) { + if (schema.getType() == Schema.Type.NULL) { + return schema; + } + + if (schema.getType() != Schema.Type.UNION) { + return Schema.createUnion(ImmutableList.of(NULL, schema)); + } + + if (schema.getTypes().get(0).getType() == Schema.Type.NULL) { + return schema; + } + + List types = Lists.newArrayList(); + types.add(NULL); + for (Schema type : schema.getTypes()) { + if (type.getType() != Schema.Type.NULL) { + types.add(type); + } + } + + return Schema.createUnion(types); + } + + private static List mergeFields(Schema left, Schema right) { + List fields = Lists.newArrayList(); + for (Schema.Field leftField : left.getFields()) { + Schema.Field rightField = right.getField(leftField.name()); + if (rightField != null) { + fields.add(new Schema.Field( + leftField.name(), + mergeOrUnion(leftField.schema(), rightField.schema()), + coalesce(leftField.doc(), rightField.doc()), + coalesce(leftField.defaultValue(), rightField.defaultValue()) + )); + } else { + if (leftField.defaultValue() != null) { + fields.add(copy(leftField)); + } else { + fields.add(new Schema.Field( + leftField.name(), nullableForDefault(leftField.schema()), + leftField.doc(), NULL_DEFAULT + )); + } + } + } + + for (Schema.Field rightField : right.getFields()) { + if (left.getField(rightField.name()) == null) { + if (rightField.defaultValue() != null) { + fields.add(copy(rightField)); + } else { + fields.add(new Schema.Field( + rightField.name(), nullableForDefault(rightField.schema()), + rightField.doc(), NULL_DEFAULT + )); + } + } + } + + return fields; + } + + /** + * Creates a new field with the same name, schema, doc, and default value as + * the incoming schema. + *
+   * <p>
+   * Fields cannot be used in more than one record, so a copy is required to
+   * reuse a field's definition.
+   */
+  public static Schema.Field copy(Schema.Field field) {
+    return new Schema.Field(
+        field.name(), field.schema(), field.doc(), field.defaultValue());
+  }
+
+  private static float fieldSimilarity(Schema left, Schema right) {
+    // check whether the unnamed records appear to be the same record
+    Set<String> leftNames = names(left.getFields());
+    Set<String> rightNames = names(right.getFields());
+    int common = Sets.intersection(leftNames, rightNames).size();
+    float leftRatio = ((float) common) / ((float) leftNames.size());
+    float rightRatio = ((float) common) / ((float) rightNames.size());
+    return hmean(leftRatio, rightRatio);
+  }
+
+  private static Set<String> names(Collection<Schema.Field> fields) {
+    Set<String> names = Sets.newHashSet();
+    for (Schema.Field field : fields) {
+      names.add(field.name());
+    }
+    return names;
+  }
+
+  private static final float SIMILARITY_THRESH = 0.3f;
+
+  private static float hmean(float left, float right) {
+    return (2.0f * left * right) / (left + right);
+  }
+
+  /**
+   * Returns the first non-null object that is passed in.
+   */
+  @SafeVarargs
+  private static <E> E coalesce(E... objects) {
+    for (E object : objects) {
+      if (object != null) {
+        return object;
+      }
+    }
+    return null;
+  }
+}
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/util/SeekableFSDataInputStream.java b/parquet-cli/src/main/java/org/apache/parquet/cli/util/SeekableFSDataInputStream.java
new file mode 100644
index 0000000000..8a8b41e774
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/util/SeekableFSDataInputStream.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.cli.util;
+
+import org.apache.avro.file.SeekableInput;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * A wrapper for FSDataInputStream that implements Avro's SeekableInput.
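+ * <p>
+ * For example (illustrative only), this lets Avro's file reader read a data
+ * file from any Hadoop FileSystem:
+ * <pre>
+ *   SeekableInput in = new SeekableFSDataInputStream(fs, path);
+ *   DataFileReader&lt;GenericRecord&gt; reader = new DataFileReader&lt;GenericRecord&gt;(
+ *       in, new GenericDatumReader&lt;GenericRecord&gt;());
+ * </pre>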
+ */ +public class SeekableFSDataInputStream extends InputStream implements SeekableInput { + private final FSDataInputStream in; + private final FileStatus stat; + + public SeekableFSDataInputStream(FileSystem fs, Path file) throws IOException { + this.in = fs.open(file); + this.stat = fs.getFileStatus(file); + } + + @Override + public void seek(long p) throws IOException { + in.seek(p); + } + + @Override + public long tell() throws IOException { + return in.getPos(); + } + + @Override + public long length() throws IOException { + return stat.getLen(); + } + + @Override + public int read(byte[] b) throws IOException { + return in.read(b); + } + + @Override + public int read() throws IOException { + return in.read(); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + return in.read(b, off, len); + } + + @Override + public void close() throws IOException { + in.close(); + } +} diff --git a/parquet-cli/src/main/resources/META-INF/LICENSE b/parquet-cli/src/main/resources/META-INF/LICENSE new file mode 100644 index 0000000000..2b581f8e64 --- /dev/null +++ b/parquet-cli/src/main/resources/META-INF/LICENSE @@ -0,0 +1,348 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + +-------------------------------------------------------------------------------- + +This product depends on Apache Thrift and includes it in this binary artifact. + +Copyright: 2006-2010 The Apache Software Foundation. +Home page: https://thrift.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product depends on SLF4J and includes SLF4J in this binary artifact. SLF4J +is a simple logging facade for Java. + +Copyright: 2004-2013 QOS.ch. +Home page: http://www.slf4j.org/ +License: http://slf4j.org/license.html (MIT license) + +The following is the SLF4J license (MIT): + + Copyright (c) 2004-2013 QOS.ch + All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This project includes code from Daniel Lemire's JavaFastPFOR project in this +binary artifact. The "Lemire" bit packing classes produced by parquet-generator +are derived from the JavaFastPFOR project. 
+
+Copyright: 2013 Daniel Lemire
+Home page: http://lemire.me/en/
+Project page: https://github.com/lemire/JavaFastPFOR
+License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This product depends on Apache Avro and includes it in this binary artifact.
+
+Copyright: 2010-2016 The Apache Software Foundation.
+Home page: https://avro.apache.org/
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This product depends on fastutil and includes it in this binary artifact.
+Fastutil provides type-specific collection implementations.
+
+Copyright: 2002-2014 Sebastiano Vigna
+Home page: http://fastutil.di.unimi.it/
+License: http://www.apache.org/licenses/LICENSE-2.0.html
+
+--------------------------------------------------------------------------------
+
+This product depends on Jackson and includes it in this binary artifact.
+Jackson is a high-performance JSON processor.
+
+Copyright: 2007-2015 Tatu Saloranta and other contributors
+Home page: http://jackson.codehaus.org/
+Home page: http://wiki.fasterxml.com/JacksonHome
+License: http://www.apache.org/licenses/LICENSE-2.0.txt
+
+--------------------------------------------------------------------------------
+
+This product depends on snappy-java and includes it in this binary artifact.
+Snappy is a fast compression codec that aims for high speeds and reasonable
+compression, developed by Google.
+
+Copyright: 2011 Taro L. Saito and other contributors
+Home page: http://www.xerial.org/
+License: http://www.apache.org/licenses/LICENSE-2.0.txt
+
+--------------------------------------------------------------------------------
+
+This product depends on Apache Commons and includes commons-codec,
+commons-pool, and commons-compress in this binary artifact.
+
+Copyright: 2002-2015 The Apache Software Foundation.
+Home page: https://commons.apache.org/proper/commons-codec/
+Home page: https://commons.apache.org/proper/commons-pool/
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+Commons Compress includes files derived from the LZMA SDK, version 9.20 (C/ and
+CPP/7zip/), in the package org.apache.commons.compress.archivers.sevenz:
+
+| LZMA SDK is placed in the public domain. (http://www.7-zip.org/sdk.html)
+
+--------------------------------------------------------------------------------
+
+This product depends on Google guava and includes it in this binary artifact.
+
+Copyright: 2010-2015 The Guava Authors
+Home page: https://github.com/google/guava
+License: http://www.apache.org/licenses/LICENSE-2.0
+
+--------------------------------------------------------------------------------
+
+This product depends on JCommander and includes it in this binary artifact.
+
+Copyright: 2012, Cedric Beust and contributors
+Home page: http://jcommander.org
+License: https://github.com/cbeust/jcommander/blob/master/license.txt
+
+--------------------------------------------------------------------------------
+
+This product depends on OpenCSV and includes it in this binary artifact.
+ +Copyright: 2006 Glen Smith and contributors +Home page: http://opencsv.sourceforge.net/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +---------------------------------------------------------------------- + +License for paranamer, included in this binary artifact: + +Copyright (c) 2006 Paul Hammant & ThoughtWorks Inc +All rights reserved. + +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| 1. Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| 2. Redistributions in binary form must reproduce the above copyright +| notice, this list of conditions and the following disclaimer in the +| documentation and/or other materials provided with the distribution. +| 3. Neither the name of the copyright holders nor the names of its +| contributors may be used to endorse or promote products derived from +| this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +| THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +License for xz compression, included in this binary artifact: + +Home page: http://tukaani.org/xz/java.html + +| This Java implementation of XZ has been put into the public domain, thus you +| can do whatever you want with it. All the files in the package have been +| written by Lasse Collin, but some files are heavily based on public domain code +| written by Igor Pavlov. + diff --git a/parquet-cli/src/main/resources/META-INF/NOTICE b/parquet-cli/src/main/resources/META-INF/NOTICE new file mode 100644 index 0000000000..f90733d38a --- /dev/null +++ b/parquet-cli/src/main/resources/META-INF/NOTICE @@ -0,0 +1,45 @@ + +Apache Parquet MR +Copyright 2016 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +-------------------------------------------------------------------------------- + +This project includes code from Kite, developed at Cloudera, Inc. with +the following copyright notice: + +| Copyright 2013 Cloudera Inc. +| +| Licensed under the Apache License, Version 2.0 (the "License"); +| you may not use this file except in compliance with the License. +| You may obtain a copy of the License at +| +| http://www.apache.org/licenses/LICENSE-2.0 +| +| Unless required by applicable law or agreed to in writing, software +| distributed under the License is distributed on an "AS IS" BASIS, +| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +| See the License for the specific language governing permissions and +| limitations under the License. 
+ +-------------------------------------------------------------------------------- + +This project includes code from Netflix, Inc. with the following copyright +notice: + +| Copyright 2016 Netflix, Inc. +| +| Licensed under the Apache License, Version 2.0 (the "License"); +| you may not use this file except in compliance with the License. +| You may obtain a copy of the License at +| +| http://www.apache.org/licenses/LICENSE-2.0 +| +| Unless required by applicable law or agreed to in writing, software +| distributed under the License is distributed on an "AS IS" BASIS, +| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +| See the License for the specific language governing permissions and +| limitations under the License. + diff --git a/parquet-cli/src/main/resources/cli-logging.properties b/parquet-cli/src/main/resources/cli-logging.properties new file mode 100644 index 0000000000..73919859c7 --- /dev/null +++ b/parquet-cli/src/main/resources/cli-logging.properties @@ -0,0 +1,51 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# debug log4j configuration +#log4j.debug=true + +# by default, log anything but cli console to component logger +log4j.rootLogger = WARN, component + +# Set the appender named console to be a ConsoleAppender +log4j.appender.console=org.apache.log4j.ConsoleAppender + +# CLI console output +log4j.logger.org.apache.parquet.cli=INFO, console +log4j.additivity.org.apache.parquet.cli=false + +# Define the layout for console appender +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%m%n + +# Change to turn on component logging +log4j.appender.component=org.apache.log4j.varia.NullAppender + +# Define the layout for component appender +log4j.appender.component.layout=org.apache.log4j.PatternLayout +log4j.appender.component.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p :: %m [%C]%n + +# silence native code warnings +log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR + +log4j.logger.org.apache.parquet.CorruptStatistics=ERROR + +# set up logging levels for MR +log4j.logger.org.apache.hadoop.mapred.LocalJobRunner=WARN, console +log4j.logger.org.apache.hadoop.mapreduce.Job=INFO, console diff --git a/parquet-common/src/main/java/org/apache/parquet/Exceptions.java b/parquet-common/src/main/java/org/apache/parquet/Exceptions.java new file mode 100644 index 0000000000..bdd531c153 --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/Exceptions.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet; + +public class Exceptions { + /** + * If the given throwable is an instance of E, throw it as an E. + */ + public static void throwIfInstance(Throwable t, + Class excClass) + throws E { + if (excClass.isAssignableFrom(t.getClass())) { + // the throwable is already an exception, so return it + throw excClass.cast(t); + } + } +} diff --git a/parquet-common/src/main/java/org/apache/parquet/util/DynConstructors.java b/parquet-common/src/main/java/org/apache/parquet/util/DynConstructors.java new file mode 100644 index 0000000000..e1dddf1438 --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/util/DynConstructors.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.util; + +import org.apache.parquet.Preconditions; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.parquet.Exceptions.throwIfInstance; + +public class DynConstructors { + public static class Ctor extends DynMethods.UnboundMethod { + private final Constructor ctor; + private final Class constructed; + + private Ctor(Constructor constructor, Class constructed) { + super(null, "newInstance"); + this.ctor = constructor; + this.constructed = constructed; + } + + public Class getConstructedClass() { + return constructed; + } + + public C newInstanceChecked(Object... args) throws Exception { + try { + return ctor.newInstance(args); + } catch (InstantiationException e) { + throw e; + } catch (IllegalAccessException e) { + throw e; + } catch (InvocationTargetException e) { + throwIfInstance(e.getCause(), Exception.class); + throwIfInstance(e.getCause(), RuntimeException.class); + throw new RuntimeException(e.getCause()); + } + } + + public C newInstance(Object... 
args) { + try { + return newInstanceChecked(args); + } catch (Exception e) { + throwIfInstance(e, RuntimeException.class); + throw new RuntimeException(e); + } + } + + @Override + @SuppressWarnings("unchecked") + public R invoke(Object target, Object... args) { + Preconditions.checkArgument(target == null, + "Invalid call to constructor: target must be null"); + return (R) newInstance(args); + } + + @Override + @SuppressWarnings("unchecked") + public R invokeChecked(Object target, Object... args) throws Exception { + Preconditions.checkArgument(target == null, + "Invalid call to constructor: target must be null"); + return (R) newInstanceChecked(args); + } + + @Override + public DynMethods.BoundMethod bind(Object receiver) { + throw new IllegalStateException("Cannot bind constructors"); + } + + @Override + public boolean isStatic() { + return true; + } + + @Override + public String toString() { + return getClass().getSimpleName() + + "(constructor=" + ctor + ", class=" + constructed + ")"; + } + } + + public static class Builder { + private final Class baseClass; + private ClassLoader loader = Thread.currentThread().getContextClassLoader(); + private Ctor ctor = null; + private Map problems = new HashMap(); + + public Builder(Class baseClass) { + this.baseClass = baseClass; + } + + public Builder() { + this.baseClass = null; + } + + /** + * Set the {@link ClassLoader} used to lookup classes by name. + *
+     * <p>
+ * If not set, the current thread's ClassLoader is used. + * + * @param loader a ClassLoader + * @return this Builder for method chaining + */ + public Builder loader(ClassLoader loader) { + this.loader = loader; + return this; + } + + public Builder impl(String className, Class... types) { + // don't do any work if an implementation has been found + if (ctor != null) { + return this; + } + + try { + Class targetClass = Class.forName(className, true, loader); + impl(targetClass, types); + } catch (NoClassDefFoundError e) { + // cannot load this implementation + problems.put(className, e); + } catch (ClassNotFoundException e) { + // not the right implementation + problems.put(className, e); + } + return this; + } + + public Builder impl(Class targetClass, Class... types) { + // don't do any work if an implementation has been found + if (ctor != null) { + return this; + } + + try { + ctor = new Ctor(targetClass.getConstructor(types), targetClass); + } catch (NoSuchMethodException e) { + // not the right implementation + problems.put(methodName(targetClass, types), e); + } + return this; + } + + public Builder hiddenImpl(Class... types) { + hiddenImpl(baseClass, types); + return this; + } + + @SuppressWarnings("unchecked") + public Builder hiddenImpl(String className, Class... types) { + // don't do any work if an implementation has been found + if (ctor != null) { + return this; + } + + try { + Class targetClass = Class.forName(className, true, loader); + hiddenImpl(targetClass, types); + } catch (NoClassDefFoundError e) { + // cannot load this implementation + problems.put(className, e); + } catch (ClassNotFoundException e) { + // not the right implementation + problems.put(className, e); + } + return this; + } + + public Builder hiddenImpl(Class targetClass, Class... 
types) { + // don't do any work if an implementation has been found + if (ctor != null) { + return this; + } + + try { + Constructor hidden = targetClass.getDeclaredConstructor(types); + AccessController.doPrivileged(new MakeAccessible(hidden)); + ctor = new Ctor(hidden, targetClass); + } catch (SecurityException e) { + // unusable + problems.put(methodName(targetClass, types), e); + } catch (NoSuchMethodException e) { + // not the right implementation + problems.put(methodName(targetClass, types), e); + } + return this; + } + + @SuppressWarnings("unchecked") + public Ctor buildChecked() throws NoSuchMethodException { + if (ctor != null) { + return ctor; + } + throw new NoSuchMethodException("Cannot find constructor for " + + baseClass + "\n" + formatProblems(problems)); + } + + @SuppressWarnings("unchecked") + public Ctor build() { + if (ctor != null) { + return ctor; + } + throw new RuntimeException("Cannot find constructor for " + + baseClass + "\n" + formatProblems(problems)); + } + } + + private static class MakeAccessible implements PrivilegedAction { + private Constructor hidden; + + public MakeAccessible(Constructor hidden) { + this.hidden = hidden; + } + + @Override + public Void run() { + hidden.setAccessible(true); + return null; + } + } + + private static String formatProblems(Map problems) { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Map.Entry problem : problems.entrySet()) { + if (first) { + first = false; + } else { + sb.append("\n"); + } + sb.append("\tMissing ").append(problem.getKey()).append(" [") + .append(problem.getValue().getClass().getName()).append(": ") + .append(problem.getValue().getMessage()).append("]"); + } + return sb.toString(); + } + + private static String methodName(Class targetClass, Class... types) { + StringBuilder sb = new StringBuilder(); + sb.append(targetClass.getName()).append("("); + boolean first = true; + for (Class type : types) { + if (first) { + first = false; + } else { + sb.append(","); + } + sb.append(type.getName()); + } + sb.append(")"); + return sb.toString(); + } +} diff --git a/parquet-common/src/main/java/org/apache/parquet/util/DynMethods.java b/parquet-common/src/main/java/org/apache/parquet/util/DynMethods.java new file mode 100644 index 0000000000..769f31c9d3 --- /dev/null +++ b/parquet-common/src/main/java/org/apache/parquet/util/DynMethods.java @@ -0,0 +1,520 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.util; + +import org.apache.parquet.Preconditions; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.util.Arrays; + +import static org.apache.parquet.Exceptions.throwIfInstance; + +public class DynMethods { + + /** + * Convenience wrapper class around {@link java.lang.reflect.Method}. + * + * Allows callers to invoke the wrapped method with all Exceptions wrapped by + * RuntimeException, or with a single Exception catch block. + */ + public static class UnboundMethod { + + private final Method method; + private final String name; + private final int argLength; + + UnboundMethod(Method method, String name) { + this.method = method; + this.name = name; + this.argLength = (method == null || method.isVarArgs()) ? -1 : + method.getParameterTypes().length; + } + + @SuppressWarnings("unchecked") + public R invokeChecked(Object target, Object... args) throws Exception { + try { + if (argLength < 0) { + return (R) method.invoke(target, args); + } else { + return (R) method.invoke(target, Arrays.copyOfRange(args, 0, argLength)); + } + + } catch (InvocationTargetException e) { + throwIfInstance(e.getCause(), Exception.class); + throwIfInstance(e.getCause(), RuntimeException.class); + throw new RuntimeException(e.getCause()); + } + } + + public R invoke(Object target, Object... args) { + try { + return this.invokeChecked(target, args); + } catch (Exception e) { + throwIfInstance(e, RuntimeException.class); + throw new RuntimeException(e); + } + } + + /** + * Returns this method as a BoundMethod for the given receiver. + * + * @param receiver an Object to receive the method invocation + * @return a {@link BoundMethod} for this method and the receiver + * @throws IllegalStateException if the method is static + * @throws IllegalArgumentException if the receiver's class is incompatible + */ + public BoundMethod bind(Object receiver) { + Preconditions.checkState(!isStatic(), + "Cannot bind static method " + method.toGenericString()); + Preconditions.checkArgument( + method.getDeclaringClass().isAssignableFrom(receiver.getClass()), + "Cannot bind " + method.toGenericString() + " to instance of " + + receiver.getClass()); + + return new BoundMethod(this, receiver); + } + + /** + * @return whether the method is a static method + */ + public boolean isStatic() { + return Modifier.isStatic(method.getModifiers()); + } + + /** + * @return whether the method is a noop + */ + public boolean isNoop() { + return this == NOOP; + } + + /** + * Returns this method as a StaticMethod. + * + * @return a {@link StaticMethod} for this method + * @throws IllegalStateException if the method is not static + */ + public StaticMethod asStatic() { + Preconditions.checkState(isStatic(), "Method is not static"); + return new StaticMethod(this); + } + + public String toString() { + return "DynMethods.UnboundMethod(name=" + name +" method=" + + method.toGenericString() + ")"; + } + + /** + * Singleton {@link UnboundMethod}, performs no operation and returns null. + */ + private static UnboundMethod NOOP = new UnboundMethod(null, "NOOP") { + @Override + public R invokeChecked(Object target, Object... 
args) throws Exception { + return null; + } + + @Override + public BoundMethod bind(Object receiver) { + return new BoundMethod(this, receiver); + } + + @Override + public StaticMethod asStatic() { + return new StaticMethod(this); + } + + @Override + public boolean isStatic() { + return true; + } + + @Override + public String toString() { + return "DynMethods.UnboundMethod(NOOP)"; + } + }; + } + + public static class BoundMethod { + private final UnboundMethod method; + private final Object receiver; + + private BoundMethod(UnboundMethod method, Object receiver) { + this.method = method; + this.receiver = receiver; + } + + public R invokeChecked(Object... args) throws Exception { + return method.invokeChecked(receiver, args); + } + + public R invoke(Object... args) { + return method.invoke(receiver, args); + } + } + + public static class StaticMethod { + private final UnboundMethod method; + + private StaticMethod(UnboundMethod method) { + this.method = method; + } + + public R invokeChecked(Object... args) throws Exception { + return method.invokeChecked(null, args); + } + + public R invoke(Object... args) { + return method.invoke(null, args); + } + } + + public static class Builder { + private final String name; + private ClassLoader loader = Thread.currentThread().getContextClassLoader(); + private UnboundMethod method = null; + + public Builder(String methodName) { + this.name = methodName; + } + + /** + * Set the {@link ClassLoader} used to lookup classes by name. + *
+     * <p>
+ * If not set, the current thread's ClassLoader is used. + * + * @param loader a ClassLoader + * @return this Builder for method chaining + */ + public Builder loader(ClassLoader loader) { + this.loader = loader; + return this; + } + + /** + * If no implementation has been found, adds a NOOP method. + * + * Note: calls to impl will not match after this method is called! + * + * @return this Builder for method chaining + */ + public Builder orNoop() { + if (method == null) { + this.method = UnboundMethod.NOOP; + } + return this; + } + + /** + * Checks for an implementation, first finding the given class by name. + * + * @param className name of a class + * @param methodName name of a method (different from constructor) + * @param argClasses argument classes for the method + * @return this Builder for method chaining + * @see {@link java.lang.Class#forName(String)} + * @see {@link java.lang.Class#getMethod(String, Class[])} + */ + public Builder impl(String className, String methodName, Class... argClasses) { + // don't do any work if an implementation has been found + if (method != null) { + return this; + } + + try { + Class targetClass = Class.forName(className, true, loader); + impl(targetClass, methodName, argClasses); + } catch (ClassNotFoundException e) { + // not the right implementation + } + return this; + } + + /** + * Checks for an implementation, first finding the given class by name. + * + * The name passed to the constructor is the method name used. + * + * @param className name of a class + * @param argClasses argument classes for the method + * @return this Builder for method chaining + * @see {@link java.lang.Class#forName(String)} + * @see {@link java.lang.Class#getMethod(String, Class[])} + */ + public Builder impl(String className, Class... argClasses) { + impl(className, name, argClasses); + return this; + } + + /** + * Checks for a method implementation. + * + * @param methodName name of a method (different from constructor) + * @param argClasses argument classes for the method + * @return this Builder for method chaining + * @see {@link java.lang.Class#forName(String)} + * @see {@link java.lang.Class#getMethod(String, Class[])} + */ + public Builder impl(Class targetClass, String methodName, Class... argClasses) { + // don't do any work if an implementation has been found + if (method != null) { + return this; + } + + try { + this.method = new UnboundMethod( + targetClass.getMethod(methodName, argClasses), name); + } catch (NoSuchMethodException e) { + // not the right implementation + } + return this; + } + + /** + * Checks for a method implementation. + * + * The name passed to the constructor is the method name used. + * + * @param argClasses argument classes for the method + * @return this Builder for method chaining + * @see {@link java.lang.Class#forName(String)} + * @see {@link java.lang.Class#getMethod(String, Class[])} + */ + public Builder impl(Class targetClass, Class... argClasses) { + impl(targetClass, name, argClasses); + return this; + } + + public Builder ctorImpl(Class targetClass, Class... argClasses) { + // don't do any work if an implementation has been found + if (method != null) { + return this; + } + + try { + this.method = new DynConstructors.Builder() + .impl(targetClass, argClasses) + .buildChecked(); + } catch (NoSuchMethodException e) { + // not the right implementation + } + return this; + } + + public Builder ctorImpl(String className, Class... 
argClasses) { + // don't do any work if an implementation has been found + if (method != null) { + return this; + } + + try { + this.method = new DynConstructors.Builder() + .impl(className, argClasses) + .buildChecked(); + } catch (NoSuchMethodException e) { + // not the right implementation + } + return this; + } + + /** + * Checks for an implementation, first finding the given class by name. + * + * @param className name of a class + * @param methodName name of a method (different from constructor) + * @param argClasses argument classes for the method + * @return this Builder for method chaining + * @see {@link java.lang.Class#forName(String)} + * @see {@link java.lang.Class#getMethod(String, Class[])} + */ + public Builder hiddenImpl(String className, String methodName, Class... argClasses) { + // don't do any work if an implementation has been found + if (method != null) { + return this; + } + + try { + Class targetClass = Class.forName(className, true, loader); + hiddenImpl(targetClass, methodName, argClasses); + } catch (ClassNotFoundException e) { + // not the right implementation + } + return this; + } + + /** + * Checks for an implementation, first finding the given class by name. + * + * The name passed to the constructor is the method name used. + * + * @param className name of a class + * @param argClasses argument classes for the method + * @return this Builder for method chaining + * @see {@link java.lang.Class#forName(String)} + * @see {@link java.lang.Class#getMethod(String, Class[])} + */ + public Builder hiddenImpl(String className, Class... argClasses) { + hiddenImpl(className, name, argClasses); + return this; + } + + /** + * Checks for a method implementation. + * + * @param methodName name of a method (different from constructor) + * @param argClasses argument classes for the method + * @return this Builder for method chaining + * @see {@link java.lang.Class#forName(String)} + * @see {@link java.lang.Class#getMethod(String, Class[])} + */ + public Builder hiddenImpl(Class targetClass, String methodName, Class... argClasses) { + // don't do any work if an implementation has been found + if (method != null) { + return this; + } + + try { + Method hidden = targetClass.getDeclaredMethod(methodName, argClasses); + AccessController.doPrivileged(new MakeAccessible(hidden)); + this.method = new UnboundMethod(hidden, name); + } catch (SecurityException e) { + // unusable + } catch (NoSuchMethodException e) { + // not the right implementation + } + return this; + } + + /** + * Checks for a method implementation. + * + * The name passed to the constructor is the method name used. + * + * @param argClasses argument classes for the method + * @return this Builder for method chaining + * @see {@link java.lang.Class#forName(String)} + * @see {@link java.lang.Class#getMethod(String, Class[])} + */ + public Builder hiddenImpl(Class targetClass, Class... argClasses) { + hiddenImpl(targetClass, name, argClasses); + return this; + } + + /** + * Returns the first valid implementation as a UnboundMethod or throws a + * NoSuchMethodException if there is none. 
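+     * <p>
+     * For example (illustrative; {@code Concatenator} is the test helper
+     * added elsewhere in this patch):
+     * <pre>
+     *   DynMethods.UnboundMethod concat = new DynMethods.Builder("concat")
+     *       .impl(Concatenator.class, String.class, String.class)
+     *       .buildChecked();
+     *   String result = concat.invoke(new Concatenator("-"), "a", "b");
+     * </pre>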
+ * + * @return a {@link UnboundMethod} with a valid implementation + * @throws NoSuchMethodException if no implementation was found + */ + public UnboundMethod buildChecked() throws NoSuchMethodException { + if (method != null) { + return method; + } else { + throw new NoSuchMethodException("Cannot find method: " + name); + } + } + + /** + * Returns the first valid implementation as a UnboundMethod or throws a + * RuntimeError if there is none. + * + * @return a {@link UnboundMethod} with a valid implementation + * @throws RuntimeException if no implementation was found + */ + public UnboundMethod build() { + if (method != null) { + return method; + } else { + throw new RuntimeException("Cannot find method: " + name); + } + } + + /** + * Returns the first valid implementation as a BoundMethod or throws a + * NoSuchMethodException if there is none. + * + * @param receiver an Object to receive the method invocation + * @return a {@link BoundMethod} with a valid implementation and receiver + * @throws IllegalStateException if the method is static + * @throws IllegalArgumentException if the receiver's class is incompatible + * @throws NoSuchMethodException if no implementation was found + */ + public BoundMethod buildChecked(Object receiver) throws NoSuchMethodException { + return buildChecked().bind(receiver); + } + + /** + * Returns the first valid implementation as a BoundMethod or throws a + * RuntimeError if there is none. + * + * @param receiver an Object to receive the method invocation + * @return a {@link BoundMethod} with a valid implementation and receiver + * @throws IllegalStateException if the method is static + * @throws IllegalArgumentException if the receiver's class is incompatible + * @throws RuntimeException if no implementation was found + */ + public BoundMethod build(Object receiver) { + return build().bind(receiver); + } + + /** + * Returns the first valid implementation as a StaticMethod or throws a + * NoSuchMethodException if there is none. + * + * @return a {@link StaticMethod} with a valid implementation + * @throws IllegalStateException if the method is not static + * @throws NoSuchMethodException if no implementation was found + */ + public StaticMethod buildStaticChecked() throws NoSuchMethodException { + return buildChecked().asStatic(); + } + + /** + * Returns the first valid implementation as a StaticMethod or throws a + * RuntimeException if there is none. + * + * @return a {@link StaticMethod} with a valid implementation + * @throws IllegalStateException if the method is not static + * @throws RuntimeException if no implementation was found + */ + public StaticMethod buildStatic() { + return build().asStatic(); + } + + } + + private static class MakeAccessible implements PrivilegedAction { + private Method hidden; + + public MakeAccessible(Method hidden) { + this.hidden = hidden; + } + + @Override + public Void run() { + hidden.setAccessible(true); + return null; + } + } +} diff --git a/parquet-common/src/test/java/org/apache/parquet/TestUtils.java b/parquet-common/src/test/java/org/apache/parquet/TestUtils.java new file mode 100644 index 0000000000..2062827399 --- /dev/null +++ b/parquet-common/src/test/java/org/apache/parquet/TestUtils.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet;
+
+import org.junit.Assert;
+import java.util.concurrent.Callable;
+
+public class TestUtils {
+
+  /**
+   * A convenience method to avoid a large number of {@code @Test(expected=...)} tests.
+   *
+   * @param message A String message to describe this assertion
+   * @param expected An Exception class that the Callable should throw
+   * @param callable A Callable that is expected to throw the exception
+   */
+  public static void assertThrows(
+      String message, Class<? extends Exception> expected, Callable callable) {
+    try {
+      callable.call();
+      Assert.fail("No exception was thrown (" + message + "), expected: " +
+          expected.getName());
+    } catch (Exception actual) {
+      try {
+        Assert.assertEquals(message, expected, actual.getClass());
+      } catch (AssertionError e) {
+        e.addSuppressed(actual);
+        throw e;
+      }
+    }
+  }
+
+  /**
+   * A convenience method to avoid a large number of {@code @Test(expected=...)} tests.
+   *
+   * @param message A String message to describe this assertion
+   * @param expected An Exception class that the Runnable should throw
+   * @param runnable A Runnable that is expected to throw the runtime exception
+   */
+  public static void assertThrows(
+      String message, Class<? extends Exception> expected, Runnable runnable) {
+    try {
+      runnable.run();
+      Assert.fail("No exception was thrown (" + message + "), expected: " +
+          expected.getName());
+    } catch (Exception actual) {
+      try {
+        Assert.assertEquals(message, expected, actual.getClass());
+      } catch (AssertionError e) {
+        e.addSuppressed(actual);
+        throw e;
+      }
+    }
+  }
+}
diff --git a/parquet-common/src/test/java/org/apache/parquet/util/Concatenator.java b/parquet-common/src/test/java/org/apache/parquet/util/Concatenator.java
new file mode 100644
index 0000000000..261c2be113
--- /dev/null
+++ b/parquet-common/src/test/java/org/apache/parquet/util/Concatenator.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.parquet.util;
+
+/**
+ * This is a class for testing DynMethods and DynConstructors.
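+ * <p>
+ * It deliberately mixes public, private, static, varargs, and
+ * exception-throwing members so that the tests can exercise each dynamic
+ * lookup path.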
+ */ +public class Concatenator { + public static class SomeCheckedException extends Exception { + } + + private String sep = ""; + + public Concatenator() { + } + + public Concatenator(String sep) { + this.sep = sep; + } + + private Concatenator(char sep) { + this.sep = String.valueOf(sep); + } + + public Concatenator(Exception e) throws Exception { + throw e; + } + + public static Concatenator newConcatenator(String sep) { + return new Concatenator(sep); + } + + private void setSeparator(String sep) { + this.sep = sep; + } + + public String concat(String left, String right) { + return left + sep + right; + } + + public String concat(String left, String middle, String right) { + return left + sep + middle + sep + right; + } + + public String concat(Exception e) throws Exception { + throw e; + } + + public String concat(String... strings) { + if (strings.length >= 1) { + StringBuilder sb = new StringBuilder(); + sb.append(strings[0]); + for (int i = 1; i < strings.length; i += 1) { + sb.append(sep); + sb.append(strings[i]); + } + return sb.toString(); + } + return null; + } + + public static String cat(String... strings) { + return new Concatenator().concat(strings); + } +} diff --git a/parquet-common/src/test/java/org/apache/parquet/util/TestDynConstructors.java b/parquet-common/src/test/java/org/apache/parquet/util/TestDynConstructors.java new file mode 100644 index 0000000000..1ab9582bce --- /dev/null +++ b/parquet-common/src/test/java/org/apache/parquet/util/TestDynConstructors.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.parquet.util; + +import org.apache.parquet.TestUtils; +import org.apache.parquet.util.Concatenator.SomeCheckedException; +import org.junit.Assert; +import org.junit.Test; +import java.util.concurrent.Callable; + +public class TestDynConstructors { + @Test + public void testNoImplCall() { + final DynConstructors.Builder builder = new DynConstructors.Builder(); + + TestUtils.assertThrows("Checked build should throw NoSuchMethodException", + NoSuchMethodException.class, new Callable() { + @Override + public Object call() throws NoSuchMethodException { + return builder.buildChecked(); + } + }); + + TestUtils.assertThrows("Normal build should throw RuntimeException", + RuntimeException.class, new Runnable() { + @Override + public void run() { + builder.build(); + } + }); + } + + @Test + public void testMissingClass() { + final DynConstructors.Builder builder = new DynConstructors.Builder() + .impl("not.a.RealClass"); + + TestUtils.assertThrows("Checked build should throw NoSuchMethodException", + NoSuchMethodException.class, new Callable() { + @Override + public Object call() throws NoSuchMethodException { + return builder.buildChecked(); + } + }); + + TestUtils.assertThrows("Normal build should throw RuntimeException", + RuntimeException.class, new Runnable() { + @Override + public void run() { + builder.build(); + } + }); + } + + @Test + public void testMissingConstructor() { + final DynConstructors.Builder builder = new DynConstructors.Builder() + .impl(Concatenator.class, String.class, String.class); + + TestUtils.assertThrows("Checked build should throw NoSuchMethodException", + NoSuchMethodException.class, new Callable() { + @Override + public Object call() throws NoSuchMethodException { + return builder.buildChecked(); + } + }); + + TestUtils.assertThrows("Normal build should throw RuntimeException", + RuntimeException.class, new Runnable() { + @Override + public void run() { + builder.build(); + } + }); + } + + @Test + public void testFirstImplReturned() throws Exception { + final DynConstructors.Ctor sepCtor = new DynConstructors.Builder() + .impl("not.a.RealClass", String.class) + .impl(Concatenator.class, String.class) + .impl(Concatenator.class) + .buildChecked(); + + Concatenator dashCat = sepCtor.newInstanceChecked("-"); + Assert.assertEquals("Should construct with the 1-arg version", + "a-b", dashCat.concat("a", "b")); + + TestUtils.assertThrows("Should complain about extra arguments", + IllegalArgumentException.class, new Callable() { + @Override + public Object call() throws Exception { + return sepCtor.newInstanceChecked("/", "-"); + } + }); + + TestUtils.assertThrows("Should complain about extra arguments", + IllegalArgumentException.class, new Callable() { + @Override + public Object call() throws Exception { + return sepCtor.newInstance("/", "-"); + } + }); + + DynConstructors.Ctor defaultCtor = new DynConstructors.Builder() + .impl("not.a.RealClass", String.class) + .impl(Concatenator.class) + .impl(Concatenator.class, String.class) + .buildChecked(); + + Concatenator cat = defaultCtor.newInstanceChecked(); + Assert.assertEquals("Should construct with the no-arg version", + "ab", cat.concat("a", "b")); + } + + @Test + public void testExceptionThrown() throws Exception { + final SomeCheckedException exc = new SomeCheckedException(); + final DynConstructors.Ctor sepCtor = new DynConstructors.Builder() + .impl("not.a.RealClass", String.class) + .impl(Concatenator.class, Exception.class) + .buildChecked(); + + TestUtils.assertThrows("Should re-throw 
the exception", + SomeCheckedException.class, new Callable() { + @Override + public Object call() throws Exception { + return sepCtor.newInstanceChecked(exc); + } + }); + + TestUtils.assertThrows("Should wrap the exception in RuntimeException", + RuntimeException.class, new Callable() { + @Override + public Object call() throws Exception { + return sepCtor.newInstance(exc); + } + }); + } + + @Test + public void testStringClassname() throws Exception { + final DynConstructors.Ctor sepCtor = new DynConstructors.Builder() + .impl(Concatenator.class.getName(), String.class) + .buildChecked(); + + Assert.assertNotNull("Should find 1-arg constructor", sepCtor.newInstance("-")); + } + + @Test + public void testHiddenMethod() throws Exception { + TestUtils.assertThrows("Should fail to find hidden method", + NoSuchMethodException.class, new Callable() { + @Override + public Object call() throws NoSuchMethodException { + return new DynMethods.Builder("setSeparator") + .impl(Concatenator.class, char.class) + .buildChecked(); + } + }); + + final DynConstructors.Ctor sepCtor = new DynConstructors.Builder() + .hiddenImpl(Concatenator.class.getName(), char.class) + .buildChecked(); + + Assert.assertNotNull("Should find hidden ctor with hiddenImpl", sepCtor); + + Concatenator slashCat = sepCtor.newInstanceChecked('/'); + + Assert.assertEquals("Should use separator /", + "a/b", slashCat.concat("a", "b")); + } + + @Test + public void testBind() throws Exception { + final DynConstructors.Ctor ctor = new DynConstructors.Builder() + .impl(Concatenator.class.getName()) + .buildChecked(); + + Assert.assertTrue("Should always be static", ctor.isStatic()); + + TestUtils.assertThrows("Should complain that method is static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return ctor.bind(null); + } + }); + } + + @Test + public void testInvoke() throws Exception { + final DynMethods.UnboundMethod ctor = new DynConstructors.Builder() + .impl(Concatenator.class.getName()) + .buildChecked(); + + TestUtils.assertThrows("Should complain that target must be null", + IllegalArgumentException.class, new Callable() { + @Override + public Object call() throws Exception { + return ctor.invokeChecked("a"); + } + }); + + TestUtils.assertThrows("Should complain that target must be null", + IllegalArgumentException.class, new Callable() { + @Override + public Object call() throws Exception { + return ctor.invoke("a"); + } + }); + + Assert.assertNotNull("Should allow invokeChecked(null, ...)", + ctor.invokeChecked(null)); + Assert.assertNotNull("Should allow invoke(null, ...)", + ctor.invoke(null)); + } +} diff --git a/parquet-common/src/test/java/org/apache/parquet/util/TestDynMethods.java b/parquet-common/src/test/java/org/apache/parquet/util/TestDynMethods.java new file mode 100644 index 0000000000..7017c6739f --- /dev/null +++ b/parquet-common/src/test/java/org/apache/parquet/util/TestDynMethods.java @@ -0,0 +1,410 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.parquet.util; + +import org.apache.parquet.TestUtils; +import org.apache.parquet.util.Concatenator.SomeCheckedException; +import org.junit.Assert; +import org.junit.Test; +import java.util.concurrent.Callable; + +public class TestDynMethods { + @Test + public void testNoImplCall() { + final DynMethods.Builder builder = new DynMethods.Builder("concat"); + + TestUtils.assertThrows("Checked build should throw NoSuchMethodException", + NoSuchMethodException.class, new Callable() { + @Override + public Object call() throws NoSuchMethodException { + return builder.buildChecked(); + } + }); + + TestUtils.assertThrows("Normal build should throw RuntimeException", + RuntimeException.class, new Runnable() { + @Override + public void run() { + builder.build(); + } + }); + } + + @Test + public void testMissingClass() { + final DynMethods.Builder builder = new DynMethods.Builder("concat") + .impl("not.a.RealClass", String.class, String.class); + + TestUtils.assertThrows("Checked build should throw NoSuchMethodException", + NoSuchMethodException.class, new Callable() { + @Override + public Object call() throws NoSuchMethodException { + return builder.buildChecked(); + } + }); + + TestUtils.assertThrows("Normal build should throw RuntimeException", + RuntimeException.class, new Runnable() { + @Override + public void run() { + builder.build(); + } + }); + } + + @Test + public void testMissingMethod() { + final DynMethods.Builder builder = new DynMethods.Builder("concat") + .impl(Concatenator.class, "cat2strings", String.class, String.class); + + TestUtils.assertThrows("Checked build should throw NoSuchMethodException", + NoSuchMethodException.class, new Callable() { + @Override + public Object call() throws NoSuchMethodException { + return builder.buildChecked(); + } + }); + + TestUtils.assertThrows("Normal build should throw RuntimeException", + RuntimeException.class, new Runnable() { + @Override + public void run() { + builder.build(); + } + }); + } + + @Test + public void testFirstImplReturned() throws Exception { + Concatenator obj = new Concatenator("-"); + DynMethods.UnboundMethod cat2 = new DynMethods.Builder("concat") + .impl("not.a.RealClass", String.class, String.class) + .impl(Concatenator.class, String.class, String.class) + .impl(Concatenator.class, String.class, String.class, String.class) + .buildChecked(); + + Assert.assertEquals("Should call the 2-arg version successfully", + "a-b", cat2.invoke(obj, "a", "b")); + + Assert.assertEquals("Should ignore extra arguments", + "a-b", cat2.invoke(obj, "a", "b", "c")); + + DynMethods.UnboundMethod cat3 = new DynMethods.Builder("concat") + .impl("not.a.RealClass", String.class, String.class) + .impl(Concatenator.class, String.class, String.class, String.class) + .impl(Concatenator.class, String.class, String.class) + .build(); + + Assert.assertEquals("Should call the 3-arg version successfully", + "a-b-c", cat3.invoke(obj, "a", "b", "c")); + + Assert.assertEquals("Should call the 3-arg version null padding", + "a-b-null", cat3.invoke(obj, "a", "b")); + } + + @Test + public void testVarArgs() throws 
Exception { + DynMethods.UnboundMethod cat = new DynMethods.Builder("concat") + .impl(Concatenator.class, String[].class) + .buildChecked(); + + Assert.assertEquals("Should use the varargs version", "abcde", + cat.invokeChecked( + new Concatenator(), + (Object) new String[] {"a", "b", "c", "d", "e"})); + + Assert.assertEquals("Should use the varargs version", "abcde", + cat.bind(new Concatenator()) + .invokeChecked((Object) new String[] {"a", "b", "c", "d", "e"})); + } + + @Test + public void testIncorrectArguments() throws Exception { + final Concatenator obj = new Concatenator("-"); + final DynMethods.UnboundMethod cat = new DynMethods.Builder("concat") + .impl("not.a.RealClass", String.class, String.class) + .impl(Concatenator.class, String.class, String.class) + .buildChecked(); + + TestUtils.assertThrows("Should fail if non-string arguments are passed", + IllegalArgumentException.class, new Callable() { + @Override + public Object call() throws Exception { + return cat.invoke(obj, 3, 4); + } + }); + + TestUtils.assertThrows("Should fail if non-string arguments are passed", + IllegalArgumentException.class, new Callable() { + @Override + public Object call() throws Exception { + return cat.invokeChecked(obj, 3, 4); + } + }); + } + + @Test + public void testExceptionThrown() throws Exception { + final SomeCheckedException exc = new SomeCheckedException(); + final Concatenator obj = new Concatenator("-"); + final DynMethods.UnboundMethod cat = new DynMethods.Builder("concat") + .impl("not.a.RealClass", String.class, String.class) + .impl(Concatenator.class, Exception.class) + .buildChecked(); + + TestUtils.assertThrows("Should re-throw the exception", + SomeCheckedException.class, new Callable() { + @Override + public Object call() throws Exception { + return cat.invokeChecked(obj, exc); + } + }); + + TestUtils.assertThrows("Should wrap the exception in RuntimeException", + RuntimeException.class, new Callable() { + @Override + public Object call() throws Exception { + return cat.invoke(obj, exc); + } + }); + } + + @Test + public void testNameChange() throws Exception { + Concatenator obj = new Concatenator("-"); + DynMethods.UnboundMethod cat = new DynMethods.Builder("cat") + .impl(Concatenator.class, "concat", String.class, String.class) + .buildChecked(); + + Assert.assertEquals("Should find 2-arg concat method", + "a-b", cat.invoke(obj, "a", "b")); + } + + @Test + public void testStringClassname() throws Exception { + Concatenator obj = new Concatenator("-"); + DynMethods.UnboundMethod cat = new DynMethods.Builder("concat") + .impl(Concatenator.class.getName(), String.class, String.class) + .buildChecked(); + + Assert.assertEquals("Should find 2-arg concat method", + "a-b", cat.invoke(obj, "a", "b")); + } + + @Test + public void testHiddenMethod() throws Exception { + Concatenator obj = new Concatenator("-"); + + TestUtils.assertThrows("Should fail to find hidden method", + NoSuchMethodException.class, new Callable() { + @Override + public Object call() throws NoSuchMethodException { + return new DynMethods.Builder("setSeparator") + .impl(Concatenator.class, String.class) + .buildChecked(); + } + }); + + DynMethods.UnboundMethod changeSep = new DynMethods.Builder("setSeparator") + .hiddenImpl(Concatenator.class, String.class) + .buildChecked(); + + Assert.assertNotNull("Should find hidden method with hiddenImpl", + changeSep); + + changeSep.invokeChecked(obj, "/"); + + Assert.assertEquals("Should use separator / instead of -", + "a/b", obj.concat("a", "b")); + } + + @Test + public 
void testBoundMethod() throws Exception { + DynMethods.UnboundMethod cat = new DynMethods.Builder("concat") + .impl(Concatenator.class, String.class, String.class) + .buildChecked(); + + // Unbound methods can be bound multiple times + DynMethods.BoundMethod dashCat = cat.bind(new Concatenator("-")); + DynMethods.BoundMethod underCat = cat.bind(new Concatenator("_")); + + Assert.assertEquals("Should use '-' object without passing", + "a-b", dashCat.invoke("a", "b")); + Assert.assertEquals("Should use '_' object without passing", + "a_b", underCat.invoke("a", "b")); + + DynMethods.BoundMethod slashCat = new DynMethods.Builder("concat") + .impl(Concatenator.class, String.class, String.class) + .buildChecked(new Concatenator("/")); + + Assert.assertEquals("Should use bound object from builder without passing", + "a/b", slashCat.invoke("a", "b")); + } + + @Test + public void testBindStaticMethod() throws Exception { + final DynMethods.Builder builder = new DynMethods.Builder("cat") + .impl(Concatenator.class, String[].class); + + TestUtils.assertThrows("Should complain that method is static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return builder.buildChecked(new Concatenator()); + } + }); + + TestUtils.assertThrows("Should complain that method is static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return builder.build(new Concatenator()); + } + }); + + final DynMethods.UnboundMethod staticCat = builder.buildChecked(); + Assert.assertTrue("Should be static", staticCat.isStatic()); + + TestUtils.assertThrows("Should complain that method is static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return staticCat.bind(new Concatenator()); + } + }); + } + + @Test + public void testStaticMethod() throws Exception { + DynMethods.StaticMethod staticCat = new DynMethods.Builder("cat") + .impl(Concatenator.class, String[].class) + .buildStaticChecked(); + + Assert.assertEquals("Should call varargs static method cat(String...)", + "abcde", staticCat.invokeChecked( + (Object) new String[] {"a", "b", "c", "d", "e"})); + } + + @Test + public void testNonStaticMethod() throws Exception { + final DynMethods.Builder builder = new DynMethods.Builder("concat") + .impl(Concatenator.class, String.class, String.class); + + TestUtils.assertThrows("Should complain that method is not static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return builder.buildStatic(); + } + }); + + TestUtils.assertThrows("Should complain that method is not static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return builder.buildStaticChecked(); + } + }); + + final DynMethods.UnboundMethod cat2 = builder.buildChecked(); + Assert.assertFalse("concat(String,String) should not be static", + cat2.isStatic()); + + TestUtils.assertThrows("Should complain that method is not static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return cat2.asStatic(); + } + }); + } + + @Test + public void testConstructorImpl() throws Exception { + final DynMethods.Builder builder = new DynMethods.Builder("newConcatenator") + .ctorImpl(Concatenator.class, String.class) + .impl(Concatenator.class, String.class); + + DynMethods.UnboundMethod newConcatenator = builder.buildChecked(); + Assert.assertTrue("Should 
find constructor implementation", + newConcatenator instanceof DynConstructors.Ctor); + Assert.assertTrue("Constructor should be a static method", + newConcatenator.isStatic()); + Assert.assertFalse("Constructor should not be NOOP", + newConcatenator.isNoop()); + + // constructors cannot be bound + TestUtils.assertThrows("Should complain that ctor method is static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return builder.buildChecked(new Concatenator()); + } + }); + TestUtils.assertThrows("Should complain that ctor method is static", + IllegalStateException.class, new Callable() { + @Override + public Object call() throws Exception { + return builder.build(new Concatenator()); + } + }); + + Concatenator concatenator = newConcatenator.asStatic().invoke("*"); + Assert.assertEquals("Should function as a concatenator", + "a*b", concatenator.concat("a", "b")); + + concatenator = newConcatenator.asStatic().invokeChecked("@"); + Assert.assertEquals("Should function as a concatenator", + "a@b", concatenator.concat("a", "b")); + } + + @Test + public void testConstructorImplAfterFactoryMethod() throws Exception { + DynMethods.UnboundMethod newConcatenator = new DynMethods.Builder("newConcatenator") + .impl(Concatenator.class, String.class) + .ctorImpl(Concatenator.class, String.class) + .buildChecked(); + + Assert.assertFalse("Should find factory method before constructor method", + newConcatenator instanceof DynConstructors.Ctor); + } + + @Test + public void testNoop() throws Exception { + // noop can be unbound, bound, or static + DynMethods.UnboundMethod noop = new DynMethods.Builder("concat") + .impl("not.a.RealClass", String.class, String.class) + .orNoop() + .buildChecked(); + + Assert.assertTrue("No implementation found, should return NOOP", + noop.isNoop()); + Assert.assertNull("NOOP should always return null", + noop.invoke(new Concatenator(), "a")); + Assert.assertNull("NOOP can be called with null", + noop.invoke(null, "a")); + Assert.assertNull("NOOP can be bound", + noop.bind(new Concatenator()).invoke("a")); + Assert.assertNull("NOOP can be bound to null", + noop.bind(null).invoke("a")); + Assert.assertNull("NOOP can be static", + noop.asStatic().invoke("a")); + } +} diff --git a/pom.xml b/pom.xml index df4bbd3cb0..0d4df8aecb 100644 --- a/pom.xml +++ b/pom.xml @@ -89,6 +89,12 @@ 1.8.1 20.0 1.10.19 + + + 2.3 + 2.3.1 + 1.35 + 1.10 @@ -97,6 +103,7 @@ parquet-benchmarks parquet-cascading parquet-cascading3 + parquet-cli parquet-column parquet-common parquet-encoding