public class OrcFileFormat extends Object implements org.apache.spark.sql.execution.datasources.FileFormat, DataSourceRegister, scala.Serializable
FileFormat for reading ORC files. If this is moved or renamed, please update DataSource's backwardCompatibilityMap.

Constructor and Description |
---|
OrcFileFormat() |
Modifier and Type | Method and Description |
---|---|
scala.Function1<org.apache.spark.sql.execution.datasources.PartitionedFile,scala.collection.Iterator<org.apache.spark.sql.catalyst.InternalRow>> | buildReader(SparkSession sparkSession, StructType dataSchema, StructType partitionSchema, StructType requiredSchema, scala.collection.Seq<Filter> filters, scala.collection.immutable.Map<String,String> options, org.apache.hadoop.conf.Configuration hadoopConf) |
static scala.Function1<org.apache.spark.sql.execution.datasources.PartitionedFile,scala.collection.Iterator<org.apache.spark.sql.catalyst.InternalRow>> | buildReaderWithPartitionValues(SparkSession sparkSession, StructType dataSchema, StructType partitionSchema, StructType requiredSchema, scala.collection.Seq<Filter> filters, scala.collection.immutable.Map<String,String> options, org.apache.hadoop.conf.Configuration hadoopConf) |
static scala.collection.immutable.Map<String,String> | extensionsForCompressionCodecNames() |
scala.Option<StructType> | inferSchema(SparkSession sparkSession, scala.collection.immutable.Map<String,String> options, scala.collection.Seq<org.apache.hadoop.fs.FileStatus> files) |
boolean | isSplitable(SparkSession sparkSession, scala.collection.immutable.Map<String,String> options, org.apache.hadoop.fs.Path path) |
org.apache.spark.sql.execution.datasources.OutputWriterFactory | prepareWrite(SparkSession sparkSession, org.apache.hadoop.mapreduce.Job job, scala.collection.immutable.Map<String,String> options, StructType dataSchema) |
static void | setRequiredColumns(org.apache.hadoop.conf.Configuration conf, StructType dataSchema, StructType requestedSchema) |
String | shortName() The string that represents the format that this data source provider uses. |
static boolean | supportBatch(SparkSession sparkSession, StructType dataSchema) |
String | toString() |
static scala.collection.Iterator<org.apache.spark.sql.catalyst.InternalRow> | unwrapOrcStructs(org.apache.hadoop.conf.Configuration conf, StructType dataSchema, StructType requiredSchema, scala.Option<org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector> maybeStructOI, scala.collection.Iterator<org.apache.hadoop.io.Writable> iterator) |
static scala.Option<scala.collection.Seq<String>> | vectorTypes(StructType requiredSchema, StructType partitionSchema, org.apache.spark.sql.internal.SQLConf sqlConf) |
public static scala.collection.immutable.Map<String,String> extensionsForCompressionCodecNames()
public static scala.collection.Iterator<org.apache.spark.sql.catalyst.InternalRow> unwrapOrcStructs(org.apache.hadoop.conf.Configuration conf, StructType dataSchema, StructType requiredSchema, scala.Option<org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector> maybeStructOI, scala.collection.Iterator<org.apache.hadoop.io.Writable> iterator)
public static void setRequiredColumns(org.apache.hadoop.conf.Configuration conf, StructType dataSchema, StructType requestedSchema)
public static boolean supportBatch(SparkSession sparkSession, StructType dataSchema)
public static scala.Option<scala.collection.Seq<String>> vectorTypes(StructType requiredSchema, StructType partitionSchema, org.apache.spark.sql.internal.SQLConf sqlConf)
public static scala.Function1<org.apache.spark.sql.execution.datasources.PartitionedFile,scala.collection.Iterator<org.apache.spark.sql.catalyst.InternalRow>> buildReaderWithPartitionValues(SparkSession sparkSession, StructType dataSchema, StructType partitionSchema, StructType requiredSchema, scala.collection.Seq<Filter> filters, scala.collection.immutable.Map<String,String> options, org.apache.hadoop.conf.Configuration hadoopConf)
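public String shortName()
Description copied from interface: DataSourceRegister
The string that represents the format that this data source provider uses. The interface documentation illustrates an override with a generic example (the alias shown is the interface's sample, not a value specific to this class):
override def shortName(): String = "parquet"
Specified by: shortName in interface DataSourceRegister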
public String toString()
Overrides: toString in class Object
public scala.Option<StructType> inferSchema(SparkSession sparkSession, scala.collection.immutable.Map<String,String> options, scala.collection.Seq<org.apache.hadoop.fs.FileStatus> files)
Specified by: inferSchema in interface org.apache.spark.sql.execution.datasources.FileFormat
public org.apache.spark.sql.execution.datasources.OutputWriterFactory prepareWrite(SparkSession sparkSession, org.apache.hadoop.mapreduce.Job job, scala.collection.immutable.Map<String,String> options, StructType dataSchema)
Specified by: prepareWrite in interface org.apache.spark.sql.execution.datasources.FileFormat
public boolean isSplitable(SparkSession sparkSession, scala.collection.immutable.Map<String,String> options, org.apache.hadoop.fs.Path path)
Specified by: isSplitable in interface org.apache.spark.sql.execution.datasources.FileFormat
public scala.Function1<org.apache.spark.sql.execution.datasources.PartitionedFile,scala.collection.Iterator<org.apache.spark.sql.catalyst.InternalRow>> buildReader(SparkSession sparkSession, StructType dataSchema, StructType partitionSchema, StructType requiredSchema, scala.collection.Seq<Filter> filters, scala.collection.immutable.Map<String,String> options, org.apache.hadoop.conf.Configuration hadoopConf)
Specified by: buildReader in interface org.apache.spark.sql.execution.datasources.FileFormat