import random
import pyorc
data = {
"a": [1.0, 2.0, None, 4.0, 5.0],
"b": [True, False, None, True, False],
"str_direct": ["a", "cccccc", None, "ddd", "ee"],
"d": ["a", "bb", None, "ccc", "ddd"],
"e": ["ddd", "cc", None, "bb", "a"],
"f": ["aaaaa", "bbbbb", None, "ccccc", "ddddd"],
"int_short_repeated": [5, 5, None, 5, 5],
"int_neg_short_repeated": [-5, -5, None, -5, -5],
"int_delta": [1, 2, None, 4, 5],
"int_neg_delta": [5, 4, None, 2, 1],
"int_direct": [1, 6, None, 3, 2],
"int_neg_direct": [-1, -6, None, -3, -2],
"bigint_direct": [1, 6, None, 3, 2],
"bigint_neg_direct": [-1, -6, None, -3, -2],
"bigint_other": [5, -5, 1, 5, 5],
"utf8_increase": ["a", "bb", "ccc", "dddd", "eeeee"],
"utf8_decrease": ["eeeee", "dddd", "ccc", "bb", "a"],
}
def infer_schema(data):
schema = "struct<"
for key, value in data.items():
dt = type(value[0])
if dt == float:
dt = "float"
elif dt == int:
dt = "int"
elif dt == bool:
dt = "boolean"
elif dt == str:
dt = "string"
else:
raise NotImplementedError
if key.startswith("double"):
dt = "double"
if key.startswith("bigint"):
dt = "bigint"
schema += key + ":" + dt + ","
schema = schema[:-1] + ">"
return schema
def _write(
schema: str,
data,
file_name: str,
compression=pyorc.CompressionKind.NONE,
dict_key_size_threshold=0.0,
):
output = open(file_name, "wb")
writer = pyorc.Writer(
output,
schema,
dict_key_size_threshold=dict_key_size_threshold,
compression_block_size=32,
compression=compression,
)
num_rows = len(list(data.values())[0])
for x in range(num_rows):
row = tuple(values[x] for values in data.values())
writer.write(row)
writer.close()
with open(file_name, "rb") as f:
reader = pyorc.Reader(f)
list(reader)
_write(
infer_schema(data),
data,
"test.orc",
)
data_boolean = {
"long": [True] * 32,
}
_write("struct<long:boolean>", data_boolean, "long_bool.orc")
_write("struct<long:boolean>", data_boolean, "long_bool_gzip.orc", pyorc.CompressionKind.ZLIB)
data_dict = {
"dict": ["abcd", "efgh"] * 32,
}
_write("struct<dict:string>", data_dict, "string_long.orc")
data_dict = {
"dict": ["abc", "efgh"] * 32,
}
_write("struct<dict:string>", data_dict, "string_dict.orc", dict_key_size_threshold=0.1)
_write("struct<dict:string>", data_dict, "string_dict_gzip.orc", pyorc.CompressionKind.ZLIB)
data_dict = {
"dict": ["abcd", "efgh"] * (10**4 // 2),
}
_write("struct<dict:string>", data_dict, "string_long_long.orc")
_write("struct<dict:string>", data_dict, "string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)
long_f32 = {
"dict": [random.uniform(0, 1) for _ in range(10**6)],
}
_write("struct<dict:float>", long_f32, "f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)