Skip to content

Commit 639201b

Browse files
authored
GH-44500: [Python][Parquet] Map Parquet logical types to Arrow extension types by default (#46772)
### Rationale for this change

The Parquet C++ implementation now supports reading four logical types (JSON, UUID, Geometry, Geography) as Arrow extension types; however, users have to opt in to avoid losing the logical type on read.

### What changes are included in this PR?

This PR sets the default value of `arrow_extensions_enabled` to `True` (in Python).

### Are these changes tested?

Yes, the behaviour of `arrow_extensions_enabled` was already tested (and tests were updated to reflect the new default value).

### Are there any user-facing changes?

**This PR includes breaking changes to public APIs.**

Reading Parquet files that contain a JSON or UUID logical type will now produce an extension type rather than string or fixed-size binary, respectively. Python users that were relying on the previous behaviour would have to explicitly cast to storage or use `read_table(..., arrow_extensions_enabled=False)` after this PR:

```python
import uuid
import pyarrow as pa

json_array = pa.array(['{"k": "v"}'], pa.json_())
json_array.cast(pa.string())
#> [
#>   "{"k": "v"}"
#> ]

uuid_array = pa.array([uuid.uuid4().bytes], pa.uuid())
uuid_array.cast(pa.binary(16))
#> <pyarrow.lib.FixedSizeBinaryArray object at 0x11e42b1c0>
#> [
#>   746C1022AB434A97972E1707EC3EE8F4
#> ]
```

* GitHub Issue: #44500

Authored-by: Dewey Dunnington <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
1 parent 7cf158a commit 639201b

File tree

2 files changed

+11
-8
lines changed

2 files changed

+11
-8
lines changed

python/pyarrow/parquet/core.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ class ParquetFile:
261261
it will be parsed as an URI to determine the filesystem.
262262
page_checksum_verification : bool, default False
263263
If True, verify the checksum for each page read from the file.
264-
arrow_extensions_enabled : bool, default False
264+
arrow_extensions_enabled : bool, default True
265265
If True, read Parquet logical types as Arrow extension types where possible,
266266
(e.g., read JSON as the canonical `arrow.json` extension type or UUID as
267267
the canonical `arrow.uuid` extension type).
@@ -314,7 +314,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None,
314314
coerce_int96_timestamp_unit=None,
315315
decryption_properties=None, thrift_string_size_limit=None,
316316
thrift_container_size_limit=None, filesystem=None,
317-
page_checksum_verification=False, arrow_extensions_enabled=False):
317+
page_checksum_verification=False, arrow_extensions_enabled=True):
318318

319319
self._close_source = getattr(source, 'closed', True)
320320

@@ -1321,7 +1321,7 @@ class ParquetDataset:
13211321
sufficient for most Parquet files.
13221322
page_checksum_verification : bool, default False
13231323
If True, verify the page checksum for each page read from the file.
1324-
arrow_extensions_enabled : bool, default False
1324+
arrow_extensions_enabled : bool, default True
13251325
If True, read Parquet logical types as Arrow extension types where possible,
13261326
(e.g., read JSON as the canonical `arrow.json` extension type or UUID as
13271327
the canonical `arrow.uuid` extension type).
@@ -1339,7 +1339,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None,
13391339
decryption_properties=None, thrift_string_size_limit=None,
13401340
thrift_container_size_limit=None,
13411341
page_checksum_verification=False,
1342-
arrow_extensions_enabled=False):
1342+
arrow_extensions_enabled=True):
13431343
import pyarrow.dataset as ds
13441344

13451345
# map format arguments
@@ -1739,7 +1739,7 @@ def partitioning(self):
17391739
sufficient for most Parquet files.
17401740
page_checksum_verification : bool, default False
17411741
If True, verify the checksum for each page read from the file.
1742-
arrow_extensions_enabled : bool, default False
1742+
arrow_extensions_enabled : bool, default True
17431743
If True, read Parquet logical types as Arrow extension types where possible,
17441744
(e.g., read JSON as the canonical `arrow.json` extension type or UUID as
17451745
the canonical `arrow.uuid` extension type).
@@ -1839,7 +1839,7 @@ def read_table(source, *, columns=None, use_threads=True,
18391839
decryption_properties=None, thrift_string_size_limit=None,
18401840
thrift_container_size_limit=None,
18411841
page_checksum_verification=False,
1842-
arrow_extensions_enabled=False):
1842+
arrow_extensions_enabled=True):
18431843

18441844
try:
18451845
dataset = ParquetDataset(

python/pyarrow/tests/parquet/test_data_types.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -569,14 +569,15 @@ def test_json_extension_type(storage_type):
569569
_check_roundtrip(
570570
table,
571571
pa.table({"ext": pa.array(data, pa.string())}),
572+
{"arrow_extensions_enabled": False},
572573
store_schema=False)
573574

574575
# With arrow_extensions_enabled=True on read, we get an arrow.json back
575576
# (but with string() storage)
576577
_check_roundtrip(
577578
table,
578579
pa.table({"ext": pa.array(data, pa.json_(pa.string()))}),
579-
read_table_kwargs={"arrow_extensions_enabled": True},
580+
{"arrow_extensions_enabled": True},
580581
store_schema=False)
581582

582583

@@ -594,11 +595,13 @@ def test_uuid_extension_type():
594595
_check_roundtrip(
595596
table,
596597
pa.table({"ext": pa.array(data, pa.binary(16))}),
598+
{"arrow_extensions_enabled": False},
597599
store_schema=False)
598600
_check_roundtrip(
599601
table,
600602
table,
601-
{"arrow_extensions_enabled": True}, store_schema=False)
603+
{"arrow_extensions_enabled": True},
604+
store_schema=False)
602605

603606

604607
def test_undefined_logical_type(parquet_test_datadir):

0 commit comments

Comments
 (0)