From 9af52fdd4ded2ac823912ff02946ba5c341f3e79 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 5 Jun 2024 12:43:31 +0200
Subject: [PATCH] GH-41978: [Python] Fix pandas tests to follow downstream
 datetime64 unit changes

---
 python/pyarrow/pandas_compat.py                     | 11 ++++++++---
 python/pyarrow/tests/interchange/test_conversion.py |  6 ++++--
 python/pyarrow/tests/parquet/test_datetime.py       |  1 +
 python/pyarrow/tests/test_pandas.py                 |  6 ++++--
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 00fa19604e5c3..f34ea8158ef4e 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -108,9 +108,10 @@ def get_logical_type_from_numpy(pandas_collection):
     except KeyError:
         if hasattr(pandas_collection.dtype, 'tz'):
             return 'datetimetz'
-        # See https://github.com/pandas-dev/pandas/issues/24739
-        if str(pandas_collection.dtype) == 'datetime64[ns]':
-            return 'datetime64[ns]'
+        # See https://github.com/pandas-dev/pandas/issues/24739 (infer_dtype will
+        # result in "datetime64" without unit, while pandas astype requires a unit)
+        if str(pandas_collection.dtype).startswith('datetime64'):
+            return str(pandas_collection.dtype)
         result = _pandas_api.infer_dtype(pandas_collection)
         if result == 'string':
             return 'unicode'
@@ -1105,6 +1106,10 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
             tz = pa.lib.string_to_tzinfo(
                 column_indexes[0]['metadata']['timezone'])
             level = pd.to_datetime(level, utc=True).tz_convert(tz)
+            if _pandas_api.is_ge_v3():
+                # with pandas 3+, to_datetime returns a unit depending on the string
+                # data, so we restore it to the original unit from the metadata
+                level = level.as_unit(np.datetime_data(dtype)[0])
         elif level.dtype != dtype:
             level = level.astype(dtype)
         # ARROW-9096: if original DataFrame was upcast we keep that
diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py
index b1e0fa0d1c651..6d91bad57cef4 100644
--- a/python/pyarrow/tests/interchange/test_conversion.py
+++ b/python/pyarrow/tests/interchange/test_conversion.py
@@ -335,8 +335,10 @@ def test_pandas_to_pyarrow_with_missing(np_float):
     np_array = np.array([0, np.nan, 2], dtype=np_float)
     datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)]
     df = pd.DataFrame({
-        "a": np_array,   # float, ColumnNullType.USE_NAN
-        "dt": datetime_array  # ColumnNullType.USE_SENTINEL
+        # float, ColumnNullType.USE_NAN
+        "a": np_array,
+        # ColumnNullType.USE_SENTINEL
+        "dt": np.array(datetime_array, dtype="datetime64[ns]")
     })
     expected = pa.table({
         "a": pa.array(np_array, from_pandas=True),
diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py
index 0896eb37e6473..08fb1098322be 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -331,6 +331,7 @@ def get_table(pq_reader_method, filename, **kwargs):
         pq_reader_method, filename, coerce_int96_timestamp_unit="s"
     )
     df_correct = tab_correct.to_pandas(timestamp_as_object=True)
+    df["a"] = df["a"].astype(object)
     tm.assert_frame_equal(df, df_correct)
 
 
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 3678b4e57a9a8..bf79670be411c 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -4730,12 +4730,12 @@ def make_df_with_timestamps():
     # Some of the milliseconds timestamps deliberately don't fit in the range
     # that is possible with nanosecond timestamps.
     df = pd.DataFrame({
-        'dateTimeMs': [
+        'dateTimeMs': np.array([
             np.datetime64('0001-01-01 00:00', 'ms'),
             np.datetime64('2012-05-02 12:35', 'ms'),
             np.datetime64('2012-05-03 15:42', 'ms'),
             np.datetime64('3000-05-03 15:42', 'ms'),
-        ],
+        ], dtype=object),
         'dateTimeNs': [
             np.datetime64('1991-01-01 00:00', 'ns'),
             np.datetime64('2012-05-02 12:35', 'ns'),
@@ -4743,8 +4743,10 @@ def make_df_with_timestamps():
             np.datetime64('2050-05-03 15:42', 'ns'),
         ],
     })
+    df['dateTimeMs'] = df['dateTimeMs'].astype('object')
     # Not part of what we're testing, just ensuring that the inputs are what we
     # expect.
+    # dateTimeMs is forced to object above, whatever pandas would infer.
     assert (df.dateTimeMs.dtype, df.dateTimeNs.dtype) == (
         # O == object, M8[ns] == timestamp64[ns]
         np.dtype("O"), np.dtype("M8[ns]")