BUG: concat losing column dtypes for join=outer

pandas-dev · jreback · Jul 3, 2022 · Jul 2, 2022 · Jul 2, 2022 · Jul 2, 2022
commit 75d4fda70f9c47ce7e07a75dc7cb424b51e8ffb1
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -995,6 +995,7 @@ Reshaping
 - Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`)
 - Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`)
 - Bug in concatenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`)
+- Bug in :func:`concat` losing dtype of columns when ``join="outer"`` and ``sort=True`` (:issue:`47329`)
 - Bug in :func:`concat` not sorting the column names when ``None`` is included (:issue:`47331`)
 - Bug in :func:`concat` with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`)
 - Bug in :meth:`DataFrame.join` with a list when using suffixes to join DataFrames with duplicate column names (:issue:`46396`)

diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -11,6 +11,7 @@
 )
 from pandas.errors import InvalidIndexError
 
+from pandas.core.dtypes.cast import find_common_type
 from pandas.core.dtypes.common import is_dtype_equal
 
 from pandas.core.algorithms import safe_sort
@@ -223,7 +224,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
 
     indexes, kind = _sanitize_and_check(indexes)
 
-    def _unique_indices(inds) -> Index:
+    def _unique_indices(inds, dtype) -> Index:
         """
         Convert indexes to lists and concatenate them, removing duplicates.
 
@@ -243,7 +244,10 @@ def conv(i):
                 i = i.tolist()
             return i
 
-        return Index(lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
+        return Index(
+            lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort),
+            dtype=dtype,
+        )
 
     if kind == "special":
         result = indexes[0]
@@ -283,16 +287,22 @@ def conv(i):
         return result
 
     elif kind == "array":
+        dtype = find_common_type(
+            [idx.dtype for idx in indexes if isinstance(idx, Index)]
+        )
         index = indexes[0]
         if not all(index.equals(other) for other in indexes[1:]):
-            index = _unique_indices(indexes)
+            index = _unique_indices(indexes, dtype)
 
         name = get_unanimous_names(*indexes)[0]
         if name != index.name:
             index = index.rename(name)
         return index
     else:  # kind='list'
-        return _unique_indices(indexes)
+        dtype = find_common_type(
+            [idx.dtype for idx in indexes if isinstance(idx, Index)]
+        )
+        return _unique_indices(indexes, dtype)
 
 
 def _sanitize_and_check(indexes):

diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py
@@ -398,3 +398,14 @@ def test_concat_range_index_result(self):
         tm.assert_frame_equal(result, expected)
         expected_index = pd.RangeIndex(0, 2)
         tm.assert_index_equal(result.index, expected_index, exact=True)
+
+    @pytest.mark.parametrize("dtype", ["Int64", "object"])
+    def test_concat_index_keep_dtype(self, dtype):
+        # GH#47329
+        df1 = DataFrame([[0, 1, 1]], columns=Index([1, 2, 3], dtype=dtype))
+        df2 = DataFrame([[0, 1]], columns=Index([1, 2], dtype=dtype))
+        result = concat([df1, df2], ignore_index=True, join="outer", sort=True)
+        expected = DataFrame(
+            [[0, 1, 1.0], [0, 1, np.nan]], columns=Index([1, 2, 3], dtype=dtype)
+        )
+        tm.assert_frame_equal(result, expected)