feat(third_party/overlays): support LargeListArray in ClickHouse
Link: https://github.com/ClickHouse/ClickHouse/pull/56118
Change-Id: I41339ce662b8a169746237eb1d0aad34453bc0a8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9986
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
This commit is contained in:
		
							parent
							
								
									edea6daddd
								
							
						
					
					
						commit
						14849829fd
					
				
					 2 changed files with 114 additions and 0 deletions
				
			
		
							
								
								
									
										107
									
								
								third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										107
									
								
								third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch
									
										
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,107 @@ | |||
| From 26e65e4addc990cc09b59b587792ac4a454e5cdd Mon Sep 17 00:00:00 2001 | ||||
| From: edef <edef@edef.eu> | ||||
| Date: Mon, 30 Oct 2023 08:08:10 +0000 | ||||
| Subject: [PATCH] [backport] Support reading arrow::LargeListArray | ||||
| 
 | ||||
| ---
 | ||||
|  .../Formats/Impl/ArrowColumnToCHColumn.cpp    | 35 ++++++++++++++----- | ||||
|  1 file changed, 26 insertions(+), 9 deletions(-) | ||||
| 
 | ||||
| diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
 | ||||
| index 54a6c8493ea..94cf59fd357 100644
 | ||||
| --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
 | ||||
| +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
 | ||||
| @@ -336,7 +336,22 @@ static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray>
 | ||||
|      return nullmap_column; | ||||
|  } | ||||
|   | ||||
| -static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
 | ||||
| +template<typename T>
 | ||||
| +struct ArrowOffsetArray;
 | ||||
| +
 | ||||
| +template<>
 | ||||
| +struct ArrowOffsetArray<arrow::ListArray>
 | ||||
| +{
 | ||||
| +    using type = arrow::Int32Array;
 | ||||
| +};
 | ||||
| +
 | ||||
| +template<>
 | ||||
| +struct ArrowOffsetArray<arrow::LargeListArray>
 | ||||
| +{
 | ||||
| +    using type = arrow::Int64Array;
 | ||||
| +};
 | ||||
| +
 | ||||
| +template<typename ArrowListArray> static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
 | ||||
|  { | ||||
|      auto offsets_column = ColumnUInt64::create(); | ||||
|      ColumnArray::Offsets & offsets_data = assert_cast<ColumnVector<UInt64> &>(*offsets_column).getData(); | ||||
| @@ -346,9 +361,9 @@ static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedAr
 | ||||
|   | ||||
|      for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i) | ||||
|      { | ||||
| -        arrow::ListArray & list_chunk = dynamic_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
 | ||||
| +        ArrowListArray & list_chunk = dynamic_cast<ArrowListArray &>(*(arrow_column->chunk(chunk_i)));
 | ||||
|          auto arrow_offsets_array = list_chunk.offsets(); | ||||
| -        auto & arrow_offsets = dynamic_cast<arrow::Int32Array &>(*arrow_offsets_array);
 | ||||
| +        auto & arrow_offsets = dynamic_cast<typename ArrowOffsetArray<ArrowListArray>::type &>(*arrow_offsets_array);
 | ||||
|   | ||||
|          /* | ||||
|           * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks. | ||||
| @@ -498,13 +513,13 @@ static ColumnPtr readColumnWithIndexesData(std::shared_ptr<arrow::ChunkedArray>
 | ||||
|      } | ||||
|  } | ||||
|   | ||||
| -static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
 | ||||
| +template<typename ArrowListArray> static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
 | ||||
|  { | ||||
|      arrow::ArrayVector array_vector; | ||||
|      array_vector.reserve(arrow_column->num_chunks()); | ||||
|      for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i) | ||||
|      { | ||||
| -        arrow::ListArray & list_chunk = dynamic_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
 | ||||
| +        ArrowListArray & list_chunk = dynamic_cast<ArrowListArray &>(*(arrow_column->chunk(chunk_i)));
 | ||||
|   | ||||
|          /* | ||||
|           * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks. | ||||
| @@ -636,12 +651,12 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
 | ||||
|                  if (map_type_hint) | ||||
|                      nested_type_hint = assert_cast<const DataTypeArray *>(map_type_hint->getNestedType().get())->getNestedType(); | ||||
|              } | ||||
| -            auto arrow_nested_column = getNestedArrowColumn(arrow_column);
 | ||||
| +            auto arrow_nested_column = getNestedArrowColumn<arrow::ListArray>(arrow_column);
 | ||||
|              auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint); | ||||
|              if (skipped) | ||||
|                  return {}; | ||||
|   | ||||
| -            auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
 | ||||
| +            auto offsets_column = readOffsetsFromArrowListColumn<arrow::ListArray>(arrow_column);
 | ||||
|   | ||||
|              const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get()); | ||||
|              const auto * tuple_type = assert_cast<const DataTypeTuple *>(nested_column.type.get()); | ||||
| @@ -650,7 +665,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
 | ||||
|              return {std::move(map_column), std::move(map_type), column_name}; | ||||
|          } | ||||
|          case arrow::Type::LIST: | ||||
| +        case arrow::Type::LARGE_LIST:
 | ||||
|          { | ||||
| +            bool is_large = arrow_column->type()->id() == arrow::Type::LARGE_LIST;
 | ||||
|              DataTypePtr nested_type_hint; | ||||
|              if (type_hint) | ||||
|              { | ||||
| @@ -658,11 +675,11 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
 | ||||
|                  if (array_type_hint) | ||||
|                      nested_type_hint = array_type_hint->getNestedType(); | ||||
|              } | ||||
| -            auto arrow_nested_column = getNestedArrowColumn(arrow_column);
 | ||||
| +            auto arrow_nested_column = is_large ? getNestedArrowColumn<arrow::LargeListArray>(arrow_column) : getNestedArrowColumn<arrow::ListArray>(arrow_column);
 | ||||
|              auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint); | ||||
|              if (skipped) | ||||
|                  return {}; | ||||
| -            auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
 | ||||
| +            auto offsets_column = is_large ? readOffsetsFromArrowListColumn<arrow::LargeListArray>(arrow_column) : readOffsetsFromArrowListColumn<arrow::ListArray>(arrow_column);
 | ||||
|              auto array_column = ColumnArray::create(nested_column.column, offsets_column); | ||||
|              auto array_type = std::make_shared<DataTypeArray>(nested_column.type); | ||||
|              return {std::move(array_column), std::move(array_type), column_name}; | ||||
| -- 
 | ||||
| 2.42.0 | ||||
| 
 | ||||
							
								
								
									
										7
									
								
								third_party/overlays/tvl.nix
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										7
									
								
								third_party/overlays/tvl.nix
									
										
									
									
										vendored
									
									
								
							|  | @ -147,4 +147,11 @@ depot.nix.readTree.drvTargets { | |||
|       license = licenses.asl20; | ||||
|     }; | ||||
|   }; | ||||
| 
 | ||||
|   clickhouse = super.clickhouse.overrideAttrs (old: { | ||||
|     patches = old.patches or [ ] ++ [ | ||||
|       # https://github.com/ClickHouse/ClickHouse/pull/56118 | ||||
|       ./patches/clickhouse-support-reading-arrow-LargeListArray.patch | ||||
|     ]; | ||||
|   }); | ||||
| } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue