Merge remote-tracking branch 'upstream/master'
317brian committed Aug 27, 2024
2 parents 0e10af5 + e562dd3 commit 3e5215b
Showing 352 changed files with 9,117 additions and 3,075 deletions.
@@ -39,9 +39,11 @@
import org.apache.druid.segment.Cursor;
import org.apache.druid.segment.CursorBuildSpec;
import org.apache.druid.segment.CursorHolder;
import org.apache.druid.segment.Cursors;
import org.apache.druid.segment.DimensionSelector;
import org.apache.druid.segment.QueryableIndex;
import org.apache.druid.segment.QueryableIndexStorageAdapter;
import org.apache.druid.segment.QueryableIndexTimeBoundaryInspector;
import org.apache.druid.segment.StorageAdapter;
import org.apache.druid.segment.VirtualColumns;
import org.apache.druid.segment.column.ColumnHolder;
@@ -192,11 +194,11 @@ public void timeFloorUsingCursor(Blackhole blackhole)
try (final CursorHolder cursorHolder = adapter.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
final Cursor cursor = cursorHolder.asCursor();
final CursorGranularizer granularizer = CursorGranularizer.create(
adapter,
cursor,
QueryableIndexTimeBoundaryInspector.create(index),
Cursors.getTimeOrdering(index.getOrdering()),
Granularities.HOUR,
adapter.getInterval(),
false
adapter.getInterval()
);
final Sequence<Long> results =
Sequences.simple(granularizer.getBucketIterable())
15 changes: 10 additions & 5 deletions docs/api-reference/sql-ingestion-api.md
@@ -157,7 +157,7 @@ headers = {
'Content-Type': 'application/json'
}

response = requests.post(url, headers=headers, data=payload, auth=('USER', 'PASSWORD'))
response = requests.post(url, headers=headers, data=payload)

print(response.text)
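
As a rough, self-contained sketch of how such a submission might be assembled (the endpoint path, SQL text, and context values here are illustrative assumptions, not taken from the file above):

```python
import json
import requests

# Hypothetical values for illustration only.
url = "http://ROUTER_IP:ROUTER_PORT/druid/v2/sql/task"

payload = json.dumps({
    "query": "INSERT INTO wikipedia SELECT * FROM TABLE(EXTERN(...)) PARTITIONED BY DAY",
    "context": {"maxNumTasks": 3}
})
headers = {
    'Content-Type': 'application/json'
}

response = requests.post(url, headers=headers, data=payload)
print(response.text)
```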

@@ -254,8 +254,9 @@ url = "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/task/query-3dc0c45d-34d7-4b
payload={}
headers = {}

response = requests.get(url, headers=headers, data=payload, auth=('USER', 'PASSWORD'))
response = requests.post(url, headers=headers, data=payload)

print(response.text)
print(response.text)
```
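
A hedged sketch of polling this endpoint until the task finishes; the `/status` suffix and the nested `status.status` field follow the Overlord task API, so verify them against your Druid version:

```python
import time
import requests

# Poll the task status endpoint until the ingestion task reaches a terminal state.
task_id = "query-3dc0c45d-34d7-4b15-86c9-cdb2d3ebfc4e"
url = f"http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/task/{task_id}/status"

while True:
    # The response nests the state under status.status (e.g. RUNNING, SUCCESS, FAILED).
    state = requests.get(url, headers={}).json()["status"]["status"]
    if state in ("SUCCESS", "FAILED"):
        break
    time.sleep(5)

print(state)
```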

@@ -363,7 +364,10 @@ import requests
url = "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/task/query-3dc0c45d-34d7-4b15-86c9-cdb2d3ebfc4e/reports"

headers = {}
response = requests.get(url, headers=headers, auth=('USER', 'PASSWORD'))

response = requests.post(url, headers=headers, data=payload)

print(response.text)
print(response.text)
```
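
A sketch of reading the report once it is fetched; the `multiStageQuery` and `payload` keys are assumptions about the report shape and may differ across versions:

```python
import requests

url = "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/task/query-3dc0c45d-34d7-4b15-86c9-cdb2d3ebfc4e/reports"

# Assumed report layout: {"multiStageQuery": {"payload": {"status": ..., "counters": ...}}}
report = requests.get(url, headers={}).json()
payload = report.get("multiStageQuery", {}).get("payload", {})

print(payload.get("status", {}).get("status"))   # e.g. RUNNING, SUCCESS, FAILED
print(list(payload.get("counters", {}).keys()))  # stage counter groups, if present
```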

@@ -826,11 +830,12 @@ import requests

url = "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/task/query-655efe33-781a-4c50-ae84-c2911b42d63c/shutdown"

payload={}
payload = {}
headers = {}

response = requests.post(url, headers=headers, data=payload, auth=('USER', 'PASSWORD'))
response = requests.post(url, headers=headers, data=payload)

print(response.text)
print(response.text)
```

4 changes: 3 additions & 1 deletion docs/configuration/index.md
@@ -596,7 +596,9 @@ need arises.
|`druid.centralizedDatasourceSchema.enabled`|Boolean flag for enabling datasource schema building in the Coordinator; this should be specified in the common runtime properties.|false|No.|
|`druid.indexer.fork.property.druid.centralizedDatasourceSchema.enabled`| This config should be set when CentralizedDatasourceSchema feature is enabled. This should be specified in the MiddleManager runtime properties.|false|No.|

For, stale schema cleanup configs, refer to properties with the prefix `druid.coordinator.kill.segmentSchema` in [Metadata Management](#metadata-management).
If you enable this feature, you can query datasources that are only stored in deep storage and are not loaded on a Historical. For more information, see [Query from deep storage](../querying/query-from-deep-storage.md).

For stale schema cleanup configs, refer to properties with the prefix `druid.coordinator.kill.segmentSchema` in [Metadata Management](#metadata-management).

### Ingestion security configuration

2 changes: 1 addition & 1 deletion docs/design/storage.md
@@ -67,7 +67,7 @@ On the Coordinator / Historical side:
Segments all have a four-part identifier with the following components:

- Datasource name.
- Time interval (for the time chunk containing the segment; this corresponds to the `segmentGranularity` specified at ingestion time).
- Time interval for the time chunk containing the segment; this corresponds to the `segmentGranularity` specified at ingestion time. Uses the same format as [query granularity](../querying/granularities.md).
- Version number (generally an ISO8601 timestamp corresponding to when the segment set was first started).
- Partition number (an integer, unique within a datasource+interval+version; may not necessarily be contiguous).
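
As a rough illustration of how these four parts combine into a segment identifier (the underscore-joined layout below is an assumption for the example; the authoritative rules live in Druid's `SegmentId` class):

```python
# Illustrative values only.
datasource = "wikipedia"
interval_start = "2024-08-01T00:00:00.000Z"  # time chunk start, set by segmentGranularity
interval_end = "2024-08-02T00:00:00.000Z"    # time chunk end
version = "2024-08-27T05:00:00.000Z"         # typically when the segment set was first started
partition_num = 3                            # unique within datasource + interval + version

segment_id = "_".join([datasource, interval_start, interval_end, version, str(partition_num)])
print(segment_id)
```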

3 changes: 3 additions & 0 deletions docs/development/extensions-contrib/iceberg.md
@@ -22,6 +22,9 @@ title: "Iceberg extension"
~ under the License.
-->

<!-- If the URL changes for this topic, make modifications
to Apache Iceberg docs: https://github.com/apache/iceberg/blob/main/docs/mkdocs.yml -->

## Iceberg Ingest extension

Apache Iceberg is an open table format for huge analytic datasets. [IcebergInputSource](../../ingestion/input-sources.md#iceberg-input-source) lets you ingest data stored in the Iceberg table format into Apache Druid. To use the iceberg extension, add the `druid-iceberg-extensions` to the list of loaded extensions. See [Loading extensions](../../configuration/extensions.md#loading-extensions) for more information.
@@ -160,8 +160,6 @@ The `tuningConfig` is optional. If no `tuningConfig` is specified, default param
|`resetOffsetAutomatically`|Boolean|Controls behavior when Druid needs to read RabbitMQ messages that are no longer available. Not supported. |no (default == false)|
|`skipSequenceNumberAvailabilityCheck`|Boolean|Whether to enable checking if the current sequence number is still available in a particular RabbitMQ stream. If set to false, the indexing task will attempt to reset the current sequence number (or not), depending on the value of `resetOffsetAutomatically`.|no (default == false)|
|`workerThreads`|Integer|The number of threads that the supervisor uses to handle requests/responses for worker tasks, along with any other internal asynchronous operation.|no (default == min(10, taskCount))|
|`chatAsync`|Boolean| If true, use asynchronous communication with indexing tasks, and ignore the `chatThreads` parameter. If false, use synchronous communication in a thread pool of size `chatThreads`. | no (default == true) |
|`chatThreads`|Integer| The number of threads that will be used for communicating with indexing tasks. Ignored if `chatAsync` is `true` (the default).| no (default == min(10, taskCount * replicas))|
|`chatRetries`|Integer|The number of times HTTP requests to indexing tasks will be retried before considering tasks unresponsive.| no (default == 8)|
|`httpTimeout`|ISO8601 Period|How long to wait for a HTTP response from an indexing task.|no (default == PT10S)|
|`shutdownTimeout`|ISO8601 Period|How long to wait for the supervisor to attempt a graceful shutdown of tasks before exiting.|no (default == PT80S)|
17 changes: 9 additions & 8 deletions docs/ingestion/ingestion-spec.md
@@ -232,7 +232,7 @@ A `dimensionsSpec` can have the following components:
| `spatialDimensions` | An array of [spatial dimensions](../querying/geo.md). | `[]` |
| `includeAllDimensions` | Note that this field only applies to string-based schema discovery where Druid ingests dimensions it discovers as strings. This is different from schema auto-discovery where Druid infers the type for data. You can set `includeAllDimensions` to true to ingest both explicit dimensions in the `dimensions` field and other dimensions that the ingestion task discovers from input data. In this case, the explicit dimensions will appear first in the order that you specify them, and the dimensions dynamically discovered will come after. This flag can be useful especially with auto schema discovery using [`flattenSpec`](./data-formats.md#flattenspec). If this is not set and the `dimensions` field is not empty, Druid will ingest only explicit dimensions. If this is not set and the `dimensions` field is empty, all discovered dimensions will be ingested. | false |
| `useSchemaDiscovery` | Configure Druid to use schema auto-discovery to discover some or all of the dimensions and types for your data. For any dimensions that aren't a uniform type, Druid ingests them as JSON. You can use this for native batch or streaming ingestion. | false |

| `forceSegmentSortByTime` | When set to true (the default), segments created by the ingestion job are sorted by `{__time, dimensions[0], dimensions[1], ...}`. When set to false, segments created by the ingestion job are sorted by `{dimensions[0], dimensions[1], ...}`. To include `__time` in the sort order when this parameter is set to `false`, you must include a dimension named `__time` with type `long` explicitly in the `dimensions` list.<br /><br />Setting this to `false` is an experimental feature; see [Sorting](partitioning.md#sorting) for details. | `true` |
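
To illustrate the `forceSegmentSortByTime` entry described in the table above, a `dimensionsSpec` along these lines (dimension names are illustrative) keeps `__time` in the sort order while sorting by another dimension first:

```python
# Sketch only: with forceSegmentSortByTime set to false, __time must be listed
# explicitly as a long dimension to remain part of the segment sort order.
dimensions_spec = {
    "dimensions": [
        "countryName",                       # segments sort by this dimension first
        {"type": "long", "name": "__time"},  # __time included explicitly
        "page",
    ],
    "forceSegmentSortByTime": False,
}
```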

#### Dimension objects

Expand Down Expand Up @@ -301,15 +301,15 @@ An example `metricsSpec` is:

### `granularitySpec`

The `granularitySpec` is located in `dataSchema` → `granularitySpec` and is responsible for configuring
the following operations:
The `granularitySpec`, located in `dataSchema` → `granularitySpec`, specifies the following:

1. Partitioning a datasource into [time chunks](../design/storage.md) (via `segmentGranularity`).
2. Truncating the timestamp, if desired (via `queryGranularity`).
3. Specifying which time chunks of segments should be created, for batch ingestion (via `intervals`).
4. Specifying whether ingestion-time [rollup](./rollup.md) should be used or not (via `rollup`).
1. `segmentGranularity` to partition a datasource into [time chunks](../design/storage.md).
2. `queryGranularity` to optionally truncate the timestamp.
3. `intervals` to define the time chunks of segments to create for batch ingestion.
4. `rollup` to enable ingestion-time [rollup](./rollup.md) or not.

Other than `rollup`, these operations are all based on the [primary timestamp](./schema-model.md#primary-timestamp).
Use the format from [query granularities](../querying/granularities.md) to specify both `segmentGranularity` and `queryGranularity`.
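
Putting these fields together, a rough sketch of a `granularitySpec` (the granularities and interval below are illustrative values) looks like:

```python
# Illustrative granularitySpec; adjust values to your datasource.
granularity_spec = {
    "type": "uniform",
    "segmentGranularity": "day",             # time-chunk partitioning
    "queryGranularity": "none",              # no timestamp truncation
    "intervals": ["2024-08-01/2024-08-02"],  # batch ingestion only
    "rollup": False,
}
```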

An example `granularitySpec` is:

@@ -520,6 +520,7 @@ For information on defining an `indexSpec` in a query context, see [SQL-based in
|stringDictionaryEncoding|Encoding format for string value dictionaries used by STRING and [COMPLEX&lt;json&gt;](../querying/nested-columns.md) columns. To enable front coding, set `stringDictionaryEncoding.type` to `frontCoded`. Optionally, you can specify the `bucketSize` and `formatVersion` properties. See [Front coding](#front-coding) for more information.|`{"type":"utf8"}`|
|metricCompression|Compression format for primitive type metric columns. Options are `lz4`, `lzf`, `zstd`, `uncompressed`, or `none` (which is more efficient than `uncompressed`, but not supported by older versions of Druid).|`lz4`|
|longEncoding|Encoding format for long-typed columns. Applies regardless of whether they are dimensions or metrics. Options are `auto` or `longs`. `auto` encodes the values using offset or lookup table depending on column cardinality, and store them with variable size. `longs` stores the value as-is with 8 bytes each.|`longs`|
|complexMetricCompression|Compression format for complex type metric columns. Options are `lz4`, `lzf`, `zstd`, `uncompressed`. Options other than `uncompressed` are not compatible with Druid versions older than 31, and only apply to complex metrics that do not have specialized column formats.|`uncompressed`|
|jsonCompression|Compression format to use for nested column raw data. Options are `lz4`, `lzf`, `zstd`, or `uncompressed`.|`lz4`|
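
A sketch of an `indexSpec` combining the options above (the specific choices are illustrative, not recommendations):

```python
# Illustrative indexSpec; only options listed in the table above are shown.
index_spec = {
    "stringDictionaryEncoding": {"type": "frontCoded", "bucketSize": 4},
    "metricCompression": "lz4",
    "longEncoding": "auto",
    "complexMetricCompression": "lz4",  # per the table, requires Druid 31 or later
    "jsonCompression": "lz4",
}
```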

#### Front coding
@@ -561,4 +562,4 @@ For example:
}
}
}
```
```
2 changes: 0 additions & 2 deletions docs/ingestion/kafka-ingestion.md
@@ -421,8 +421,6 @@ For configuration properties shared across all streaming ingestion methods, refe
|Property|Type|Description|Required|Default|
|--------|----|-----------|--------|-------|
|`numPersistThreads`|Integer|The number of threads to use to create and persist incremental segments on disk. Higher ingestion throughput produces more incremental segments, which costs significant CPU time to create on disk. For datasources with hundreds or thousands of columns, creating the incremental segments can take significant time, on the order of multiple seconds. In both scenarios, ingestion can stall or pause frequently and fall behind. You can use additional threads to parallelize segment creation without blocking ingestion, as long as sufficient CPU resources are available.|No|1|
|`chatAsync`|Boolean|If `true`, use asynchronous communication with indexing tasks, and ignore the `chatThreads` parameter. If `false`, use synchronous communication in a thread pool of size `chatThreads`.|No|`true`|
|`chatThreads`|Integer|The number of threads to use for communicating with indexing tasks. Ignored if `chatAsync` is `true`.|No|`min(10, taskCount * replicas)`|
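
For orientation, a minimal sketch of where `numPersistThreads` sits inside a Kafka supervisor's `tuningConfig` (the other values are illustrative assumptions):

```python
# Sketch of a Kafka supervisor tuningConfig fragment.
tuning_config = {
    "type": "kafka",
    "maxRowsInMemory": 150000,  # illustrative
    "numPersistThreads": 2,     # parallelize incremental-segment persists
}
```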

## Deployment notes on Kafka partitions and Druid segments

