-
Notifications
You must be signed in to change notification settings - Fork 3.8k
feat: Add configurable truncation for string columns #19146
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ed7de0a
9958e34
9033b75
4d6bb1f
47a0684
51b8fe2
faaa820
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,15 +21,26 @@ | |
|
|
||
| import com.fasterxml.jackson.annotation.JsonCreator; | ||
| import com.fasterxml.jackson.annotation.JsonIgnore; | ||
| import com.fasterxml.jackson.annotation.JsonInclude; | ||
| import com.fasterxml.jackson.annotation.JsonProperty; | ||
| import org.apache.druid.guice.BuiltInTypesModule; | ||
| import org.apache.druid.segment.DimensionHandler; | ||
| import org.apache.druid.segment.StringDimensionHandler; | ||
| import org.apache.druid.segment.column.ColumnType; | ||
|
|
||
| import javax.annotation.Nullable; | ||
|
|
||
| public class StringDimensionSchema extends DimensionSchema | ||
| { | ||
| private static final boolean DEFAULT_CREATE_BITMAP_INDEX = true; | ||
|
|
||
| public static int getDefaultMaxStringLength() | ||
| { | ||
| return BuiltInTypesModule.getMaxStringLength(); | ||
| } | ||
|
|
||
| private final int maxStringLength; | ||
|
|
||
| @JsonCreator | ||
| public static StringDimensionSchema create(String name) | ||
| { | ||
|
|
@@ -40,15 +51,33 @@ public static StringDimensionSchema create(String name) | |
| public StringDimensionSchema( | ||
| @JsonProperty("name") String name, | ||
| @JsonProperty("multiValueHandling") MultiValueHandling multiValueHandling, | ||
| @JsonProperty("createBitmapIndex") Boolean createBitmapIndex | ||
| @JsonProperty("createBitmapIndex") Boolean createBitmapIndex, | ||
| @JsonProperty("maxStringLength") @Nullable Integer maxStringLength | ||
|
Comment on lines
+54
to
+55
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. drive by comment (i'll have a closer look at rest of PR later) instead of adding additional arguments here, I was hoping to deprecate these arguments in favor of adding a column format spec similar to was done for auto/json columns in #17762, which could serve as a reference for how this should be wired up. I was planning to move the existing
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for taking a look @clintropolis. Adding something like
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. my preference would be that this refactor is done before a release that contains it so that we don't have to support both styles of configuration, apologies I didn't have a chance to do a review before it was merged |
||
| ) | ||
| { | ||
| super(name, multiValueHandling, createBitmapIndex == null ? DEFAULT_CREATE_BITMAP_INDEX : createBitmapIndex); | ||
| this.maxStringLength = maxStringLength != null && maxStringLength > 0 ? maxStringLength : getDefaultMaxStringLength(); | ||
| } | ||
|
|
||
| public StringDimensionSchema( | ||
| String name, | ||
| MultiValueHandling multiValueHandling, | ||
| Boolean createBitmapIndex | ||
| ) | ||
| { | ||
| this(name, multiValueHandling, createBitmapIndex, getDefaultMaxStringLength()); | ||
| } | ||
|
|
||
| public StringDimensionSchema(String name) | ||
| { | ||
| this(name, null, DEFAULT_CREATE_BITMAP_INDEX); | ||
| this(name, null, DEFAULT_CREATE_BITMAP_INDEX, getDefaultMaxStringLength()); | ||
| } | ||
|
|
||
| @JsonProperty | ||
| @JsonInclude(JsonInclude.Include.NON_DEFAULT) | ||
| public int getMaxStringLength() | ||
| { | ||
| return maxStringLength; | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -73,6 +102,6 @@ public boolean canBeMultiValued() | |
| @Override | ||
| public DimensionHandler getDimensionHandler() | ||
| { | ||
| return new StringDimensionHandler(getName(), getMultiValueHandling(), hasBitmapIndex(), false); | ||
| return new StringDimensionHandler(getName(), getMultiValueHandling(), hasBitmapIndex(), false, maxStringLength); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,6 +53,7 @@ public class BuiltInTypesModule implements DruidModule | |
| */ | ||
| private static DimensionSchema.MultiValueHandling STRING_MV_MODE = DimensionSchema.MultiValueHandling.SORTED_ARRAY; | ||
| private static IndexSpec DEFAULT_INDEX_SPEC = IndexSpec.builder().build(); | ||
| private static int MAX_STRING_LENGTH = 0; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it make sense to set this to Integer max value? In case this is used elsewhere in the future there wouldn't need to explicit handling for 0 like you have in truncateIfNeeded
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are using NON_DEFAULT to not serialize the default value and I think for integer jackson's default is 0. If we set the MAX_STRING_LENGTH default as int max, it'll serialize this value for each dimension. |
||
|
|
||
| /** | ||
| * @return the configured string multi value handling mode from the system config if set; otherwise, returns | ||
|
|
@@ -89,6 +90,7 @@ public void configure(Binder binder) | |
| public SideEffectRegisterer initDimensionHandlerAndMvHandlingMode(DefaultColumnFormatConfig formatsConfig) | ||
| { | ||
| setStringMultiValueHandlingModeIfConfigured(formatsConfig.getStringMultiValueHandlingMode()); | ||
| setMaxStringLengthIfConfigured(formatsConfig.getMaxStringLength()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can take a look at druid.indexing.formats.stringMultiValueHandlingMode in BuiltInTypesModuleTest It would be good to have some test coverage for the new property
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added tests for this property. |
||
| setIndexSpecDefaults(formatsConfig.getIndexSpec()); | ||
| setNestedColumnDefaults(formatsConfig); | ||
|
|
||
|
|
@@ -128,6 +130,24 @@ private static void registerSerde() | |
| } | ||
| } | ||
|
|
||
| private static void setMaxStringLengthIfConfigured(@Nullable Integer maxStringLength) | ||
| { | ||
| if (maxStringLength != null) { | ||
| MAX_STRING_LENGTH = maxStringLength; | ||
| } | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| public static void setMaxStringLength(int maxStringLength) | ||
| { | ||
| MAX_STRING_LENGTH = maxStringLength; | ||
| } | ||
|
|
||
| public static int getMaxStringLength() | ||
| { | ||
| return MAX_STRING_LENGTH; | ||
| } | ||
|
|
||
| private static void setStringMultiValueHandlingModeIfConfigured(@Nullable String stringMultiValueHandlingMode) | ||
| { | ||
| if (stringMultiValueHandlingMode != null) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -68,6 +68,21 @@ private static String validateMultiValueHandlingMode( | |
| return stringMultiValueHandlingMode; | ||
| } | ||
|
|
||
| @Nullable | ||
| private static Integer validateMaxStringLength(@Nullable Integer maxStringLength) | ||
| { | ||
| if (maxStringLength != null && maxStringLength <= 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current documentation notes set 0 to disable, but this check would fail startup if an operator sets it to 0 right? Should this check be If we do make the default be -1, then this check wouldn't be relevant anymore |
||
| throw DruidException.forPersona(DruidException.Persona.OPERATOR) | ||
| .ofCategory(DruidException.Category.INVALID_INPUT) | ||
| .build( | ||
| "Invalid value[%s] specified for 'druid.indexing.formats.maxStringLength'." | ||
| + " Value must be a positive integer.", | ||
| maxStringLength | ||
| ); | ||
| } | ||
| return maxStringLength; | ||
| } | ||
|
|
||
| @JsonProperty("stringMultiValueHandlingMode") | ||
| @Nullable | ||
| private final Integer nestedColumnFormatVersion; | ||
|
|
@@ -80,11 +95,16 @@ private static String validateMultiValueHandlingMode( | |
| @Nullable | ||
| private final IndexSpec indexSpec; | ||
|
|
||
| @JsonProperty("maxStringLength") | ||
| @Nullable | ||
| private final Integer maxStringLength; | ||
|
|
||
| @JsonCreator | ||
| public DefaultColumnFormatConfig( | ||
| @JsonProperty("stringMultiValueHandlingMode") @Nullable String stringMultiValueHandlingMode, | ||
| @JsonProperty("nestedColumnFormatVersion") @Nullable Integer nestedColumnFormatVersion, | ||
| @JsonProperty("indexSpec") @Nullable IndexSpec indexSpec | ||
| @JsonProperty("indexSpec") @Nullable IndexSpec indexSpec, | ||
| @JsonProperty("maxStringLength") @Nullable Integer maxStringLength | ||
| ) | ||
| { | ||
| validateMultiValueHandlingMode(stringMultiValueHandlingMode); | ||
|
|
@@ -93,6 +113,7 @@ public DefaultColumnFormatConfig( | |
| this.stringMultiValueHandlingMode = validateMultiValueHandlingMode(stringMultiValueHandlingMode); | ||
| this.nestedColumnFormatVersion = nestedColumnFormatVersion; | ||
| this.indexSpec = indexSpec; | ||
| this.maxStringLength = validateMaxStringLength(maxStringLength); | ||
| } | ||
|
|
||
| @Nullable | ||
|
|
@@ -116,6 +137,13 @@ public IndexSpec getIndexSpec() | |
| return indexSpec; | ||
| } | ||
|
|
||
| @Nullable | ||
| @JsonProperty("maxStringLength") | ||
| public Integer getMaxStringLength() | ||
| { | ||
| return maxStringLength; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object o) | ||
| { | ||
|
|
@@ -128,13 +156,14 @@ public boolean equals(Object o) | |
| DefaultColumnFormatConfig that = (DefaultColumnFormatConfig) o; | ||
| return Objects.equals(nestedColumnFormatVersion, that.nestedColumnFormatVersion) | ||
| && Objects.equals(stringMultiValueHandlingMode, that.stringMultiValueHandlingMode) | ||
| && Objects.equals(indexSpec, that.indexSpec); | ||
| && Objects.equals(indexSpec, that.indexSpec) | ||
| && Objects.equals(maxStringLength, that.maxStringLength); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() | ||
| { | ||
| return Objects.hash(nestedColumnFormatVersion, stringMultiValueHandlingMode, indexSpec); | ||
| return Objects.hash(nestedColumnFormatVersion, stringMultiValueHandlingMode, indexSpec, maxStringLength); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -144,6 +173,7 @@ public String toString() | |
| "stringMultiValueHandlingMode=" + stringMultiValueHandlingMode + | ||
| ", nestedColumnFormatVersion=" + nestedColumnFormatVersion + | ||
| ", indexSpec=" + indexSpec + | ||
| ", maxStringLength=" + maxStringLength + | ||
| '}'; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |||||||||
| import org.apache.druid.collections.bitmap.BitmapFactory; | ||||||||||
| import org.apache.druid.collections.bitmap.MutableBitmap; | ||||||||||
| import org.apache.druid.data.input.impl.DimensionSchema.MultiValueHandling; | ||||||||||
| import org.apache.druid.data.input.impl.StringDimensionSchema; | ||||||||||
| import org.apache.druid.error.DruidException; | ||||||||||
| import org.apache.druid.java.util.common.ISE; | ||||||||||
| import org.apache.druid.java.util.common.StringUtils; | ||||||||||
|
|
@@ -57,18 +58,38 @@ public class StringDimensionIndexer extends DictionaryEncodedColumnIndexer<int[] | |||||||||
| private final MultiValueHandling multiValueHandling; | ||||||||||
| private final boolean hasBitmapIndexes; | ||||||||||
| private final boolean hasSpatialIndexes; | ||||||||||
| private final int maxStringLength; | ||||||||||
| private volatile boolean hasMultipleValues = false; | ||||||||||
|
|
||||||||||
| public StringDimensionIndexer( | ||||||||||
| @Nullable MultiValueHandling multiValueHandling, | ||||||||||
| boolean hasBitmapIndexes, | ||||||||||
| boolean hasSpatialIndexes | ||||||||||
| ) | ||||||||||
| { | ||||||||||
| this(multiValueHandling, hasBitmapIndexes, hasSpatialIndexes, StringDimensionSchema.getDefaultMaxStringLength()); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| public StringDimensionIndexer( | ||||||||||
| @Nullable MultiValueHandling multiValueHandling, | ||||||||||
| boolean hasBitmapIndexes, | ||||||||||
| boolean hasSpatialIndexes, | ||||||||||
| int maxStringLength | ||||||||||
| ) | ||||||||||
| { | ||||||||||
| super(new StringDimensionDictionary()); | ||||||||||
| this.multiValueHandling = multiValueHandling == null ? MultiValueHandling.ofDefault() : multiValueHandling; | ||||||||||
| this.hasBitmapIndexes = hasBitmapIndexes; | ||||||||||
| this.hasSpatialIndexes = hasSpatialIndexes; | ||||||||||
| this.maxStringLength = maxStringLength; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| private String truncateIfNeeded(String value) | ||||||||||
|
Comment on lines
+86
to
+87
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a short javadoc here
Suggested change
For the future, it would be helpful to have a counter when this truncation occurs, so we have visibility into data integrity. The counter can then be periodically emitted, similar to thrownAway/dropped event metrics, etc. |
||||||||||
| { | ||||||||||
| if (maxStringLength > 0 && value != null && value.length() > maxStringLength) { | ||||||||||
| return value.substring(0, maxStringLength); | ||||||||||
| } | ||||||||||
| return value; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| @Override | ||||||||||
|
|
@@ -92,7 +113,7 @@ public EncodedKeyComponent<int[]> processRowValsToUnsortedEncodedKeyComponent(@N | |||||||||
| dimLookup.add(null); | ||||||||||
| encodedDimensionValues = IntArrays.EMPTY_ARRAY; | ||||||||||
| } else if (dimValuesList.size() == 1) { | ||||||||||
| encodedDimensionValues = new int[]{dimLookup.add(Evals.asString(dimValuesList.get(0)))}; | ||||||||||
| encodedDimensionValues = new int[]{dimLookup.add(truncateIfNeeded(Evals.asString(dimValuesList.get(0))))}; | ||||||||||
| } else { | ||||||||||
| hasMultipleValues = true; | ||||||||||
| final String[] dimensionValues = new String[dimValuesList.size()]; | ||||||||||
|
|
@@ -125,7 +146,7 @@ public EncodedKeyComponent<int[]> processRowValsToUnsortedEncodedKeyComponent(@N | |||||||||
| encodedDimensionValues = | ||||||||||
| new int[]{dimLookup.add(Evals.asString(StringUtils.encodeBase64String((byte[]) dimValues)))}; | ||||||||||
| } else { | ||||||||||
| encodedDimensionValues = new int[]{dimLookup.add(Evals.asString(dimValues))}; | ||||||||||
| encodedDimensionValues = new int[]{dimLookup.add(truncateIfNeeded(Evals.asString(dimValues)))}; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // If dictionary size has changed, the sorted lookup is no longer valid. | ||||||||||
|
|
||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should the default be -1? 0 seems counterintuitive
Also making it -1 will align with
maxBytesInMemory,maxParseExceptionsetc