Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

package org.opensearch.sql.calcite.big5;

import static org.opensearch.sql.util.MatcherUtils.assertYamlEqualsIgnoreId;

import java.io.IOException;
import org.junit.FixMethodOrder;
import org.junit.Test;
Expand Down Expand Up @@ -42,4 +44,32 @@ public void coalesce_nonexistent_field_fallback() throws IOException {
String ppl = sanitize(loadExpectedQuery("coalesce_nonexistent_field_fallback.ppl"));
timing(summary, "coalesce_nonexistent_field_fallback", ppl);
}

@Test
public void dedup_metrics_size_field() throws IOException {
String ppl = sanitize(loadExpectedQuery("dedup_metrics_size_field.ppl"));
timing(summary, "dedup_metrics_size_field", ppl);
String expected = loadExpectedPlan("dedup_metrics_size_field.yaml");
assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl));
}

@Test
public void parse_regex_with_cast_transformation() throws IOException {
String ppl = sanitize(loadExpectedQuery("parse_regex_with_cast_transformation.ppl"));
timing(summary, "parse_regex_with_cast_transformation", ppl);
String expected = loadExpectedPlan("parse_regex_with_cast_transformation.yaml");
assertYamlEqualsIgnoreId(expected, explainQueryYaml(ppl));
}

@Test
public void script_engine_like_pattern_with_aggregation() throws IOException {
String ppl = sanitize(loadExpectedQuery("script_engine_like_pattern_with_aggregation.ppl"));
timing(summary, "script_engine_like_pattern_with_aggregation", ppl);
}

@Test
public void script_engine_like_pattern_with_sort() throws IOException {
String ppl = sanitize(loadExpectedQuery("script_engine_like_pattern_with_sort.ppl"));
timing(summary, "script_engine_like_pattern_with_sort", ppl);
}
}
4 changes: 4 additions & 0 deletions integ-test/src/test/resources/big5/data/big5.json
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
{"index":{}}
{"message":"2023-04-30T21:48:56.160Z Apr 30 21:48:56 ip-66-221-134-40 journal: donkey glazer fly shark whip servant thornfalcon","process":{"name":"journal"},"aws.cloudwatch":{"ingestion_time":"2023-04-30T21:48:56.160Z","log_group":"/var/log/messages","log_stream":"luckcrafter"},"tags":["preserve_original_event"],"meta":{"file":"2023-01-02/1682891301-gotext.ndjson.gz"},"cloud":{"region":"eu-central-1"},"@timestamp":"2023-01-02T22:02:34.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"tmin":849,"size":1981},"log.file.path":"/var/log/messages/luckcrafter","event":{"id":"sunsetmark","dataset":"generic","ingested":"2023-07-20T03:36:30.223806Z"},"agent":{"id":"c315dc22-3ea6-44dc-8d56-fd02f675367b","name":"fancydancer","ephemeral_id":"c315dc22-3ea6-44dc-8d56-fd02f675367b","type":"filebeat","version":"8.8.0"}}
{"index":{}}
{"message":"2024-04-11T18:00:10.965Z Apr 11 18:00:10 ip-32-11-43-93 sshd: cloak bolt thorn hugger rib jackal wolverine shaker boar fighter taker boulderfox","process":{"name":"sshd"},"aws.cloudwatch":{"log_stream":"mirrorlighter","ingestion_time":"2024-04-11T18:00:10.965Z","log_group":"/var/log/messages"},"tags":["preserve_original_event"],"meta":{"file":"2024-04-11/1712851210-sshd.ndjson.gz"},"cloud":{"region":"ap-southeast-3"},"@timestamp":"2023-05-01T21:59:58.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"size":3166,"tmin":1},"log.file.path":"/var/log/messages/mirrorlighter","event":{"id":"patternantler","ingested":"2024-04-11T17:39:10.965818973Z","dataset":"generic"},"agent":{"id":"c79a289f-6c16-4de2-a6c8-8ee5c84473d5","name":"brindlehugger","type":"filebeat","version":"8.8.0","ephemeral_id":"c79a289f-6c16-4de2-a6c8-8ee5c84473d5"}}
{"index":{}}
{"message":"2024-04-11T10:15:01.628Z Apr 11 10:15:01 ip-95-21-51-112 kernel: kicker stinger slave dolphin sparkox","process":{"name":"kernel"},"aws.cloudwatch":{"log_stream":"plumebard","ingestion_time":"2024-04-11T10:15:01.628Z","log_group":"/var/log/messages"},"tags":["preserve_original_event"],"meta":{"file":"2024-04-11/1712826901-kernel.ndjson.gz"},"cloud":{"region":"ap-south-1"},"@timestamp":"2023-03-01T22:31:11.000Z","input":{"type":"aws-cloudwatch"},"metrics":{"size":3993,"tmin":1},"log.file.path":"/var/log/messages/plumebard","event":{"id":"chipgambler","ingested":"2024-04-11T10:09:29.628941177Z","dataset":"generic"},"agent":{"id":"5f25fa16-6a99-489f-b1c5-f27c0627a459","name":"lemongrabber","type":"filebeat","version":"8.8.0","ephemeral_id":"5f25fa16-6a99-489f-b1c5-f27c0627a459"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*
{
"name": "dedup_metrics_size_field",
"operation-type": "search",
"index": "{{index_name | default('custom-big5')}}",
"body": {
"query": {
"exists": {
"field": "metrics.size",
"boost": 1.0
}
},
"_source": {
"includes": ["agent", "process", "log", "message", "tags", "cloud", "input", "@timestamp", "ecs", "data_stream", "meta", "host", "metrics", "metrics.size", "aws", "event"],
"excludes": []
}
}
}
*/
source = big5
| dedup `metrics.size`
| sort - `@timestamp`
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
{
"name": "parse_regex_with_cast_transformation",
"operation-type": "search",
"index": "{{index_name | default('big5')}}",
"body": {
"query": {
"match_all": {}
},
"_source": {
"includes": ["log.file.path", "@timestamp"],
"excludes": []
},
"sort": [
{
"@timestamp": {
"order": "desc",
"missing": "_last"
}
}
]
}
}
*/
source = big5
| parse `log.file.path` '/var/log/(?<logType>\\w+)/(?<filename>\\w+)'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's use rex command

| eval filename_len = length(filename)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parse_regex_with_cast_transformation.ppl

cast is not accurate, remove it.

| fields `log.file.path`, logType, filename, filename_len, `@timestamp`
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: backtick is optional

| sort - `@timestamp`
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
{
"name": "script_engine_like_pattern_with_aggregation",
"operation-type": "search",
"index": "{{index_name | default('custom-big5')}}",
"body": {
"query": {
"script": {
"script": {
"source": "{\"langType\":\"calcite\",\"script\":\"...\"}",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is message field type?

"lang": "opensearch_compounded_script",
"params": {
"utcTimestamp": "{{current_timestamp}}"
}
},
"boost": 1.0
}
},
"_source": {
"includes": ["message", "metrics.size"],
"excludes": []
},
"aggregations": {
"composite_buckets": {
"composite": {
"size": 10000,
"sources": [
{
"metrics.size": {
"terms": {
"field": "metrics.size",
"missing_bucket": true,
"missing_order": "first",
"order": "asc"
}
}
}
]
}
}
}
}
}
*/
source = big5
| where like(`message`, '%sshd%')
| stats count() by `metrics.size`
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
{
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Query body should include size:10?

"name": "script_engine_like_pattern_with_sort",
"operation-type": "search",
"index": "{{index_name | default('big5')}}",
"body": {
"query": {
"script": {
"script": {
"source": "{\"langType\":\"calcite\",\"script\":\"...\"}",
"lang": "opensearch_compounded_script",
"params": {
"utcTimestamp": "{{current_timestamp}}"
}
},
"boost": 1.0
}
},
"_source": {
"includes": ["agent", "process", "log", "message", "tags", "cloud", "input", "@timestamp", "ecs", "data_stream", "meta", "host", "metrics", "aws", "event"],
"excludes": []
},
"sort": [
{
"@timestamp": {
"order": "desc",
"missing": "_last"
}
}
]
}
}
*/
source = big5
| where like(`message`, '%sshd%')
| sort - `@timestamp`
| head 10
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
calcite:
logical: |
LogicalSystemLimit(sort0=[$7], dir0=[DESC-nulls-last], fetch=[10000], type=[QUERY_SIZE_LIMIT])
LogicalProject(agent=[$0], process=[$6], log=[$8], message=[$11], tags=[$12], cloud=[$13], input=[$15], @timestamp=[$17], ecs=[$18], data_stream=[$20], meta=[$24], host=[$26], metrics=[$27], aws=[$30], event=[$35])
LogicalSort(sort0=[$17], dir0=[DESC-nulls-last])
LogicalProject(agent=[$0], agent.ephemeral_id=[$1], agent.id=[$2], agent.name=[$3], agent.type=[$4], agent.version=[$5], process=[$6], process.name=[$7], log=[$8], log.file=[$9], log.file.path=[$10], message=[$11], tags=[$12], cloud=[$13], cloud.region=[$14], input=[$15], input.type=[$16], @timestamp=[$17], ecs=[$18], ecs.version=[$19], data_stream=[$20], data_stream.dataset=[$21], data_stream.namespace=[$22], data_stream.type=[$23], meta=[$24], meta.file=[$25], host=[$26], metrics=[$27], metrics.size=[$28], metrics.tmin=[$29], aws=[$30], aws.cloudwatch=[$31], aws.cloudwatch.ingestion_time=[$32], aws.cloudwatch.log_group=[$33], aws.cloudwatch.log_stream=[$34], event=[$35], event.dataset=[$36], event.id=[$37], event.ingested=[$38], _id=[$39], _index=[$40], _score=[$41], _maxscore=[$42], _sort=[$43], _routing=[$44])
LogicalFilter(condition=[<=($45, 1)])
LogicalProject(agent=[$0], agent.ephemeral_id=[$1], agent.id=[$2], agent.name=[$3], agent.type=[$4], agent.version=[$5], process=[$6], process.name=[$7], log=[$8], log.file=[$9], log.file.path=[$10], message=[$11], tags=[$12], cloud=[$13], cloud.region=[$14], input=[$15], input.type=[$16], @timestamp=[$17], ecs=[$18], ecs.version=[$19], data_stream=[$20], data_stream.dataset=[$21], data_stream.namespace=[$22], data_stream.type=[$23], meta=[$24], meta.file=[$25], host=[$26], metrics=[$27], metrics.size=[$28], metrics.tmin=[$29], aws=[$30], aws.cloudwatch=[$31], aws.cloudwatch.ingestion_time=[$32], aws.cloudwatch.log_group=[$33], aws.cloudwatch.log_stream=[$34], event=[$35], event.dataset=[$36], event.id=[$37], event.ingested=[$38], _id=[$39], _index=[$40], _score=[$41], _maxscore=[$42], _sort=[$43], _routing=[$44], _row_number_dedup_=[ROW_NUMBER() OVER (PARTITION BY $28 ORDER BY $28)])
LogicalFilter(condition=[IS NOT NULL($28)])
CalciteLogicalIndexScan(table=[[OpenSearch, big5]])
physical: |
EnumerableLimit(fetch=[10000])
EnumerableSort(sort0=[$7], dir0=[DESC-nulls-last])
EnumerableCalc(expr#0..16=[{inputs}], expr#17=[1], expr#18=[<=($t16, $t17)], proj#0..12=[{exprs}], aws=[$t14], event=[$t15], $condition=[$t18])
EnumerableWindow(window#0=[window(partition {13} order by [13] rows between UNBOUNDED PRECEDING and CURRENT ROW aggs [ROW_NUMBER()])])
CalciteEnumerableIndexScan(table=[[OpenSearch, big5]], PushDownContext=[[PROJECT->[agent, process, log, message, tags, cloud, input, @timestamp, ecs, data_stream, meta, host, metrics, metrics.size, aws, event], FILTER->IS NOT NULL($13)], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","query":{"exists":{"field":"metrics.size","boost":1.0}},"_source":{"includes":["agent","process","log","message","tags","cloud","input","@timestamp","ecs","data_stream","meta","host","metrics","metrics.size","aws","event"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)])
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
calcite:
logical: |
LogicalSystemLimit(sort0=[$4], dir0=[DESC-nulls-last], fetch=[10000], type=[QUERY_SIZE_LIMIT])
LogicalSort(sort0=[$4], dir0=[DESC-nulls-last])
LogicalProject(log.file.path=[$10], logType=[ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'logType':VARCHAR)], filename=[ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'filename':VARCHAR)], filename_len=[CHAR_LENGTH(ITEM(PARSE($10, '/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR, 'regex':VARCHAR), 'filename':VARCHAR))], @timestamp=[$17])
CalciteLogicalIndexScan(table=[[OpenSearch, big5]])
physical: |
EnumerableCalc(expr#0..1=[{inputs}], expr#2=['/var/log/(?<logType>\w+)/(?<filename>\w+)':VARCHAR], expr#3=['regex':VARCHAR], expr#4=[PARSE($t0, $t2, $t3)], expr#5=['logType':VARCHAR], expr#6=[ITEM($t4, $t5)], expr#7=['filename':VARCHAR], expr#8=[ITEM($t4, $t7)], expr#9=[CHAR_LENGTH($t8)], log.file.path=[$t0], $f1=[$t6], $f2=[$t8], $f3=[$t9], @timestamp=[$t1])
CalciteEnumerableIndexScan(table=[[OpenSearch, big5]], PushDownContext=[[PROJECT->[log.file.path, @timestamp], SORT->[{
"@timestamp" : {
"order" : "desc",
"missing" : "_last"
}
}], LIMIT->10000], OpenSearchRequestBuilder(sourceBuilder={"from":0,"size":10000,"timeout":"1m","_source":{"includes":["log.file.path","@timestamp"],"excludes":[]},"sort":[{"@timestamp":{"order":"desc","missing":"_last"}}]}, requestedTotalSize=10000, pageSize=null, startFrom=0)])
Loading