-
Notifications
You must be signed in to change notification settings - Fork 81
Added require operation
#1715
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Added require operation
#1715
Changes from all commits
6cde7b9
4557153
0329c9c
af2d0e3
d5c55d7
0a70e1a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| package org.jetbrains.kotlinx.dataframe.api | ||
|
|
||
| import org.jetbrains.kotlinx.dataframe.ColumnSelector | ||
| import org.jetbrains.kotlinx.dataframe.DataFrame | ||
| import org.jetbrains.kotlinx.dataframe.annotations.DataSchema | ||
| import org.jetbrains.kotlinx.dataframe.annotations.Interpretable | ||
| import org.jetbrains.kotlinx.dataframe.annotations.Refine | ||
| import org.jetbrains.kotlinx.dataframe.impl.api.requireImpl | ||
| import kotlin.reflect.typeOf | ||
|
|
||
| /** | ||
| * Resolves [column] in this [DataFrame] and checks that its runtime type is a subtype of [C]. | ||
| * Throws if the column can't be resolved or if its type doesn't match. | ||
| * | ||
| * From the compiler plugin's perspective, a new column will appear in the compile-time schema as a result of this operation. | ||
| * | ||
| * The aim is to help incrementally migrate workflows to the extension properties API. | ||
| * | ||
| * Consider declaring a [DataSchema] and using [cast] or [convertTo] if you end up with more than a few `requireColumn` calls. | ||
| * | ||
| * Example: | ||
| * | ||
| * ```kotlin | ||
| * val repos = DataFrame | ||
| * .readCsv("https://raw.githubusercontent.com/Kotlin/dataframe/master/data/jetbrains_repositories.csv") | ||
| * | ||
| * repos | ||
| * .filter { "stargazers_count"<Int>() > 100 } | ||
| * .sortByDesc("stargazers_count") | ||
| * .select("full_name", "stargazers_count") | ||
| * ``` | ||
| * | ||
| * Notice how the `"stargazers_count"` string is repeated three times. We can refactor this code using `requireColumn`: | ||
| * | ||
| * ```kotlin | ||
| * val repos = DataFrame | ||
| * .readCsv("https://raw.githubusercontent.com/Kotlin/dataframe/master/data/jetbrains_repositories.csv") | ||
| * .requireColumn { "stargazers_count"<Int>() } | ||
| * | ||
| * repos | ||
| * .filter { stargazers_count > 100 } | ||
| * .sortByDesc { stargazers_count } | ||
| * .select { "full_name" and stargazers_count } | ||
| * ``` | ||
| * | ||
| * This way the code becomes a bit more robust. For example, usages of a renamed column become compile-time errors that are easy to spot and update: | ||
| * | ||
| * ```kotlin | ||
| * val repos = DataFrame | ||
| * .readCsv("https://raw.githubusercontent.com/Kotlin/dataframe/master/data/jetbrains_repositories.csv") | ||
| * .requireColumn { "stargazers_count"<Int>() } | ||
| * .rename { stargazers_count }.into("stars") | ||
| * | ||
| * repos | ||
| * .filter { stars > 100 } | ||
| * .sortByDesc { stars } | ||
| * .select { "full_name" and stars } | ||
| * ``` | ||
| * | ||
| * @param column selector of the column to check; [C] is the type the column is required to have. | ||
| * @return this [DataFrame]; the check does not modify it. | ||
| * @throws IllegalArgumentException if the resolved column's type is not a subtype of [C]. | ||
| */ | ||
| @Refine | ||
| @Interpretable("Require0") | ||
| public inline fun <T, reified C> DataFrame<T>.requireColumn(noinline column: ColumnSelector<T, C>): DataFrame<T> = | ||
| requireImpl(column, typeOf<C>()) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| package org.jetbrains.kotlinx.dataframe.impl.api | ||
|
|
||
| import org.jetbrains.kotlinx.dataframe.ColumnSelector | ||
| import org.jetbrains.kotlinx.dataframe.DataFrame | ||
| import org.jetbrains.kotlinx.dataframe.api.getColumnWithPath | ||
| import org.jetbrains.kotlinx.dataframe.api.isSubtypeOf | ||
| import org.jetbrains.kotlinx.dataframe.type | ||
| import kotlin.reflect.KType | ||
|
|
||
| /** | ||
| * Runtime check behind `requireColumn`: resolves [column] in this [DataFrame] and verifies that the resolved column is a subtype of [type]. | ||
| * | ||
| * NOTE(review): a column that can't be resolved presumably fails inside [getColumnWithPath], before the type check runs — confirm. | ||
| * | ||
| * @param column selector of the column to check. | ||
| * @param type the type the resolved column is required to be a subtype of. | ||
| * @return this [DataFrame], returned as-is when the check passes. | ||
| * @throws IllegalArgumentException if the resolved column is not a subtype of [type]; the message names the column path, its actual type and the required type. | ||
| */ | ||
| @PublishedApi | ||
| internal fun <T, C> DataFrame<T>.requireImpl(column: ColumnSelector<T, C>, type: KType): DataFrame<T> { | ||
| val resolvedColumn = getColumnWithPath(column) | ||
| val actualType = resolvedColumn.data.type | ||
| require(resolvedColumn.data.isSubtypeOf(type)) { | ||
| "Column '${resolvedColumn.path.joinToString()}' has type '$actualType', which is not subtype of required '$type' type." | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. *a subtype of the required '$type' type. |
||
| } | ||
| return this | ||
| }
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| package org.jetbrains.kotlinx.dataframe.api | ||
|
|
||
| import io.kotest.assertions.throwables.shouldThrow | ||
| import io.kotest.assertions.throwables.shouldThrowAny | ||
| import io.kotest.matchers.shouldBe | ||
| import org.junit.Test | ||
|
|
||
| /** | ||
| * Tests for `requireColumn`: the success path, the type-mismatch error, and the messages reported when a column or one of its parent groups can't be resolved. | ||
| * | ||
| * The `df` fixture is not declared in this class; presumably it is inherited from [ColumnsSelectionDslTests] — confirm. | ||
| */ | ||
| class RequireTests : ColumnsSelectionDslTests() { | ||
|
|
||
| @Test | ||
| fun `require returns same dataframe for existing typed column`() { | ||
| val checked = df.requireColumn { "name"["firstName"]<String>() } | ||
| checked shouldBe df | ||
| } | ||
|
|
||
| @Test | ||
| fun `require throws on type mismatch`() { | ||
| val throwable = shouldThrow<IllegalArgumentException> { | ||
| df.requireColumn { "name"["firstName"]<Int>() } | ||
| } | ||
| throwable.message shouldBe | ||
| "Column 'name/firstName' has type 'kotlin.String', which is not subtype of required 'kotlin.Int' type." | ||
| } | ||
|
|
||
| @Test | ||
| fun `require throws when column cannot be resolved`() { | ||
| val exception = shouldThrowAny { | ||
| df.requireColumn { "name"["unknown"]<String>() } | ||
| } | ||
| exception.message shouldBe | ||
| "Column 'name/unknown' not found among columns of 'name': [firstName, lastName]." | ||
| } | ||
|
|
||
| @Test | ||
| fun `require missing parent message includes available columns`() { | ||
| val exception = shouldThrowAny { | ||
| df.requireColumn { "name2"["unknown"]<String>() } | ||
| } | ||
| exception.message shouldBe | ||
| "Column 'name2' not found among [name, age, city, weight, isHappy]." | ||
| } | ||
|
|
||
| @Test | ||
| fun `require deep missing parent message uses nearest existing ancestor`() { | ||
| val exception = shouldThrowAny { | ||
| df.requireColumn { "name"["unknownGroup"]["value"]<String>() } | ||
| } | ||
| exception.message shouldBe | ||
| "Column 'name/unknownGroup' not found among columns of 'name': [firstName, lastName]." | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| [//]: # (title: requireColumn) | ||
| <!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Require--> | ||
|
|
||
| Throws an exception if the specified column is missing or its type is not a subtype of `C`. | ||
| From the compiler plugin perspective, a new column will appear in the compile-time schema as a result of this operation. | ||
| The aim is to help incrementally migrate workflows to the [extension properties API](extensionPropertiesApi.md). | ||
| Consider declaring a [DataSchema](dataSchema.md) and using [](cast.md) or [](convertTo.md) if you end up with more than a few `requireColumn` calls. | ||
|
|
||
| Supported by the compiler plugin starting from IntelliJ IDEA 2026.2 and Kotlin 2.4.0. | ||
|
|
||
| ```text | ||
| requireColumn { column } | ||
| ``` | ||
|
|
||
| **Related operations**: [](cast.md), [](convertTo.md) | ||
|
|
||
| ```kotlin | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I cannot do it yet because require is not supported in compiler plugin :( But i'll do after we update to 2.4.0-RC or something
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. but i'll update the code snippet
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I really didn't think about that, sorry 😄 !
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| // Before `requireColumn`, the extension property is not resolved: | ||
| // peopleDf.select { name.firstName } | ||
|
|
||
| // Require a column with a runtime check | ||
| val df = peopleDf.requireColumn { "name"["firstName"]<String>() } | ||
| // Use extension property after `requireColumn` | ||
| val v: String = df.name.firstName[0] | ||
| ``` | ||
|
|
||
| ### Advanced example | ||
|
|
||
| Let's start with a pipeline that uses only String Column Accessors and String API overloads: | ||
|
|
||
| ```kotlin | ||
| val repos = DataFrame | ||
| .readCsv("https://raw.githubusercontent.com/Kotlin/dataframe/master/data/jetbrains_repositories.csv") | ||
|
|
||
| repos | ||
| .filter { "stargazers_count"<Int>() > 100 } | ||
| .sortByDesc("stargazers_count") | ||
| .select("full_name", "stargazers_count") | ||
| ``` | ||
|
|
||
| Notice how the `"stargazers_count"` string is repeated three times. We can refactor this code using `requireColumn`: | ||
|
|
||
| ```kotlin | ||
| val repos = DataFrame | ||
| .readCsv("https://raw.githubusercontent.com/Kotlin/dataframe/master/data/jetbrains_repositories.csv") | ||
| .requireColumn { "stargazers_count"<Int>() } | ||
|
|
||
| repos | ||
| .filter { stargazers_count > 100 } | ||
| .sortByDesc { stargazers_count } | ||
| .select { "full_name" and stargazers_count } | ||
| ``` | ||
|
|
||
| This way the code becomes a bit more robust. For example, usages of a renamed column become compile-time errors that are easy to spot and update: | ||
|
|
||
| ```kotlin | ||
| val repos = DataFrame | ||
| .readCsv("https://raw.githubusercontent.com/Kotlin/dataframe/master/data/jetbrains_repositories.csv") | ||
| .requireColumn { "stargazers_count"<Int>() } | ||
| .rename { stargazers_count }.into("stars") | ||
|
|
||
| repos | ||
| .filter { stars > 100 } | ||
| .sortByDesc { stars } | ||
| .select { "full_name" and stars } | ||
| ``` | ||
Uh oh!
There was an error while loading. Please reload this page.