Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions sjsonnet/src-js/sjsonnet/Platform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import java.util.regex.Pattern
import scala.collection.mutable

object Platform {
// Scala.js Long is expensive, so keep the ASCII stripChars fast path on Int masks.
final val useIntStripCharsBitset: Boolean = true

/**
 * Returns `s.length * count` when `count` is positive and the product fits in an `Int`;
 * returns 0 otherwise (non-positive `count`, or a product that would overflow).
 */
private def repeatCapacity(s: String, count: Int): Int = {
  val len = s.length
  // Short-circuit on count <= 0 first so the division below never sees a zero divisor.
  if (count <= 0 || len > Int.MaxValue / count) 0
  else len * count
}

Expand Down
3 changes: 3 additions & 0 deletions sjsonnet/src-jvm/sjsonnet/Platform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ import scala.jdk.CollectionConverters.*
object Platform {
private val hexFormat = HexFormat.of()

// JVM JMH runs faster with Int masks and the ASCII single-char shortcut.
final val useIntStripCharsBitset: Boolean = true

/**
 * Concatenates `count` copies of `s` via `String.repeat`.
 *
 * @return the repeated string, or the empty string when `count` is zero or negative
 */
def repeatString(s: String, count: Int): String = {
  if (count > 0) s.repeat(count)
  else ""
}

Expand Down
3 changes: 3 additions & 0 deletions sjsonnet/src-native/sjsonnet/Platform.scala
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ import org.virtuslab.yaml.*
object Platform {
private val hexChars = "0123456789abcdef".toCharArray

// Local hyperfine runs on Scala Native are steadier with the Long-mask path than with the Int masks and single-char shortcut.
final val useIntStripCharsBitset: Boolean = false

/**
 * Overflow-safe `s.length * count`: yields 0 when `count` is not positive or when the
 * product would exceed `Int.MaxValue`.
 */
private def repeatCapacity(s: String, count: Int): Int = {
  val unit = s.length
  // count > 0 is checked before dividing, so the divisor is never zero.
  val fits = count > 0 && unit <= Int.MaxValue / count
  if (fits) unit * count else 0
}

Expand Down
183 changes: 151 additions & 32 deletions sjsonnet/src/sjsonnet/stdlib/StringModule.scala
Original file line number Diff line number Diff line change
Expand Up @@ -240,26 +240,30 @@ object StringModule extends AbstractFunctionModule {
}

private object StripUtils {
/**
 * Collects the distinct Unicode code points of `str` into a set. Iteration advances by
 * `Character.charCount`, so a surrogate pair contributes one supplementary code point.
 */
def codePointsSet(str: String): collection.Set[Int] = {
  val builder = Set.newBuilder[Int]
  builder.sizeHint(str.codePointCount(0, str.length))

  @scala.annotation.tailrec
  def collect(offset: Int): Unit =
    if (offset < str.length) {
      val cp = str.codePointAt(offset)
      builder += cp
      collect(offset + Character.charCount(cp))
    }

  collect(0)
  builder.result()
}

/**
* Optimized strip implementation for std.stripChars/lstripChars/rstripChars. Most strip sets
* are ASCII/BMP delimiters; handling them before building a boxed Set[Int] keeps the hot path
* allocation-light and lets the JVM inline the membership checks.
* Optimized strip implementation for std.stripChars/lstripChars/rstripChars. ASCII delimiter
* sets use mask checks without allocating a Set or BitSet. Non-ASCII BMP delimiter sets keep
* the current BitSet path, and surrogate-containing delimiter sets fall back to codepoint
* iteration.
*/
def strip(str: String, chars: String, left: Boolean, right: Boolean): String = {
if (str.isEmpty || chars.isEmpty) return str

// JVM/JS benchmark faster with Int masks and the single-char shortcut. Scala Native's
// LLVM output is faster on the Long-mask path, so it deliberately skips that shortcut.
if (sjsonnet.Platform.useIntStripCharsBitset) {
if (chars.length == 1) {
val ch = chars.charAt(0)
if (ch.toInt < 128) return stripSingleChar(str, ch, left, right)
}
val ascii = stripAsciiInt(str, chars, left, right)
if (ascii != null) return ascii
} else {
val ascii = stripAsciiLong(str, chars, left, right)
if (ascii != null) return ascii
}

val single = singleBmpNonSurrogate(chars)
if (single >= 0) {
return stripSingleChar(str, single.toChar, left, right)
Expand All @@ -273,6 +277,88 @@ object StringModule extends AbstractFunctionModule {
unspecializedStrip(str, codePointsSet(chars), left, right)
}

/**
 * Strip attempt for delimiter sets made only of chars below 128, using two `Long`
 * masks (chars 0..63 and 64..127) as the membership table.
 *
 * @param str      the string to strip
 * @param charsStr the characters to remove from the ends of `str`
 * @param left     strip from the start of `str`
 * @param right    strip from the end of `str`
 * @return the stripped string, or `null` when `charsStr` contains a char at or above 128
 */
private def stripAsciiLong(
    str: String,
    charsStr: String,
    left: Boolean,
    right: Boolean): String = {
  var lowMask = 0L  // membership bits for chars 0..63
  var highMask = 0L // membership bits for chars 64..127
  var onlyAscii = true
  var idx = 0
  while (onlyAscii && idx < charsStr.length) {
    val code = charsStr.charAt(idx).toInt
    if (code >= 128) onlyAscii = false
    else if (code >= 64) highMask |= 1L << (code - 64)
    else lowMask |= 1L << code
    idx += 1
  }
  if (onlyAscii) asciiStripLong(str, lowMask, highMask, left, right) else null
}

/**
 * Membership test against a 128-bit table split across two `Long`s: bit `n` of `loBits`
 * stands for char `n` (0..63), bit `n` of `hiBits` for char `n + 64` (64..127).
 * Chars at or above 128 are never members.
 */
@inline private def asciiContainsLong(loBits: Long, hiBits: Long, c: Char): Boolean = {
  val code = c.toInt
  if (code >= 128) false
  else if (code >= 64) (hiBits & (1L << (code - 64))) != 0L
  else (loBits & (1L << code)) != 0L
}

/**
 * Strip attempt for delimiter sets made only of chars below 128, using four `Int`
 * masks (32 chars each) as the membership table.
 *
 * @param str      the string to strip
 * @param charsStr the characters to remove from the ends of `str`
 * @param left     strip from the start of `str`
 * @param right    strip from the end of `str`
 * @return the stripped string, or `null` when `charsStr` contains a char at or above 128
 */
private def stripAsciiInt(
    str: String,
    charsStr: String,
    left: Boolean,
    right: Boolean): String = {
  var word0 = 0 // chars 0..31
  var word1 = 0 // chars 32..63
  var word2 = 0 // chars 64..95
  var word3 = 0 // chars 96..127
  var onlyAscii = true
  var idx = 0
  while (onlyAscii && idx < charsStr.length) {
    val code = charsStr.charAt(idx).toInt
    if (code >= 128) onlyAscii = false
    else if (code >= 96) word3 |= 1 << (code - 96)
    else if (code >= 64) word2 |= 1 << (code - 64)
    else if (code >= 32) word1 |= 1 << (code - 32)
    else word0 |= 1 << code
    idx += 1
  }
  if (onlyAscii) asciiStripInt(str, word0, word1, word2, word3, left, right) else null
}

/**
 * Membership test against a 128-bit table split across four `Int`s: `bits0` covers chars
 * 0..31, `bits1` 32..63, `bits2` 64..95 and `bits3` 96..127, one bit per char.
 * Chars at or above 128 are never members.
 */
@inline private def asciiContainsInt(
    bits0: Int,
    bits1: Int,
    bits2: Int,
    bits3: Int,
    c: Char): Boolean = {
  val code = c.toInt
  if (code >= 128) false
  else if (code >= 96) (bits3 & (1 << (code - 96))) != 0
  else if (code >= 64) (bits2 & (1 << (code - 64))) != 0
  else if (code >= 32) (bits1 & (1 << (code - 32))) != 0
  else (bits0 & (1 << code)) != 0
}

/**
 * Builds the set of distinct Unicode code points occurring in `str`. The scan steps by
 * `Character.charCount`, so each surrogate pair is read as one supplementary code point.
 */
def codePointsSet(str: String): collection.Set[Int] = {
  val builder = Set.newBuilder[Int]
  builder.sizeHint(str.codePointCount(0, str.length))

  @scala.annotation.tailrec
  def collect(offset: Int): Unit =
    if (offset < str.length) {
      val cp = str.codePointAt(offset)
      builder += cp
      collect(offset + Character.charCount(cp))
    }

  collect(0)
  builder.result()
}

def unspecializedStrip(
str: String,
charsSet: collection.Set[Int],
Expand Down Expand Up @@ -343,6 +429,48 @@ object StringModule extends AbstractFunctionModule {
}
str.substring(start, end)
}

/**
 * Trims chars that are members of the two-`Long` mask from the requested ends of `str`.
 *
 * @param loBits membership bits for chars 0..63
 * @param hiBits membership bits for chars 64..127
 * @param left   trim from the start of `str`
 * @param right  trim from the end of `str`
 * @return `str` itself when nothing was trimmed, otherwise the remaining substring
 */
private def asciiStripLong(
    str: String,
    loBits: Long,
    hiBits: Long,
    left: Boolean,
    right: Boolean): String = {
  val len = str.length
  var from = 0
  var until = len
  if (left) {
    while (from < until && asciiContainsLong(loBits, hiBits, str.charAt(from))) {
      from += 1
    }
  }
  if (right) {
    while (until > from && asciiContainsLong(loBits, hiBits, str.charAt(until - 1))) {
      until -= 1
    }
  }
  // Hand back the same instance when neither end moved.
  if (from > 0 || until < len) str.substring(from, until)
  else str
}

/**
 * Trims chars that are members of the four-`Int` mask from the requested ends of `str`.
 *
 * @param bits0 membership bits for chars 0..31
 * @param bits1 membership bits for chars 32..63
 * @param bits2 membership bits for chars 64..95
 * @param bits3 membership bits for chars 96..127
 * @param left  trim from the start of `str`
 * @param right trim from the end of `str`
 * @return `str` itself when nothing was trimmed, otherwise the remaining substring
 */
private def asciiStripInt(
    str: String,
    bits0: Int,
    bits1: Int,
    bits2: Int,
    bits3: Int,
    left: Boolean,
    right: Boolean): String = {
  val len = str.length
  var from = 0
  var until = len
  if (left) {
    while (from < until && asciiContainsInt(bits0, bits1, bits2, bits3, str.charAt(from))) {
      from += 1
    }
  }
  if (right) {
    while (until > from && asciiContainsInt(bits0, bits1, bits2, bits3, str.charAt(until - 1))) {
      until -= 1
    }
  }
  // Hand back the same instance when neither end moved.
  if (from > 0 || until < len) str.substring(from, until)
  else str
}
}

/**
Expand All @@ -354,14 +482,11 @@ object StringModule extends AbstractFunctionModule {
*/
// Builtin "stripChars": delegates to StripUtils.strip with both left and right enabled.
private object StripChars extends Val.Builtin2("stripChars", "str", "chars") {
def evalRhs(str: Eval, chars: Eval, ev: EvalScope, pos: Position): Val = {
// chars is read before str; presumably this makes an error in `chars` surface first,
// which is what the stripCharsForcesCharsBeforeStr test expects — confirm.
val charsStr = chars.value.asString
val strValue = str.value.asString
Val.Str(
pos,
// NOTE(review): the multi-line call below and the one-line call after it both appear
// in this view (looks like diff residue) — confirm only the one-line form remains.
StripUtils.strip(
str.value.asString,
chars.value.asString,
left = true,
right = true
)
StripUtils.strip(strValue, charsStr, left = true, right = true)
)
}
}
Expand All @@ -375,14 +500,11 @@ object StringModule extends AbstractFunctionModule {
*/
// Builtin "lstripChars": delegates to StripUtils.strip with only the left side enabled.
private object LStripChars extends Val.Builtin2("lstripChars", "str", "chars") {
def evalRhs(str: Eval, chars: Eval, ev: EvalScope, pos: Position): Val = {
// chars is read before str; presumably this makes an error in `chars` surface first,
// which is what the stripCharsForcesCharsBeforeStr test expects — confirm.
val charsStr = chars.value.asString
val strValue = str.value.asString
Val.Str(
pos,
// NOTE(review): the multi-line call below and the one-line call after it both appear
// in this view (looks like diff residue) — confirm only the one-line form remains.
StripUtils.strip(
str.value.asString,
chars.value.asString,
left = true,
right = false
)
StripUtils.strip(strValue, charsStr, left = true, right = false)
)
}
}
Expand All @@ -396,14 +518,11 @@ object StringModule extends AbstractFunctionModule {
*/
// Builtin "rstripChars": delegates to StripUtils.strip with only the right side enabled.
private object RStripChars extends Val.Builtin2("rstripChars", "str", "chars") {
def evalRhs(str: Eval, chars: Eval, ev: EvalScope, pos: Position): Val = {
// chars is read before str; presumably this makes an error in `chars` surface first,
// which is what the stripCharsForcesCharsBeforeStr test expects — confirm.
val charsStr = chars.value.asString
val strValue = str.value.asString
Val.Str(
pos,
// NOTE(review): the multi-line call below and the one-line call after it both appear
// in this view (looks like diff residue) — confirm only the one-line form remains.
StripUtils.strip(
str.value.asString,
chars.value.asString,
left = false,
right = true
)
StripUtils.strip(strValue, charsStr, left = false, right = true)
)
}
}
Expand Down
41 changes: 40 additions & 1 deletion sjsonnet/test/src/sjsonnet/StdStripCharsTests.scala
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package sjsonnet

import sjsonnet.TestUtils.eval
import sjsonnet.TestUtils.{eval, evalErr}
import utest._
object StdStripCharsTests extends TestSuite {
private val party = new String(Array(0xd83c.toChar, 0xdf89.toChar))
private val partyEsc = "\\" + "uD83C" + "\\" + "uDF89"
private val eAcuteEsc = "\\" + "u00e9"

def tests: Tests = Tests {
test("stdRStripChars") {
Expand Down Expand Up @@ -32,6 +35,42 @@ object StdStripCharsTests extends TestSuite {

eval("std.stripChars(\"[aaabbbbcccc]\", \"ac[]\")").toString() ==> """"bbbb""""
}
test("asciiStripCharsKeepsNonAsciiCodepoints") {
eval(s"""std.stripChars("--${partyEsc}hello${partyEsc}--", "-")""").str ==> s"${party}hello${party}"
eval(s"""std.lstripChars("--${partyEsc}hello", "-")""").str ==> s"${party}hello"
eval(s"""std.rstripChars("hello${partyEsc}--", "-")""").str ==> s"hello${party}"
}
test("stripCharsFallsBackForNonAsciiStripSet") {
eval(s"""std.stripChars("${partyEsc}-hello-${partyEsc}", "${partyEsc}-")""").str ==> "hello"
eval(s"""std.lstripChars("${partyEsc}-hello-${partyEsc}", "${partyEsc}-")""")
.str ==> s"hello-${party}"
eval(s"""std.rstripChars("${partyEsc}-hello-${partyEsc}", "${partyEsc}-")""")
.str ==> s"${party}-hello"
eval(s"""std.stripChars("${eAcuteEsc}-hello-${eAcuteEsc}", "${eAcuteEsc}-")""").str ==> "hello"
}
test("asciiStripCharsHandlesBitsetBoundaries") {
eval("std.stripChars(\"?@hello\\u007f\", \"?@\\u007f\")").toString() ==> """"hello""""
eval("std.stripChars(\"\\u001f_hello_\\u007f\", \"\\u001f_\\u007f\")").toString() ==> """"hello""""
eval("std.stripChars(\"\", \"?@\\u007f\")").toString() ==> """"""""
eval("""std.stripChars("hello", "")""").toString() ==> """"hello""""
}
test("stripCharsForcesCharsBeforeStr") {
assert(
evalErr("""std.stripChars(error "str first", error "chars first")""").startsWith(
"sjsonnet.Error: chars first"
)
)
assert(
evalErr("""std.lstripChars(error "str first", error "chars first")""").startsWith(
"sjsonnet.Error: chars first"
)
)
assert(
evalErr("""std.rstripChars(error "str first", error "chars first")""").startsWith(
"sjsonnet.Error: chars first"
)
)
}

}
}
Loading