Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parent/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ application while protecting against XSS.
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<guava.version>27.1-jre</guava.version>
<guava.version>30.1-jre</guava.version>
</properties>

<build>
Expand Down
2 changes: 1 addition & 1 deletion scripts/build_for_travis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ if [ -n "$IS_LEGACY" ]; then
else
# Build the whole kit-n-kaboodle.
mvn -f aggregate/pom.xml source:jar javadoc:jar verify $COMMON_FLAGS \
&& mvn -Dguava.version=27.1-jre -f aggregate/pom.xml clean source:jar javadoc:jar verify $COMMON_FLAGS \
&& mvn -Dguava.version=30.1-jre -f aggregate/pom.xml clean source:jar javadoc:jar verify $COMMON_FLAGS \
&& mvn jacoco:report coveralls:report \
&& mvn org.sonatype.ossindex.maven:ossindex-maven-plugin:audit -f aggregate $COMMON_FLAGS
fi
219 changes: 116 additions & 103 deletions src/main/java/org/owasp/html/Encoding.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
package org.owasp.html;

import java.io.IOException;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import javax.annotation.Nullable;

/** Encoders and decoders for HTML. */
Expand Down Expand Up @@ -94,6 +96,7 @@ static void stripBannedCodeunits(StringBuilder sb) {
stripBannedCodeunits(sb, 0);
}


@TCB
private static void stripBannedCodeunits(StringBuilder sb, int start) {
int k = start;
Expand All @@ -108,13 +111,16 @@ private static void stripBannedCodeunits(StringBuilder sb, int start) {
if (i+1 < n) {
char next = sb.charAt(i+1);
if (Character.isSurrogatePair(ch, next)) {
sb.setCharAt(k++, ch);
sb.setCharAt(k++, next);
// The last two code points in each plane are non-characters that should be elided.
if ((ch & 0xfc3f) != 0xd83f || (next & 0xfffe) != 0xdffe) {
sb.setCharAt(k++, ch);
sb.setCharAt(k++, next);
}
++i;
}
}
continue;
} else if ((ch & 0xfffe) == 0xfffe) {
} else if ((ch & 0xfffe) == 0xfffe || (0xfdd0 <= ch && ch <= 0xfdef)) {
continue;
}
}
Expand All @@ -139,19 +145,34 @@ private static int longestPrefixOfGoodCodeunits(String s) {
}
} else if (0xd800 <= ch) {
if (ch <= 0xdfff) {
if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) {
++i; // Skip over low surrogate since we know it's ok.
if (i + 1 < n ) {
// could be a surrogate pair
char cn = s.charAt(i+1);
if( Character.isSurrogatePair(ch,cn) ) {
int cp = Character.toCodePoint(ch, cn);
// Could be a non-character
if ((cp & 0xfffe) == 0xfffe) {
// not valid
return i;
}

// skip over trailing surrogate since we know it is OK
i++;
} else {
// not a surrogate pair
return i;
}
} else {
// isolated surrogate at end of string
return i;
}
} else if ((ch & 0xfffe) == 0xfffe) {
} else if ((ch & 0xfffe) == 0xfffe || (0xfdd0 <= ch && ch <= 0xfdef)) {
return i;
}
}
}
return -1;
}

/**
* Appends an encoded form of plainText to output where the encoding is
* sufficient to prevent an HTML parser from interpreting any characters in
Expand Down Expand Up @@ -196,6 +217,7 @@ static void encodePcdataOnto(String plainText, Appendable output)
encodeHtmlOnto(plainText, output, "{<!-- -->");
}


/**
* Appends an encoded form of plainText to putput where the encoding is
* sufficient to prevent an HTML parser from transitioning out of the
Expand Down Expand Up @@ -240,127 +262,94 @@ private static void encodeHtmlOnto(
char ch = plainText.charAt(i);
if (ch < REPLACEMENTS.length) { // Handles all ASCII.
String repl = REPLACEMENTS[ch];
if (ch == '{' && repl == null) {
if (i + 1 == n || plainText.charAt(i + 1) == '{') {
repl = braceReplacement;
if( repl==null ) {
if (ch == '{') {
if (i + 1 == n || plainText.charAt(i + 1) == '{') {
// "{{" detected, so use the brace replacement
repl = braceReplacement;
}
}
if (ch == '\r') {
// If this CR is followed by a LF, just remove it. Otherwise replace it with a LF.
if (i + 1 == n || plainText.charAt(i + 1) != '\n' ) {
// CR not followed by LF, so turn into LF
repl = "\n";
} else {
// CRLF, so remove CR
repl = "";
}
}
}
if (repl != null) {
output.append(plainText, pos, i).append(repl);
pos = i + 1;
}
} else if ((0x93A <= ch && ch <= 0xC4C)
&& (
// Devanagari vowel
ch <= 0x94F
// Benagli vowels
|| 0x985 <= ch && ch <= 0x994
|| 0x9BE <= ch && ch < 0x9CC // 0x9CC (Bengali AU) is ok
|| 0x9E0 <= ch && ch <= 0x9E3
// Telugu vowels
|| 0xC05 <= ch && ch <= 0xC14
|| 0xC3E <= ch && ch != 0xC48 /* 0xC48 (Telugu AI) is ok */)) {
// https://manishearth.github.io/blog/2018/02/15/picking-apart-the-crashing-ios-string/
// > So, ultimately, the full set of cases that cause the crash are:
// > Any sequence <consonant1, virama, consonant2, ZWNJ, vowel>
// > in Devanagari, Bengali, and Telugu, where: ...

// TODO: This is needed as of February 2018, but hopefully not long after that.
// We eliminate the ZWNJ which seems the minimally damaging thing to do to
// Telugu rendering per the article above:
// > a ZWNJ before a vowel doesn’t really do anything for most Indic scripts.

if (pos < i) {
if (plainText.charAt(i - 1) == 0x200C /* ZWNJ */) {
output.append(plainText, pos, i - 1);
// Drop the ZWNJ on the floor.
pos = i;
}
} else if (output instanceof StringBuilder) {
StringBuilder sb = (StringBuilder) output;
int len = sb.length();
if (len != 0) {
if (sb.charAt(len - 1) == 0x200C /* ZWNJ */) {
sb.setLength(len - 1);
}
}
}
} else if (((char) 0xd800) <= ch) {
if (ch <= ((char) 0xdfff)) {
char next;
if (i + 1 < n
&& Character.isSurrogatePair(
ch, next = plainText.charAt(i + 1))) {
// Emit supplemental codepoints as entity so that they cannot
// be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
// and get involved in UTF-16/UCS-2 confusion.
int codepoint = Character.toCodePoint(ch, next);
output.append(plainText, pos, i);
} else if (RISKY_NORMALIZATION.contains(ch)) {
// Application of unicode compatibility normalization produces a risky character.
output.append(plainText, pos, i);
pos = i + 1;
appendNumericEntity(ch,output);
} else if ((ch <= 0x9f) || (0xfdd0 <= ch && ch <= 0xfdef) || ((ch & 0xfffe) == 0xfffe)) {
// Elide C1 escapes and BMP non-characters.
output.append(plainText, pos, i);
pos = i + 1;
} else if (0xd800 <= ch && ch <= 0xdfff) {
// handle surrogates
char next;
if (i + 1 < n && Character.isSurrogatePair(ch, next = plainText.charAt(i + 1))) {
// Emit supplemental codepoints as entity so that they cannot
// be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper
// and get involved in UTF-16/UCS-2 confusion.
int codepoint = Character.toCodePoint(ch, next);
output.append(plainText, pos, i);
// do not append 0xfffe and 0xffff from any plane
if( (codepoint & 0xfffe) != 0xfffe ) {
appendNumericEntity(codepoint, output);
++i;
pos = i + 1;
} else {
output.append(plainText, pos, i);
// Elide the orphaned surrogate.
pos = i + 1;
}
} else if (0xfe60 <= ch) {
// Is a control character or possible full-width version of a
// special character, a BOM, or one of the FE60 block that might
// be elided or normalized to an HTML special character.
// Running
// cat NormalizationText.txt \
// | perl -pe 's/ ?#.*//' \
// | egrep '(;003C(;|$)|003E|0026|0022|0027|0060)'
// dumps a list of code-points that can normalize to HTML special
// characters.
++i;
pos = i + 1;
} else {
output.append(plainText, pos, i);
// Elide the orphaned surrogate.
pos = i + 1;
if ((ch & 0xfffe) == 0xfffe) {
// Elide since not an the XML Character.
} else {
appendNumericEntity(ch, output);
}
}
} else if (ch == '\u1FEF') { // Normalizes to backtick.
output.append(plainText, pos, i).append("&#8175;");
pos = i + 1;
}
}
output.append(plainText, pos, n);
}


/**
* Append a codepoint to the output as a numeric entity.
*
* @param codepoint the codepoint
* @param output the output
*
* @throws IOException if the output cannot be written to
* @throws IllegalArgumentException if the codepoint cannot be represented as a numeric escape.
*/
@TCB
static void appendNumericEntity(int codepoint, Appendable output)
throws IOException {
if (((codepoint <= 0x1f) && (codepoint != 9 && codepoint != 0xa)) || (0x7f <= codepoint && codepoint <= 0x9f)) {
throw new IllegalArgumentException("Illegal numeric escape. Cannot represent control code: " + codepoint);
}
if ((0xfdd0 <= codepoint && codepoint <= 0xfdef) || ((codepoint & 0xfffe) == 0xfffe)) {
throw new IllegalArgumentException("Illegal numeric escape. Cannot represent non-character: " + codepoint);
}

output.append("&#");
if (codepoint < 100) {
// TODO: is this dead code due to REPLACEMENTS above.
if (codepoint < 10) {
output.append((char) ('0' + codepoint));
} else {
output.append((char) ('0' + (codepoint / 10)));
output.append((char) ('0' + (codepoint % 10)));
}
// Below 100, a decimal representation is shortest
output.append(Integer.toString(codepoint));
} else {
int nDigits = (codepoint < 0x1000
? codepoint < 0x100 ? 2 : 3
: (codepoint < 0x10000 ? 4
: codepoint < 0x100000 ? 5 : 6));
// Append a hexadecimal value
output.append('x');
for (int digit = nDigits; --digit >= 0;) {
int hexDigit = (codepoint >>> (digit << 2)) & 0xf;
output.append(HEX_NUMERAL[hexDigit]);
}
output.append(Integer.toHexString(codepoint));
}
output.append(";");
}

private static final char[] HEX_NUMERAL = {
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
};

/** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */
private static final String[] REPLACEMENTS = new String[0x80];
static {
Expand All @@ -385,17 +374,41 @@ static void appendNumericEntity(int codepoint, Appendable output)
REPLACEMENTS['>'] = "&gt;"; // HTML special.
REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation.
REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter.
REPLACEMENTS['\u007f'] = ""; // Elide delete
}

/**
* IS_BANNED_ASCII[i] where is an ASCII control character codepoint (&lt; 0x20)
* is true for control characters that are not allowed in an XML source text.
*/
private static boolean[] IS_BANNED_ASCII = new boolean[0x20];
private static final boolean[] IS_BANNED_ASCII = new boolean[0x20];
static {
for (int i = 0; i < IS_BANNED_ASCII.length; ++i) {
IS_BANNED_ASCII[i] = !(i == '\t' || i == '\n' || i == '\r');
}
}

/** Set of all Unicode characters which when processed with unicode compatibility decomposition will include a non-alphanumeric ascii character. */
static final Set<Character> RISKY_NORMALIZATION;
static {
HashSet<Character> set = new HashSet<Character>();

// These characters all decompose riskily
String singles = "\u037e\u1fef\u203c\u207a\u208a\u2100\u2101\u2105\u2106\u2260\u226e\u226f\u33c2\u33c7\u33d8\ufb29\ufe10\ufe19\ufe30\ufe47\ufe48\ufe52";
for(char ch : singles.toCharArray()) {
set.add(ch);
}

// This string is composed of pairs of characters defining inclusive start and end ranges.
String pairs =
"\u2024\u2026\u2047\u2049\u207c\u207e\u208c\u208e\u2474\u24b5\u2a74\u2a76\u3200\u321e\u3220\u3243\ufe13\ufe16\ufe33"
+ "\ufe38\ufe4d\ufe50\ufe54\ufe57\ufe59\ufe5c\ufe5f\ufe66\ufe68\ufe6b\uff01\uff0f\uff1a\uff20\uff3b\uff40\uff5b\uff5e";
for(int i=0;i<pairs.length();i+=2) {
for(char ch=pairs.charAt(i);ch<=pairs.charAt(i+1);ch++) {
set.add(ch);
}
}

RISKY_NORMALIZATION = Collections.unmodifiableSet(set);
}
}
21 changes: 16 additions & 5 deletions src/main/java/org/owasp/html/HtmlLexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ private HtmlToken parseToken() {
break;
}
}
} else if (!Character.isWhitespace(ch)) {
} else if (!isAsciiWhitespace(ch)) {
type = HtmlTokenType.TEXT;
for (; end < limit; ++end) {
ch = input.charAt(end);
Expand All @@ -538,12 +538,12 @@ private HtmlToken parseToken() {
&& '>' == input.charAt(end + 1)) {
break;
} else if ('>' == ch || '=' == ch
|| Character.isWhitespace(ch)) {
|| isAsciiWhitespace(ch)) {
break;
} else if ('"' == ch || '\'' == ch) {
if (end + 1 < limit) {
char ch2 = input.charAt(end + 1);
if (Character.isWhitespace(ch2)
if (isAsciiWhitespace(ch2)
|| ch2 == '>' || ch2 == '/') {
++end;
break;
Expand All @@ -554,7 +554,7 @@ private HtmlToken parseToken() {
} else {
// We skip whitespace tokens inside tag bodies.
type = HtmlTokenType.IGNORABLE;
while (end < limit && Character.isWhitespace(input.charAt(end))) {
while (end < limit && isAsciiWhitespace(input.charAt(end))) {
++end;
}
}
Expand Down Expand Up @@ -604,7 +604,7 @@ private HtmlToken parseToken() {
ch = input.charAt(end);
switch (state) {
case TAGNAME:
if (Character.isWhitespace(ch)
if (isAsciiWhitespace(ch)
|| '>' == ch || '/' == ch || '<' == ch) {
// End processing of an escape exempt block when we see
// a corresponding end tag.
Expand Down Expand Up @@ -749,6 +749,17 @@ private String canonicalElementName(int start, int end) {
return HtmlLexer.canonicalElementName(input.substring(start, end));
}

/**
* Test if a character is an ASCII whitespace according to the HTML rules. Other Unicode whitespace characters do not count.
*
* @param ch the character to test
*
* @return true if it is one of TAB, LF, FF, CR or SPACE
*/
private static boolean isAsciiWhitespace(int ch) {
return (ch == ' ') || (ch == '\t') || (ch == '\n') || (ch == '\r') || (ch == '\f');
}

private static boolean isIdentStart(char ch) {
return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a');
}
Expand Down
Loading