-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathhtml2text.go
More file actions
119 lines (108 loc) · 3.11 KB
/
html2text.go
File metadata and controls
119 lines (108 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
package html2text
import (
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
const (
// newLine is the line separator written to the output in place of
// line-breaking tags.
newLine = "\n"
// spaceRune is the single space used to separate adjacent text chunks. It is
// left untyped so it can be compared against both bytes and runes.
spaceRune = ' '
)
// HTML2Text extracts the visible text from htmlString and returns it as plain
// text.
//
// The input is scanned token by token (no DOM tree is built):
//   - text tokens are copied to the output; whitespace-only text seen between
//     two chunks results in at most one separating space;
//   - <br> and <li> (start or self-closing) and </ul> always emit a new line;
//   - <p> and <h1>..<h6> (start or end) emit a new line only if some text was
//     written since the previous conditional new line;
//   - everything inside <head>, <script> and <style> is dropped;
//   - the content of <noscript> is converted by a recursive call.
//
// Malformed input never causes an error; the function returns whatever text
// could be extracted before the tokenizer stopped.
func HTML2Text(htmlString string) string {
	var (
		// canPrintNewline prevents many conditional new lines in a row.
		canPrintNewline bool
		// isSpaceNeeded prevents many spaces in a row.
		isSpaceNeeded bool
		// wasSpace tells whether whitespace-only text was seen since the last
		// written chunk.
		wasSpace bool
		// skipTags counts the currently open unwanted tags; text is written
		// only while it is zero.
		skipTags int
	)

	// Use the tokenizer, not the parser: it is faster and no HTML tree is needed.
	tokenizer := html.NewTokenizer(strings.NewReader(htmlString))

	var clearString strings.Builder
	clearString.Grow(len(htmlString))

	// writeString appends text to the output, inserting one separating space
	// when whitespace was seen since the previous chunk. Whitespace-only text
	// is not written, only remembered.
	writeString := func(text string) {
		if strings.TrimSpace(text) == "" {
			wasSpace = true
			return
		}
		if isSpaceNeeded && wasSpace && text[0] != spaceRune {
			clearString.WriteRune(spaceRune)
		}
		clearString.WriteString(text)
		canPrintNewline = true
		isSpaceNeeded = text[len(text)-1] != spaceRune
		wasSpace = false
	}

	// writeNewLine writes a new line unconditionally (e.g. for <br>), unless
	// the current position is inside a skipped tag.
	writeNewLine := func() {
		if skipTags == 0 {
			clearString.WriteString(newLine)
			isSpaceNeeded = false
		}
	}

	// writeNewLineConditional writes a new line only if text was written since
	// the last conditional new line.
	writeNewLineConditional := func() {
		if skipTags == 0 && canPrintNewline {
			clearString.WriteString(newLine)
			canPrintNewline = false
			isSpaceNeeded = false
		}
	}

	// tagAtom returns the atom of the current tag token. Only the tag name is
	// read, so attributes are neither parsed nor allocated.
	tagAtom := func() atom.Atom {
		name, _ := tokenizer.TagName()
		return atom.Lookup(name)
	}

	tokenType := tokenizer.Next()
	for tokenType != html.ErrorToken {
		switch tokenType {
		case html.TextToken:
			// The skipTags check stays here (not in writeString) so that the
			// text of skipped tokens is never materialized.
			if skipTags == 0 {
				writeString(string(tokenizer.Text()))
			}

		case html.StartTagToken:
			switch tagAtom() {
			case atom.Br, atom.Li:
				writeNewLine()
			case atom.P, atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
				writeNewLineConditional()
			case atom.Noscript:
				// golang.org/x/net/html reports everything inside <noscript>
				// as a single text token, so that text has to be tokenized
				// once more. The following token is noscript content only if
				// it really is a text token; anything else (the closing tag of
				// an empty <noscript>, end of input) is handed back to the
				// main loop instead of being mistaken for content.
				tokenType = tokenizer.Next()
				if tokenType != html.TextToken {
					continue
				}
				// The skipTags check stays here to avoid needless recursion.
				if skipTags == 0 {
					writeString(HTML2Text(string(tokenizer.Text())))
				}
			case atom.Head, atom.Script, atom.Style:
				// Content of these tags is not wanted, skip it.
				skipTags++
			}

		case html.EndTagToken:
			switch tagAtom() {
			case atom.Ul:
				writeNewLine()
			case atom.P, atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
				writeNewLineConditional()
			case atom.Head, atom.Script, atom.Style:
				// End of an unwanted tag. A stray end tag without a matching
				// start tag must not drive the counter negative, otherwise all
				// following text would be dropped.
				if skipTags > 0 {
					skipTags--
				}
			}

		case html.SelfClosingTagToken:
			switch tagAtom() {
			case atom.Br, atom.Li:
				writeNewLine()
			}
		}

		tokenType = tokenizer.Next()
	}

	return clearString.String()
}