html2text/html2text.go at main · levinishka/html2text · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
package html2text

import (
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// Separators that HTML2Text writes into its plain-text output.
const (
	// newLine is the line break written to the output by HTML2Text's
	// newline helpers (e.g. for <br>, <li>, <p> and heading tags).
	newLine   = "\n"
	// spaceRune is the single space HTML2Text inserts between two pieces of
	// text; it is also what the first/last byte of a text is compared against
	// to decide whether such a space is needed.
	spaceRune = ' '
)

// HTML2Text extracts the human-readable text from htmlString.
//
// The input is scanned with the streaming tokenizer (no HTML tree is built)
// and text tokens are copied to the result with these adjustments:
//   - everything inside <head>, <script> and <style> is dropped;
//   - <br> and <li> (start or self-closing tag) and </ul> always emit a
//     newline;
//   - <p> and <h1>..<h6> (start or end tag) emit a newline only if text has
//     been written since the previous such conditional newline;
//   - whitespace-only text tokens are not copied; instead a single space is
//     inserted before the next text when the previous text did not end with a
//     space and the next one does not start with one;
//   - the raw content of <noscript> is converted by a recursive call.
//
// Scanning stops at the first tokenizer error (which includes end of input)
// and whatever text was collected up to that point is returned.
func HTML2Text(htmlString string) string {
	var (
		// canPrintNewline is true once text was written after the last
		// conditional newline; it prevents many new lines in a row.
		canPrintNewline bool
		// isSpaceNeeded is true when the last thing written was text that
		// did not end with a space; it prevents many spaces in a row.
		isSpaceNeeded bool
		// wasSpace is true when a whitespace-only text token was seen since
		// the last written text, i.e. there was some space between tags.
		wasSpace bool
		// skipTags counts the currently open unwanted tags.
		skipTags int
	)

	// Use the tokenizer, not the parser: it is faster and no tree is needed.
	tokenizer := html.NewTokenizer(strings.NewReader(htmlString))
	var clearString strings.Builder
	clearString.Grow(len(htmlString))

	// tagAtom returns the atom of the current tag token. TagName is used
	// instead of Token so the tag's attributes are not parsed and allocated
	// just to find out which tag this is.
	tagAtom := func() atom.Atom {
		name, _ := tokenizer.TagName()
		return atom.Lookup(name)
	}

	// writeString appends text to the result; whitespace-only text is only
	// remembered so that a separating space can be added later.
	writeString := func(text string) {
		if len(strings.TrimSpace(text)) == 0 {
			wasSpace = true
			return
		}
		if isSpaceNeeded && wasSpace && text[0] != spaceRune {
			clearString.WriteRune(spaceRune)
		}
		clearString.WriteString(text)
		canPrintNewline = true
		isSpaceNeeded = text[len(text)-1] != spaceRune
		wasSpace = false
	}

	// writeNewLine writes a new line without conditions, e.g. because of <br>.
	writeNewLine := func() {
		if skipTags == 0 {
			clearString.WriteString(newLine)
			isSpaceNeeded = false
		}
	}

	// writeNewLineConditional writes a new line only if text was written
	// since the previous conditional new line.
	writeNewLineConditional := func() {
		if skipTags == 0 && canPrintNewline {
			clearString.WriteString(newLine)
			canPrintNewline = false
			isSpaceNeeded = false
		}
	}

	for tokenType := tokenizer.Next(); tokenType != html.ErrorToken; tokenType = tokenizer.Next() {
		switch tokenType {
		case html.TextToken:
			// skipTags is checked here rather than in writeString in order
			// to avoid unnecessary tokenizer operations for skipped text.
			if skipTags == 0 {
				writeString(tokenizer.Token().Data)
			}
		case html.StartTagToken:
			switch tagAtom() {
			case atom.Br, atom.Li:
				writeNewLine()
			case atom.P, atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
				writeNewLineConditional()
			case atom.Noscript:
				// The tokenizer returns the whole <noscript> content as one
				// raw text token, so it is tokenized again recursively.
				// An empty <noscript></noscript> (or one cut off by end of
				// input) yields no text token: the token consumed here is
				// then the end tag or the error token, and treating its data
				// as text would emit the tag name, so it is ignored.
				// skipTags is checked here to avoid unnecessary recursion.
				if tokenizer.Next() == html.TextToken && skipTags == 0 {
					writeString(HTML2Text(tokenizer.Token().Data))
				}
			// content of these tags is unwanted, so skip it
			case atom.Head, atom.Script, atom.Style:
				skipTags++
			}
		case html.EndTagToken:
			switch tagAtom() {
			case atom.Ul:
				writeNewLine()
			case atom.P, atom.H1, atom.H2, atom.H3, atom.H4, atom.H5, atom.H6:
				writeNewLineConditional()
			// end of unwanted tags
			case atom.Head, atom.Script, atom.Style:
				// A stray end tag without a matching start tag must not push
				// the counter below zero, otherwise all following text would
				// be skipped.
				if skipTags > 0 {
					skipTags--
				}
			}
		case html.SelfClosingTagToken:
			switch tagAtom() {
			case atom.Br, atom.Li:
				writeNewLine()
			}
		}
	}

	return clearString.String()
}