gparselib/simple_parser.go at master · flowdev/gparselib · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
package gparselib

import (
	"errors"
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

// ParseLiteral checks whether the input continues with the exact string
// cfgLiteral at the current position of the parser.
// If it does, createMatchedResult is called with the byte length of the
// literal; otherwise createUnmatchedResult records the message
// "Literal '<cfgLiteral>' expected".
// Either way the outcome of handleSemantics for pluginSemantics is returned.
func ParseLiteral(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
	cfgLiteral string,
) (*ParseData, interface{}) {
	start := pd.Source.pos
	end := start + len(cfgLiteral)

	// The length test comes first so the slice expression can never run past
	// the end of the input.
	if end > len(pd.Source.content) || pd.Source.content[start:end] != cfgLiteral {
		createUnmatchedResult(pd, 0, "Literal '"+cfgLiteral+"' expected", nil)
		return handleSemantics(pluginSemantics, pd, ctx)
	}

	createMatchedResult(pd, len(cfgLiteral))
	return handleSemantics(pluginSemantics, pd, ctx)
}

// NewParseLiteralPlugin returns a SubparserOp that runs ParseLiteral with the
// given semantics and the literal bound at creation time.
func NewParseLiteralPlugin(pluginSemantics SemanticsOp, cfgLiteral string) SubparserOp {
	var plugin SubparserOp = func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		return ParseLiteral(pd, ctx, pluginSemantics, cfgLiteral)
	}
	return plugin
}

// ParseIdent parses an identifier at the current position of the parser.
// It allows Unicode letters for the first character and Unicode letters
// and Unicode numbers for all following characters.
// cfgFirstChar lists additional characters accepted in first position and
// cfgFollowingChars lists additional characters accepted in every later
// position.
// If not even one character qualifies, an unmatched result with the message
// "Identifier expected" is created.
func ParseIdent(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
	cfgFirstChar, cfgFollowingChars string,
) (*ParseData, interface{}) {
	rest := pd.Source.content[pd.Source.pos:]
	matched := 0 // number of bytes accepted so far

	for {
		r, width := utf8.DecodeRuneInString(rest)
		if r == utf8.RuneError {
			// Input is exhausted or can't be decoded any further.
			break
		}

		var ok bool
		switch {
		case unicode.IsLetter(r):
			// Letters are fine in every position.
			ok = true
		case matched == 0:
			// First position: only the extra configured characters remain.
			ok = strings.ContainsRune(cfgFirstChar, r)
		default:
			// Later positions: numbers or the extra configured characters.
			ok = unicode.IsNumber(r) || strings.ContainsRune(cfgFollowingChars, r)
		}
		if !ok {
			break
		}

		matched += width
		rest = rest[width:]
	}

	if matched == 0 {
		createUnmatchedResult(pd, 0, "Identifier expected", nil)
	} else {
		createMatchedResult(pd, matched)
	}
	return handleSemantics(pluginSemantics, pd, ctx)
}

// NewParseIdentPlugin returns a SubparserOp that runs ParseIdent with the
// given semantics and the two sets of additionally allowed characters.
func NewParseIdentPlugin(pluginSemantics SemanticsOp, cfgFirstChar, cfgFollowingChars string) SubparserOp {
	var plugin SubparserOp = func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		return ParseIdent(pd, ctx, pluginSemantics, cfgFirstChar, cfgFollowingChars)
	}
	return plugin
}

// allDigits holds the digit characters for the largest supported radix (36)
// in ascending value order. ParseNatural takes the first cfgRadix characters
// of it as the set of accepted digits.
const allDigits = "0123456789abcdefghijklmnopqrstuvwxyz"

// ParseNatural parses a natural number at the current position of the parser.
// The configuration has to be the radix of accepted numbers (e.g.: 10).
// Digits above 9 are the ASCII letters 'a' to 'z' in either case.
//
// If the radix is smaller than 2 or larger than 36 an error is returned
// together with nil results. In all other cases the returned error is nil.
//
// On success the value is stored as uint64 in pd.Result.Value. If no digit is
// found, or the digits don't fit into 64 bits, an unmatched result with the
// message "Natural number expected" is created.
func ParseNatural(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
	cfgRadix int,
) (*ParseData, interface{}, error) {
	if cfgRadix < 2 || cfgRadix > 36 {
		return nil, nil,
			&parseError{
				where: "",
				myErr: fmt.Sprintf(
					"The radix has to be between 2 and 36, but is: %d",
					cfgRadix,
				),
				baseErr: nil,
			}
	}
	cfgDigits := allDigits[:cfgRadix]
	substr := pd.Source.content[pd.Source.pos:]

	// Scan bytes instead of runes and fold case for ASCII only. Unicode case
	// folding would map multi-byte runes like U+212A (KELVIN SIGN) to ASCII
	// letters, which strconv.ParseUint doesn't understand and which would
	// make the byte count below wrong. Bytes of multi-byte runes are never
	// contained in cfgDigits, so they end the number.
	n := 0
	for n < len(substr) {
		c := substr[n]
		if 'A' <= c && c <= 'Z' {
			c += 'a' - 'A'
		}
		if strings.IndexByte(cfgDigits, c) < 0 {
			break
		}
		n++
	}

	if n == 0 {
		createUnmatchedResult(pd, 0, "Natural number expected", nil)
	} else if val, err := strconv.ParseUint(substr[:n], cfgRadix, 64); err != nil {
		// Only valid digits are handed over, so this is a range error.
		createUnmatchedResult(pd, 0, "Natural number expected", err)
	} else {
		createMatchedResult(pd, n)
		pd.Result.Value = val
	}
	pd, ctx = handleSemantics(pluginSemantics, pd, ctx)
	return pd, ctx, nil
}

// NewParseNaturalPlugin creates a plugin sporting a natural number parser for
// the given radix.
// The radix is validated up front by a trial call of ParseNatural on empty
// parse data; its error, if any, is returned instead of a plugin.
func NewParseNaturalPlugin(pluginSemantics SemanticsOp, cfgRadix int) (SubparserOp, error) {
	probe := &ParseData{Source: SourceData{}}
	if _, _, err := ParseNatural(probe, nil, nil, cfgRadix); err != nil {
		return nil, err
	}

	return func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		// ParseNatural only fails for an invalid radix and that has been
		// ruled out above, so the error can be dropped safely.
		res, resCtx, _ := ParseNatural(pd, ctx, pluginSemantics, cfgRadix)
		return res, resCtx
	}, nil
}

// ParseEOF only matches at the end of the input.
// A match is always empty. If input is left, the unmatched result reports the
// number of remaining bytes.
func ParseEOF(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
) (*ParseData, interface{}) {
	remaining := len(pd.Source.content) - pd.Source.pos

	if remaining <= 0 {
		createMatchedResult(pd, 0)
		return handleSemantics(pluginSemantics, pd, ctx)
	}

	msg := fmt.Sprintf("Expecting end of input but still got %d bytes", remaining)
	createUnmatchedResult(pd, 0, msg, nil)
	return handleSemantics(pluginSemantics, pd, ctx)
}

// NewParseEOFPlugin returns a SubparserOp that runs ParseEOF with the given
// semantics.
func NewParseEOFPlugin(pluginSemantics SemanticsOp) SubparserOp {
	var plugin SubparserOp = func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		return ParseEOF(pd, ctx, pluginSemantics)
	}
	return plugin
}

// ParseSpace parses one or more space characters.
// Space is defined by unicode.IsSpace().
// cfgEOLOK decides whether EOL ('\n') is interpreted as space or ends the
// run of space characters.
// If no space is found, an unmatched result with the message
// "Expecting white space" is created.
func ParseSpace(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
	cfgEOLOK bool,
) (*ParseData, interface{}) {
	rest := pd.Source.content[pd.Source.pos:]
	consumed := 0 // number of bytes of white space found so far

	for {
		r, width := utf8.DecodeRuneInString(rest[consumed:])
		// Stop at the end of the input, at undecodable bytes, at anything
		// that isn't space and at a newline that doesn't count as space.
		if r == utf8.RuneError || !unicode.IsSpace(r) || (r == '\n' && !cfgEOLOK) {
			break
		}
		consumed += width
	}

	if consumed == 0 {
		createUnmatchedResult(pd, 0, "Expecting white space", nil)
	} else {
		createMatchedResult(pd, consumed)
	}
	return handleSemantics(pluginSemantics, pd, ctx)
}

// NewParseSpacePlugin returns a SubparserOp that runs ParseSpace with the
// given semantics and newline handling.
func NewParseSpacePlugin(pluginSemantics SemanticsOp, cfgEOLOK bool) SubparserOp {
	var plugin SubparserOp = func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		return ParseSpace(pd, ctx, pluginSemantics, cfgEOLOK)
	}
	return plugin
}

// RegexpParser parses text according to a predefined regular expression.
// It is a compiled regexp.Regexp under a different name and should be created
// with NewRegexpParser, which configures the regular expression
// (e.g.: `^[a-z]+`).
type RegexpParser regexp.Regexp

// NewRegexpParser creates a new parser for the given regular expression.
// If the regular expression doesn't start with a `^` it will be added
// automatically. This is true for the empty expression, too.
// If the regular expression can't be compiled, nil and the error of
// regexp.Compile are returned.
func NewRegexpParser(cfgRegexp string) (*RegexpParser, error) {
	// HasPrefix instead of indexing so an empty expression can't panic.
	if !strings.HasPrefix(cfgRegexp, "^") {
		cfgRegexp = "^" + cfgRegexp
	}
	re, err := regexp.Compile(cfgRegexp)
	if err != nil {
		return nil, err
	}
	return (*RegexpParser)(re), nil
}

// ParseRegexp is the input port of the RegexpParser operation.
// It matches the regular expression against the input beginning at the
// current position of the parser. On success the matched text is stored in
// pd.Result.Value, too. Otherwise an unmatched result naming the regular
// expression is created.
func (pr *RegexpParser) ParseRegexp(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
) (*ParseData, interface{}) {
	re := (*regexp.Regexp)(pr)
	substr := pd.Source.content[pd.Source.pos:]

	// The leading `^` binds tighter than `|`, so for an expression like
	// `^a|b` the second alternative isn't anchored and a match can start
	// anywhere. Only a match that starts right at the current position is a
	// real match; the skipped text would be reported as matched otherwise.
	if match := re.FindStringIndex(substr); match != nil && match[0] == 0 {
		createMatchedResult(pd, match[1])
		pd.Result.Value = pd.Result.Text
	} else {
		createUnmatchedResult(
			pd,
			0,
			// Show the expression without the leading anchor.
			"Expecting match for regexp `"+strings.TrimPrefix(re.String(), "^")+"`",
			nil,
		)
	}
	return handleSemantics(pluginSemantics, pd, ctx)
}

// NewParseRegexpPlugin creates a plugin sporting a regular expression parser.
// The regular expression is compiled once by NewRegexpParser. If that fails,
// its error is returned instead of a plugin.
func NewParseRegexpPlugin(
	pluginSemantics SemanticsOp,
	cfgRegexp string,
) (SubparserOp, error) {
	parser, err := NewRegexpParser(cfgRegexp)
	if err != nil {
		return nil, err
	}

	var plugin SubparserOp = func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		return parser.ParseRegexp(pd, ctx, pluginSemantics)
	}
	return plugin, nil
}

// ParseLineComment parses a comment until the end of the line.
// The string that starts the comment (e.g.: `//`) has to be configured.
// If the start of the comment is empty an error is returned together with
// nil results. In all other cases the returned error is nil.
// The match ends before the newline or at the end of the input if there is
// no newline. pd.Result.Value is set to the empty string for a match.
func ParseLineComment(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
	cfgStart string,
) (*ParseData, interface{}, error) {
	if len(cfgStart) == 0 {
		return nil, nil,
			errors.New(
				"expected start of line comment as config, got empty string",
			)
	}

	rest := pd.Source.content[pd.Source.pos:]
	if !strings.HasPrefix(rest, cfgStart) {
		createUnmatchedResult(pd, 0, "Expecting line comment", nil)
		pd, ctx = handleSemantics(pluginSemantics, pd, ctx)
		return pd, ctx, nil
	}

	// Without a newline the comment takes everything that is left.
	size := len(rest)
	if eol := strings.IndexByte(rest[len(cfgStart):], '\n'); eol >= 0 {
		size = len(cfgStart) + eol
	}
	createMatchedResult(pd, size)
	pd.Result.Value = ""

	pd, ctx = handleSemantics(pluginSemantics, pd, ctx)
	return pd, ctx, nil
}

// NewParseLineCommentPlugin creates a plugin sporting a line comment parser.
// The comment start is validated up front by a trial call of
// ParseLineComment on empty parse data; its error, if any, is returned
// instead of a plugin.
func NewParseLineCommentPlugin(
	pluginSemantics SemanticsOp,
	cfgStart string,
) (SubparserOp, error) {
	probe := &ParseData{Source: SourceData{}}
	if _, _, err := ParseLineComment(probe, nil, nil, cfgStart); err != nil {
		return nil, err
	}

	return func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		// ParseLineComment only fails for an empty comment start and that
		// has been ruled out above, so the error can be dropped safely.
		res, resCtx, _ := ParseLineComment(pd, ctx, pluginSemantics, cfgStart)
		return res, resCtx
	}, nil
}

// ParseBlockComment parses a block comment at the current position of the
// parser.
// The strings that start and end the comment (e.g.: `/*`, `*/`)
// have to be configured.
// A comment end inside a string literal (', " and `) is ignored. Inside of
// ' and " literals a backslash escapes the following character.
// If the start or end of the comment is empty an error is returned together
// with nil results. In all other cases the returned error is nil.
// pd.Result.Value is set to the empty string for a match.
func ParseBlockComment(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
	cfgStart, cfgEnd string,
) (*ParseData, interface{}, error) {
	switch {
	case cfgStart == "":
		return nil, nil,
			errors.New(
				"expected start of block comment as config, got empty string",
			)
	case cfgEnd == "":
		return nil, nil,
			errors.New(
				"expected end of block comment as config, got empty string",
			)
	}

	startLen, endLen := len(cfgStart), len(cfgEnd)
	src := pd.Source.content
	from := pd.Source.pos
	// head is shorter than cfgStart if the input ends too early.
	head := src[from:min(from+startLen, len(src))]

	if head != cfgStart {
		createUnmatchedResult(
			pd,
			0,
			fmt.Sprintf(
				"Expecting block comment starting with '%s', got '%s'",
				cfgStart,
				head),
			nil,
		)
		pd, ctx = handleSemantics(pluginSemantics, pd, ctx)
		return pd, ctx, nil
	}

	const noQuote = ' ' // marks that we aren't inside of a string literal
	quote := noQuote    // the quote character of the current string literal
	escaped := false    // true directly after a backslash in a ' or " literal
	closedAt := -1      // offset in body just behind cfgEnd, -1: not found
	endRune, _ := utf8.DecodeRuneInString(cfgEnd)
	body := src[from+startLen:]

	for i, r := range body {
		if escaped {
			// The escaped character can't end anything.
			escaped = false
			continue
		}
		if quote == '\'' || quote == '"' {
			if r == '\\' {
				escaped = true
			} else if r == quote {
				quote = noQuote
			}
			continue
		}
		if quote == '`' {
			// Raw string literals know no escapes.
			if r == '`' {
				quote = noQuote
			}
			continue
		}
		if r == '\'' || r == '"' || r == '`' {
			quote = r
			continue
		}
		if r != endRune {
			continue
		}
		// Cheap check of the first rune passed, now compare all of cfgEnd.
		if end := i + endLen; end <= len(body) && body[i:end] == cfgEnd {
			closedAt = end
			break
		}
	}

	if closedAt < 0 {
		createUnmatchedResult(
			pd,
			startLen,
			fmt.Sprintf("Block comment isn't closed with '%s'", cfgEnd),
			nil,
		)
		pd.Source.pos += startLen
	} else {
		createMatchedResult(pd, startLen+closedAt)
		pd.Result.Value = ""
	}
	pd, ctx = handleSemantics(pluginSemantics, pd, ctx)
	return pd, ctx, nil
}

// NewParseBlockCommentPlugin creates a plugin sporting a block comment parser.
// The comment start and end are validated up front by a trial call of
// ParseBlockComment on empty parse data; its error, if any, is returned
// instead of a plugin.
func NewParseBlockCommentPlugin(
	pluginSemantics SemanticsOp,
	cfgStart, cfgEnd string,
) (SubparserOp, error) {
	probe := &ParseData{Source: SourceData{}}
	if _, _, err := ParseBlockComment(probe, nil, nil, cfgStart, cfgEnd); err != nil {
		return nil, err
	}

	return func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		// ParseBlockComment only fails for an empty comment start or end
		// and that has been ruled out above, so the error can be dropped.
		res, resCtx, _ := ParseBlockComment(pd, ctx, pluginSemantics, cfgStart, cfgEnd)
		return res, resCtx
	}, nil
}

// ParseGoodRunes parses as long as the runes are accepted by the configured
// function cfgAccept.
// Parsing stops at the first rune that isn't accepted, at the end of the
// input and at bytes that aren't valid UTF-8. A correctly encoded U+FFFD
// (replacement character) is a normal rune and is handed to cfgAccept like
// any other.
// If no good rune is found, an unmatched result with the message
// "Acceptable runes expected" is created.
func ParseGoodRunes(
	pd *ParseData, ctx interface{},
	pluginSemantics SemanticsOp,
	cfgAccept func(rune) bool,
) (*ParseData, interface{}) {
	var n int
	substr := pd.Source.content[pd.Source.pos:]

	for {
		r, size := utf8.DecodeRuneInString(substr)
		// size 0: input is exhausted; RuneError with size 1: invalid
		// encoding. RuneError with a size of 3 is a real U+FFFD from the
		// input and has to be offered to cfgAccept.
		if size == 0 || (r == utf8.RuneError && size == 1) {
			break
		}
		if !cfgAccept(r) {
			break
		}
		n += size
		substr = substr[size:]
	}

	if n > 0 {
		createMatchedResult(pd, n)
	} else {
		createUnmatchedResult(pd, 0, "Acceptable runes expected", nil)
	}
	return handleSemantics(pluginSemantics, pd, ctx)
}

// NewParseGoodRunesPlugin returns a SubparserOp that runs ParseGoodRunes with
// the given semantics and rune acceptance function.
func NewParseGoodRunesPlugin(pluginSemantics SemanticsOp, cfgAccept func(rune) bool) SubparserOp {
	var plugin SubparserOp = func(pd *ParseData, ctx interface{}) (*ParseData, interface{}) {
		return ParseGoodRunes(pd, ctx, pluginSemantics, cfgAccept)
	}
	return plugin
}