ghmoon/ansi2gfm.py at main · kernelkit/ghmoon · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/usr/bin/env python3
"""
Convert ANSI escape sequences to GitHub Flavored Markdown compatible markup.

GFM supports limited HTML tags within <pre> blocks:
- <b>, <strong> for bold
- <i>, <em> for italics
- <mark> for highlighting
- <del> for strikethrough
- <ins> for underline

Since GFM doesn't support color styling, colors are mapped to semantic markup
instead; selected colored lines are additionally emitted as <h1>-<h3> headings.
"""

import re
import sys


# Lookup table of the ANSI color/style sequences we care about, mapped to the
# GFM-compatible tag that stands in for each one. An empty string means the
# sequence is simply dropped. NOTE: keys omit the leading ESC (\x1b) byte.
ANSI_PATTERNS = {}

# Reset
ANSI_PATTERNS.update(dict.fromkeys(('[0m', '[00m'), ''))

# Text styles (preservable in GFM); zero-padded spellings share a tag
ANSI_PATTERNS.update([
    ('[1m', '<b>'), ('[01m', '<b>'),        # Bold
    ('[21m', '</b>'),                       # Bold off
    ('[22m', '</b>'),                       # Normal intensity
    ('[2m', '<i>'), ('[02m', '<i>'),        # Dim (italic as approximation)
    ('[3m', '<i>'), ('[03m', '<i>'),        # Italic
    ('[23m', '</i>'),                       # Italic off
    ('[4m', '<ins>'), ('[04m', '<ins>'),    # Underline
    ('[24m', '</ins>'),                     # Underline off
    ('[9m', '<del>'), ('[09m', '<del>'),    # Strikethrough
    ('[29m', '</del>'),                     # Strikethrough off
])

# Foreground colors, normal (30-37: black..white) then bright (90-97):
# stripped (emojis could be added later for semantic meaning)
ANSI_PATTERNS.update(
    ('[%dm' % code, '') for code in (*range(30, 38), *range(90, 98))
)

# Background colors, normal (40-47) then bright (100-107): any background
# becomes <mark>
ANSI_PATTERNS.update(
    ('[%dm' % code, '<mark>') for code in (*range(40, 48), *range(100, 108))
)

# Combined codes (common patterns)
ANSI_PATTERNS['[37;44m'] = '<mark><b>'      # White on blue (highlight)
# Bold red / green / yellow -> just bold
ANSI_PATTERNS.update(dict.fromkeys(('[1;31m', '[1;32m', '[1;33m'), '<b>'))


# Any SGR ("Select Graphic Rendition") sequence: ESC [ <params> m.
_SGR_RE = re.compile(r'\x1b\[([0-9;]*)m')
# Tells a skip marker ("s 0076 ...") apart from other yellow text.
_ASCII_LETTER_RE = re.compile(r'[a-zA-Z]')

# Line prefixes that select the heading / skip-line handling.
_H1_PREFIX = '\x1b[93m'
_YELLOW_AFTER_RESET = '\x1b[0m\x1b[93m'

# SGR parameter strings that reset all attributes (an empty parameter
# defaults to 0).
_RESET_PARAMS = frozenset(('', '0', '00'))

# Color code -> tags opened for it, outermost first.
_COLOR_TAGS = {
    '92': ('mark', 'b'),     # green = pass (highlighted)
    '91': ('b',),            # red = fail (bold)
    '94': ('h3',),           # blue = new test/heading
    '37;44': ('mark', 'b'),  # white on blue (mark + bold)
}


def _strip_sgr(text):
    """Return *text* with every SGR escape sequence removed."""
    return _SGR_RE.sub('', text)


def _close_tags(open_tags):
    """Return the closing markup for *open_tags*, innermost tag first."""
    return ''.join('</%s>' % tag for tag in reversed(open_tags))


def _render_inline(line):
    """Convert the SGR codes of one ordinary line into balanced tags."""
    out = []
    open_tags = ()
    pos = 0
    for match in _SGR_RE.finditer(line):
        out.append(line[pos:match.start()])
        pos = match.end()
        params = match.group(1)

        if params in _RESET_PARAMS:
            new_tags = ()
        elif params == '93':
            # Yellow directly followed by a letter is a skip marker in the
            # summary ("s 0076 ..."); any other yellow text is just bold.
            if _ASCII_LETTER_RE.match(line, pos):
                new_tags = ('del',)
            else:
                new_tags = ('b',)
        elif params in _COLOR_TAGS:
            new_tags = _COLOR_TAGS[params]
        else:
            # Dim ([2m) and every code without a GFM equivalent is dropped.
            continue

        if new_tags == open_tags:
            continue
        # A new color replaces the previous one, so close before opening;
        # this keeps the markup balanced however many spans a line has.
        out.append(_close_tags(open_tags))
        out.extend('<%s>' % tag for tag in new_tags)
        open_tags = new_tags

    out.append(line[pos:])
    # Tag state never crosses a line break, so whatever is still open has
    # to be closed here; a reset leading the next line is then a no-op.
    out.append(_close_tags(open_tags))
    return ''.join(out)


def convert_ansi_to_gfm(text):
    """Convert ANSI escape sequences to GFM-compatible HTML tags with semantic meaning.

    Lines are handled independently, using these rules in order:

    * A line starting with ``ESC[93m`` becomes an ``<h1>`` heading.
    * A line starting with ``ESC[0mESC[93m`` that carries text becomes a
      ``<del>`` (skip) line. If it carries no text, the following line is
      consumed as well and emitted as ``<del>`` when it looks like a summary
      marker (``"s 0076 ..."``: a letter, a space, more text), otherwise as
      ``<h2>``. If there is no usable following line, an empty line is
      emitted.
    * On any other line, color codes are mapped to tags (92 -> mark+bold,
      91 -> bold, 93 -> del/bold, 94 -> h3, 37;44 -> mark+bold), a reset
      closes whatever is open, and all other SGR codes are removed.

    Every emitted line has balanced tags and contains no SGR sequences.
    Literal text is passed through as-is; it is not HTML-escaped.

    Args:
        text: Input text, possibly containing ANSI SGR escape sequences.

    Returns:
        The converted text, with lines joined by ``'\\n'``.
    """
    lines = text.split('\n')
    clean_lines = []

    i = 0
    while i < len(lines):
        line = lines[i]

        # [93m at line start = <h1> (9PM main heading)
        if line.startswith(_H1_PREFIX):
            heading = _strip_sgr(line[len(_H1_PREFIX):])
            clean_lines.append('<h1>' + heading + '</h1>')
            i += 1
            continue

        # [0m[93m at line start = skip line, or a heading on the next line
        if line.startswith(_YELLOW_AFTER_RESET):
            content = _strip_sgr(line[len(_YELLOW_AFTER_RESET):])

            if content.strip():
                # Has content - this is a skip line, use <del>
                clean_lines.append('<del>' + content + '</del>')
                i += 1
                continue

            # Nothing but codes/whitespace: the text may be on the next line.
            if i + 1 < len(lines):
                following = lines[i + 1].strip()
                if following and not following.startswith('\x1b['):
                    # Escape codes inside or after the text must not leak
                    # into the generated markup.
                    following = _strip_sgr(following).strip()
                    is_marker = (
                        len(following) > 2
                        and following[0].isalpha()
                        and following[1] == ' '
                    )
                    tag = 'del' if is_marker else 'h2'
                    clean_lines.append('<%s>%s</%s>' % (tag, following, tag))
                    i += 2  # Consumed both this line and the next one
                    continue

            # No usable text on the next line: emit an empty line
            clean_lines.append('')
            i += 1
            continue

        clean_lines.append(_render_inline(line))
        i += 1

    return '\n'.join(clean_lines)


def main():
    """Command-line entry point.

    Reads the whole input (positional file argument, or stdin when omitted),
    runs it through convert_ansi_to_gfm() and writes the result to the file
    given with -o/--output, or to stdout when omitted.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description='Convert ANSI escape sequences to GFM-compatible markup',
    )
    # Optional positional source; argparse opens the file for reading.
    cli.add_argument('input', nargs='?', type=argparse.FileType('r'),
                     default=sys.stdin, help='Input file (default: stdin)')
    # Optional destination; argparse opens the file for writing.
    cli.add_argument('-o', '--output', type=argparse.FileType('w'),
                     default=sys.stdout, help='Output file (default: stdout)')
    options = cli.parse_args()

    raw_text = options.input.read()
    options.output.write(convert_ansi_to_gfm(raw_text))


# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()