repos / gbc

GBC - Go B Compiler
git clone https://github.com/xplshn/gbc.git

gbc / examples
xplshn  ·  2025-08-13

sax-xml-parser.b

B
// SAX XML Parser
//
// Simple XML-aware parser. CDATA is not supported.
//
// Usage: sax-xml-parser [OPTIONS] <input...>
// OPTIONS:
//    -f <file>
//        Parse content of the file
//    -s <str>
//        Parse input string
//    -e
//        Parse example string: <!DOCTYPE html><html><body><h1>My Heading</h1><p>My paragraph</p></body></html>
//    -t <file>
//        Extract text from an XML (HTML) file
//
// Written by Pavel Chumakou ([email protected])
 17
 18
 19parse(str, len, characters, start_tag, end_tag, start_document, end_document)
 20{
 21    auto text_start; text_start = -1;
 22    auto tag_start; tag_start = -1;
 23    auto tag_name_end; tag_name_end = -1;
 24    auto insideTag; insideTag = 0;
 25
 26    (&*start_document)();
 27
 28    auto i; i = 0; while(i < len)
 29    {
 30        auto current; current = char(str, i);
 31
 32        if (current == '<')
 33        {
 34
 35            if (insideTag == 1)
 36            {
 37                goto continue; // ... <xxx < ....  - skip this case
 38            }
 39            tag_start = i;
 40            tag_name_end = -1;
 41            insideTag = 1;
 42            goto continue;
 43        }
 44
 45        if (current == ' ')
 46        {
 47            if ((insideTag == 1) & (tag_name_end < 0))
 48            {
 49                tag_name_end = i;
 50            }
 51        }
 52
 53        if (current == '>')
 54        {
 55            if ((insideTag == 1) & (tag_start + 1 < i))
 56            {
 57                if (char(str, tag_start + 1) == '?' | char(str, tag_start + 1) == '!')
 58                {
 59                    // skip <?xml > ... <!DOCTYPE>
 60                    text_start = -1;
 61                    tag_start = -1;
 62                    tag_name_end = -1;
 63                    insideTag = 0;
 64                    goto continue;
 65                }
 66
 67                auto tag_name_start; tag_name_start = tag_start + 1;
 68                auto openTag; openTag = 1;
 69                if (char(str, tag_start + 1) == '/')
 70                {
 71                    openTag = 0; // </xxx ...
 72                    tag_name_start++;
 73                }
 74
 75                auto singleTag; singleTag = 0;
 76                auto prevByte; prevByte = char(str, i - 1);
 77                if (prevByte == '/')
 78                {
 79                    singleTag = 1; // <xxx />
 80                    if (tag_name_end < 0)
 81                    {
 82                        tag_name_end = i - 1;
 83                    }
 84                }
 85
 86                if (tag_name_end < 0)
 87                {
 88                    tag_name_end = i;
 89                }
 90
 91                if (text_start >= 0)
 92                {
 93                    (&*characters)(str, text_start, tag_start);
 94                    text_start = -1;
 95                }
 96
 97                if (singleTag)
 98                {
 99                    (&*start_tag)(str, tag_start, i + 1, tag_name_start, tag_name_end, 1);
100                } else {
101                    if (openTag) {
102                        (&*start_tag)(str, tag_start, i + 1, tag_name_start, tag_name_end, 0);
103                    } else {
104                        (&*end_tag)(str, tag_start, i + 1, tag_name_start, tag_name_end);
105                    }
106                }
107
108                tag_start = -1;
109                tag_name_end = -1;
110                insideTag = 0;
111            }
112            goto continue;
113        }
114
115        if (current == '\r' | current == '\n' | current == '\t')
116        {
117            goto continue;
118        }
119
120        if (insideTag == 0)
121        {
122            if (text_start < 0) {
123                text_start = i;
124            }
125        }
126
127continue:
128        i++;
129    } // while
130
131    if (text_start >= 0) {
132        (&*characters)(str, text_start, len);
133    }
134
135    (&*end_document)();
136}
137
138//// Parse event handlers /////////////
139
// characters: text-run handler for the event dump.
// Prints "characters: " followed by bytes [start, end) of `str` and a newline.
characters(str, start, end)
{
    printf("characters: ");
    print_substring(str, start, end);
    putchar('\n');
}
145
// start_tag: opening-tag handler for the event dump.
// Prints "start_tag: " followed by the tag name, bytes [name_start, name_end)
// of `str`, and a newline. tag_start, tag_end and is_self_closing are unused.
start_tag(str, tag_start, tag_end, name_start, name_end, is_self_closing)
{
    printf("start_tag: ");
    print_substring(str, name_start, name_end);
    putchar('\n');
}
151
// end_tag: closing-tag handler for the event dump.
// Prints "end_tag: " followed by the tag name, bytes [name_start, name_end)
// of `str`, and a newline. tag_start and tag_end are unused.
end_tag(str, tag_start, tag_end, name_start, name_end)
{
    printf("end_tag: ");
    print_substring(str, name_start, name_end);
    putchar('\n');
}
157
// start_document: event-dump handler invoked by parse() before scanning;
// prints a single marker line.
start_document()
{
    printf("start_document\n");
}
162
// end_document: event-dump handler invoked by parse() after scanning;
// prints a single marker line.
end_document()
{
    printf("end_document\n");
}
167
168//// Extract text event handlers /////
169
// ext_characters: text-run handler for text extraction.
// Prints bytes [start, end) of `str` followed by a newline, with no label.
ext_characters(str, start, end)
{
    print_substring(str, start, end);
    putchar('\n');
}
174
// ext_start_tag: text extraction prints nothing for opening tags.
ext_start_tag(str, tag_start, tag_end, name_start, name_end, is_self_closing)
{
}
176
// ext_end_tag: text extraction prints nothing for closing tags.
ext_end_tag(str, tag_start, tag_end, name_start, name_end)
{
}
178
// ext_start_document: text extraction prints nothing at document start.
ext_start_document()
{
}
180
// ext_end_document: text extraction prints nothing at document end.
ext_end_document()
{
}
182
183/////////////////////////////////
184
// print_substring: write bytes [start, end) of `str` with putchar.
// Writes nothing when start >= end.
print_substring(str, start, end)
{
    auto pos;

    pos = start;
    while (pos < end)
    {
        putchar(char(str, pos));
        pos = pos + 1;
    }
}
193
// print_substring_n: write bytes [start, end) of `str`, then a newline.
// The newline is written even when the range is empty.
print_substring_n(str, start, end)
{
    auto pos;

    pos = start;
    while (pos < end)
    {
        putchar(char(str, pos));
        pos = pos + 1;
    }
    putchar('\n');
}
199
// print_usage: print the program name, the invocation synopsis and one
// two-line entry per command-line option.
print_usage()
{
    // banner and synopsis
    printf("SAX XML Parser\n");
    printf("Usage: sax-xml-parser [OPTIONS] <input...>\n");
    printf("OPTIONS:\n");

    // -f: dump parse events for a file
    printf("    -f <file>\n");
    printf("        Parse content of a file\n");

    // -s: dump parse events for a string given on the command line
    printf("    -s \"<str>\"\n");
    printf("        Parse input string\n");

    // -e: dump parse events for the built-in example
    printf("    -e  \n");
    printf("        Parse example string: <!DOCTYPE html><html><body><h1>My Heading</h1><p>My paragraph</p></body></html>\n");

    // -t: print only the text of a file
    printf("    -t <file>\n");
    printf("        Extract text from a XML(HTML) file\n");
}
214
// Contents of the file most recently loaded by read_file().
buffer;
// Number of bytes of `buffer` that hold file data.
file_size;

// read_file: load the whole of file `fname` into a newly allocated block.
//
// On return `buffer` points at the data and `file_size` is the number of
// bytes actually read. The block is never freed by this function.
// Prints a message and exits with status -1 when the file cannot be opened,
// its size cannot be determined, or the allocation fails.
read_file(fname)
{
    extrn fopen, fclose, ftell, fseek, fread, malloc;
    auto fp; fp = fopen(fname, "rb");
    if (fp == 0)
    {
        printf("File %s not found\n", fname);
        exit(-1);
    }

    fseek(fp, 0, 2); // fseek(fp, 0, SEEK_END)
    auto size; size = ftell(fp);
    fseek(fp, 0, 0); // fseek(fp, 0, SEEK_SET)
    if (size < 0)
    {
        printf("Cannot determine size of %s\n", fname);
        fclose(fp);
        exit(-1);
    }

    // One spare byte keeps the request non-zero for an empty file, so a null
    // result always means the allocation really failed.
    buffer = malloc(size + 1);
    if (buffer == 0)
    {
        printf("Not enough memory to read %s\n", fname);
        fclose(fp);
        exit(-1);
    }

    // Record what fread delivered, so a short read never exposes
    // uninitialised bytes to the parser.
    file_size = fread(buffer, 1, size, fp);
    fclose(fp);
}
233
// main: command-line entry point.
//
// argv[1] selects the mode by its first two bytes:
//   -f <file>  dump parse events for the file's content
//   -s <str>   dump parse events for the given string
//   -e         dump parse events for a built-in example string
//   -t <file>  print only the text found in the file
// Anything else, or a missing argument for -f / -s / -t, prints the usage
// text. Always returns 0.
main(argc, argv)
{
    extrn malloc, strlen;

    if (argc < 2)
    {
        print_usage();
        return(0);
    }

    auto opt; opt = argv[1];
    auto dash; dash = (char(opt, 0) == '-');
    auto flag; flag = char(opt, 1);

    // -e is the only option that takes no argument
    if (dash & (flag == 'e'))
    {
        auto sample; sample = "<!DOCTYPE html><html><body><h1>My Heading</h1><p>My paragraph</p></body></html>";
        printf("Parsing example string: %s\n", sample);
        parse(sample, strlen(sample), &characters, &start_tag, &end_tag, &start_document, &end_document);
        return(0);
    }

    if (dash & ((flag == 'f') | (flag == 's') | (flag == 't')))
    {
        if (argc < 3)
        {
            print_usage();
            return(0);
        }

        auto arg; arg = argv[2];

        if (flag == 's')
        {
            parse(arg, strlen(arg), &characters, &start_tag, &end_tag, &start_document, &end_document);
            return(0);
        }

        // -f and -t both work on the content of the named file
        read_file(arg);
        if (flag == 'f')
        {
            parse(buffer, file_size, &characters, &start_tag, &end_tag, &start_document, &end_document);
        } else {
            parse(buffer, file_size, &ext_characters, &ext_start_tag, &ext_end_tag, &ext_start_document, &ext_end_document);
        }
        return(0);
    }

    print_usage();
    return(0);
}