xplshn
·
2025-08-13
sax-xml-parser.b
B
// SAX XML Parser
//
// Simple XML-aware parser. CDATA is not supported.
//
// Usage: sax-xml-parser [OPTIONS] <input...>
// OPTIONS:
//   -f <file>
//       Parse content of the file
//   -s <str>
//       Parse input string
//   -e
//       Parse example string: <!DOCTYPE html><html><body><h1>My Heading</h1><p>My paragraph</p></body></html>
//   -t <file>
//       Extract text from an XML (HTML) file
//
// Written by Pavel Chumakou ([email protected])
17
18
// parse: single-pass, SAX-style scan of the bytes str[0 .. len).
//
// Events are reported through the handlers passed as arguments:
//   start_document()                      once, before scanning
//   characters(str, start, end)           text run str[start .. end)
//   start_tag(str, tag_start, tag_end,
//             name_start, name_end,
//             is_self_closing)            <name ...> (0) or <name ... /> (1)
//   end_tag(str, tag_start, tag_end,
//           name_start, name_end)         </name>
//   end_document()                        once, after scanning
//
// tag_start is the index of '<', tag_end is one past '>', and
// [name_start, name_end) delimits the tag name.
//
// Constructs starting with "<?" or "<!" (XML declaration, DOCTYPE, ...)
// produce no tag event; they end at the first '>'. Text that precedes such
// a construct is reported through characters() before it is skipped.
// A text run never starts on '\r', '\n' or '\t', but may contain them.
parse(str, len, characters, start_tag, end_tag, start_document, end_document)
{
    auto text_start;   text_start   = -1;  // start of the pending text run, -1 if none
    auto tag_start;    tag_start    = -1;  // index of the '<' of the tag being scanned
    auto tag_name_end; tag_name_end = -1;  // one past the tag name, -1 until known
    auto insideTag;    insideTag    = 0;   // 1 between '<' and the matching '>'

    (&*start_document)();

    auto i; i = 0; while (i < len)
    {
        auto current; current = char(str, i);

        if (current == '<')
        {
            // A '<' seen while already inside a tag ("<xxx < ...") is ignored.
            if (insideTag == 0)
            {
                tag_start = i;
                tag_name_end = -1;
                insideTag = 1;
            }
            goto next_char;
        }

        // The first whitespace byte inside a tag terminates the tag name.
        // Tabs and line breaks count too, so "<a\nhref=...>" is named "a".
        auto is_space;
        is_space = (current == ' ') | (current == '\t') | (current == '\r') | (current == '\n');
        if (is_space & (insideTag == 1) & (tag_name_end < 0))
        {
            tag_name_end = i;
        }

        if (current == '>')
        {
            // Only a '>' that closes a non-empty tag is acted upon.
            if ((insideTag == 1) & (tag_start + 1 < i))
            {
                auto first; first = char(str, tag_start + 1);

                // Report the text collected before this tag. This is done
                // before the "<?" / "<!" check so that text in front of a
                // skipped construct is not lost.
                if (text_start >= 0)
                {
                    (&*characters)(str, text_start, tag_start);
                    text_start = -1;
                }

                if ((first == '?') | (first == '!'))
                {
                    // skip <?xml ... > and <!DOCTYPE ... >
                    tag_start = -1;
                    tag_name_end = -1;
                    insideTag = 0;
                    goto next_char;
                }

                auto tag_name_start; tag_name_start = tag_start + 1;
                auto openTag; openTag = 1;
                if (first == '/')
                {
                    openTag = 0;            // </xxx ...
                    tag_name_start++;
                }

                auto singleTag; singleTag = 0;
                if (char(str, i - 1) == '/')
                {
                    singleTag = 1;          // <xxx />
                    if (tag_name_end < 0)
                    {
                        tag_name_end = i - 1;   // <xxx/> : name stops at the '/'
                    }
                }

                if (tag_name_end < 0)
                {
                    tag_name_end = i;       // <xxx> : name runs up to the '>'
                }

                // A self-closing tag is always reported as a start tag.
                if (singleTag | openTag)
                {
                    (&*start_tag)(str, tag_start, i + 1, tag_name_start, tag_name_end, singleTag);
                } else {
                    (&*end_tag)(str, tag_start, i + 1, tag_name_start, tag_name_end);
                }

                tag_start = -1;
                tag_name_end = -1;
                insideTag = 0;
            }
            goto next_char;
        }

        // Line breaks and tabs never start a text run.
        if ((current == '\r') | (current == '\n') | (current == '\t'))
        {
            goto next_char;
        }

        if ((insideTag == 0) & (text_start < 0))
        {
            text_start = i;
        }

next_char:
        i++;
    } // while

    // Text left over after the last tag.
    if (text_start >= 0)
    {
        (&*characters)(str, text_start, len);
    }

    (&*end_document)();
}
137
//// Parse event handlers /////////////
139
// Parse-mode text handler: prints the label "characters: " followed by the
// bytes str[start .. end) and a newline.
characters(str, start, end)
{
    printf("characters: ");
    print_substring(str, start, end);
    putchar('\n');
}
145
// Parse-mode start-tag handler: prints the label "start_tag: " followed by
// the tag name str[name_start .. name_end) and a newline. The remaining
// arguments are accepted but not used.
start_tag(str, tag_start, tag_end, name_start, name_end, is_self_closing)
{
    printf("start_tag: ");
    print_substring(str, name_start, name_end);
    putchar('\n');
}
151
// Parse-mode end-tag handler: prints the label "end_tag: " followed by the
// tag name str[name_start .. name_end) and a newline. tag_start and tag_end
// are accepted but not used.
end_tag(str, tag_start, tag_end, name_start, name_end)
{
    printf("end_tag: ");
    print_substring(str, name_start, name_end);
    putchar('\n');
}
157
// Parse-mode handler invoked before scanning starts: prints a marker line.
start_document()
{
    printf("start_document\n");
}
162
// Parse-mode handler invoked after scanning ends: prints a marker line.
end_document()
{
    printf("end_document\n");
}
167
//// Extract text event handlers /////
169
// Extract-mode text handler: prints the bytes str[start .. end) followed by
// a newline, with no label.
ext_characters(str, start, end)
{
    print_substring(str, start, end);
    putchar('\n');
}
174
// Extract-mode start-tag handler: intentionally does nothing.
ext_start_tag(str, tag_start, tag_end, name_start, name_end, is_self_closing)
{
}
176
// Extract-mode end-tag handler: intentionally does nothing.
ext_end_tag(str, tag_start, tag_end, name_start, name_end)
{
}
178
// Extract-mode start-of-document handler: intentionally does nothing.
ext_start_document()
{
}
180
// Extract-mode end-of-document handler: intentionally does nothing.
ext_end_document()
{
}
182
/////////////////////////////////
184
// Writes the bytes str[start .. end) to standard output, one putchar call
// per byte. Nothing is written when start >= end.
print_substring(str, start, end)
{
    // start is a local copy of the argument, so it doubles as the cursor.
    while (start < end)
    {
        putchar(char(str, start));
        start++;
    }
}
193
// Writes the bytes str[start .. end) to standard output, followed by a
// newline.
print_substring_n(str, start, end)
{
    auto pos; pos = start;
    while (pos < end)
    {
        putchar(char(str, pos));
        pos++;
    }
    putchar('\n');
}
199
// Prints the command-line help text to standard output.
print_usage()
{
    // banner and synopsis
    printf("SAX XML Parser\n");
    printf("Usage: sax-xml-parser [OPTIONS] <input...>\n");
    printf("OPTIONS:\n");

    // -f: parse a file
    printf(" -f <file>\n");
    printf(" Parse content of a file\n");

    // -s: parse a string given on the command line
    printf(" -s \"<str>\"\n");
    printf(" Parse input string\n");

    // -e: parse the built-in example
    printf(" -e \n");
    printf(" Parse example string: <!DOCTYPE html><html><body><h1>My Heading</h1><p>My paragraph</p></body></html>\n");

    // -t: print only the text content of a file
    printf(" -t <file>\n");
    printf(" Extract text from a XML(HTML) file\n");
}
214
// Set by read_file(): buffer points to the bytes read from the file and
// file_size is the number of bytes actually read.
buffer;
file_size;

// Reads the whole file named fname into a freshly allocated block and
// stores the result in the globals buffer and file_size.
//
// On any failure (file cannot be opened, size cannot be determined,
// allocation fails) a message is printed and the program exits with -1.
// The file handle is always closed before returning or exiting.
read_file(fname)
{
    extrn fopen, fclose, ftell, fseek, fread, malloc;

    auto fp; fp = fopen(fname, "rb");
    if (fp == 0)
    {
        printf("File %s not found\n", fname);
        exit(-1);
    }

    fseek(fp, 0, 2);    // fseek(fp, 0, SEEK_END)
    file_size = ftell(fp);
    fseek(fp, 0, 0);    // fseek(fp, 0, SEEK_SET)
    if (file_size < 0)
    {
        // ftell reports failure as -1; never pass that on to malloc/fread.
        fclose(fp);
        printf("Cannot determine size of %s\n", fname);
        exit(-1);
    }

    // One spare byte keeps the request non-zero for an empty file, so a
    // null result always means a real allocation failure.
    buffer = malloc(file_size + 1);
    if (buffer == 0)
    {
        fclose(fp);
        printf("Out of memory reading %s\n", fname);
        exit(-1);
    }

    // fread returns the number of bytes actually read; keep that count so a
    // short read never exposes uninitialised bytes to the parser.
    file_size = fread(buffer, 1, file_size, fp);
    fclose(fp);
}
233
// Entry point. Selects the mode from the first argument:
//   -f <file>  parse the file and print every event
//   -s <str>   parse the string and print every event
//   -e         parse the built-in example string and print every event
//   -t <file>  parse the file and print only its text content
// Anything else, or a missing <file>/<str>, prints the usage text.
// Always returns 0.
main(argc, argv)
{
    extrn malloc, strlen;

    if (argc <= 1)
    {
        print_usage();
        return(0);
    }

    auto opt; opt = argv[1];

    // mode is the option letter, or 0 when opt does not start with '-'.
    auto mode; mode = 0;
    if (char(opt, 0) == '-')
    {
        mode = char(opt, 1);
    }

    // -e is the only option that needs no further argument.
    if (mode == 'e')
    {
        auto str; str = "<!DOCTYPE html><html><body><h1>My Heading</h1><p>My paragraph</p></body></html>";
        printf("Parsing example string: %s\n", str);
        parse(str, strlen(str), &characters, &start_tag, &end_tag, &start_document, &end_document);
        return(0);
    }

    // Every remaining option takes exactly one argument.
    auto known; known = (mode == 'f') | (mode == 's') | (mode == 't');
    if ((known == 0) | (argc <= 2))
    {
        print_usage();
        return(0);
    }

    auto input; input = argv[2];

    if (mode == 's')
    {
        parse(input, strlen(input), &characters, &start_tag, &end_tag, &start_document, &end_document);
        return(0);
    }

    // Both -f and -t work on the contents of a file.
    read_file(input);
    if (mode == 'f')
    {
        parse(buffer, file_size, &characters, &start_tag, &end_tag, &start_document, &end_document);
        return(0);
    }

    parse(buffer, file_size, &ext_characters, &ext_start_tag, &ext_end_tag, &ext_start_document, &ext_end_document);
    return(0);
}