xplshn
·
2025-08-13
scrapper.go
Go
1package main
2
3import (
4 "encoding/xml"
5 "fmt"
6 "io"
7 "net/http"
8 "os"
9 "path/filepath"
10 "sort"
11 "strconv"
12 "strings"
13
14 "github.com/k3a/html2text"
15 "github.com/fxamacker/cbor/v2"
16 "github.com/goccy/go-json"
17 "github.com/shamaton/msgpack/v2"
18 minify "github.com/tdewolff/minify/v2"
19 mjson "github.com/tdewolff/minify/v2/json"
20)
21
// Tag is a generic XML element captured together with its name, its raw
// inner XML and its lang attribute.
type Tag struct {
	XMLName xml.Name
	// Content is the element's raw inner XML; nested markup and entities are
	// kept verbatim rather than decoded.
	Content string `xml:",innerxml"`
	// Lang is the value of the element's lang attribute, empty when absent.
	Lang string `xml:"lang,attr"`
}
27
// Components is the root <components> element of the catalog and holds every
// <component> child found directly beneath it.
type Components struct {
	XMLName    xml.Name    `xml:"components"`
	Components []Component `xml:"component"`
}
32
// ScreenshotImage is a single <image> entry of a screenshot.
//
// Width and Height are kept as the raw attribute strings; callers that need
// numbers parse them themselves.
type ScreenshotImage struct {
	Type   string `xml:"type,attr"`
	Width  string `xml:"width,attr"`
	Height string `xml:"height,attr"`
	// Url is the element text. It is decoded as character data rather than
	// raw inner XML so that entities (for example &amp; in a query string)
	// and CDATA sections yield the actual URL instead of its escaped form.
	Url string `xml:",chardata"`
}
39
// Screenshot is one <screenshot> entry with its caption and image variants.
type Screenshot struct {
	// Type is the value of the screenshot's type attribute.
	Type string `xml:"type,attr"`
	// Caption is the text of the <caption> child element.
	Caption string `xml:"caption"`
	// Images lists every <image> child of the screenshot.
	Images []ScreenshotImage `xml:"image"`
}
45
// Release is one <release> element; both values are read from attributes and
// kept as raw strings.
type Release struct {
	Version string `xml:"version,attr"`
	Date    string `xml:"date,attr"`
}
50
// Releases is the <releases> container; Release keeps the <release> children
// in document order.
type Releases struct {
	Release []Release `xml:"release"`
}
54
55type Component struct {
56 Names []struct {
57 Lang string `xml:"lang,attr"`
58 Content string `xml:",chardata"`
59 } `xml:"name"`
60 Summaries []struct {
61 Lang string `xml:"lang,attr"`
62 Content string `xml:",chardata"`
63 } `xml:"summary"`
64 Descriptions []struct {
65 Lang string `xml:"lang,attr"`
66 Content string `xml:",innerxml"`
67 } `xml:"description"`
68 Categories []Tag `xml:"categories>category"`
69 Keywords []Tag `xml:"keywords>keyword"`
70 Icons []struct {
71 Type string `xml:"type,attr"`
72 Width string `xml:"width,attr"`
73 Height string `xml:"height,attr"`
74 Url string `xml:",innerxml"`
75 } `xml:"icon"`
76 Url []struct {
77 Type string `xml:"type,attr"`
78 Url string `xml:",chardata"`
79 } `xml:"url"`
80 Type string `xml:"type,attr"`
81 Id string `xml:"id"`
82 ProjectLicense string `xml:"project_license"`
83 Launchable struct {
84 DesktopId string `xml:"desktop-id"`
85 } `xml:"launchable"`
86 ContentRating []Tag `xml:"content_rating"`
87 Releases Releases `xml:"releases"`
88 Screenshots []Screenshot `xml:"screenshots>screenshot"`
89}
90
// AppStreamData is the flattened, per-application record that is serialized
// to the output files. Every field carries omitempty, so empty values are
// left out of the JSON output.
//
// ContentRating and Categories are single strings holding comma-separated
// lists; Icons and Screenshots hold URLs.
type AppStreamData struct {
	AppId           string   `json:"app_id,omitempty"`
	Name            string   `json:"name,omitempty"`
	Summary         string   `json:"summary,omitempty"`
	ContentRating   string   `json:"content_rating,omitempty"`
	Icons           []string `json:"icons,omitempty"`
	Screenshots     []string `json:"screenshots,omitempty"`
	Categories      string   `json:"categories,omitempty"`
	RichDescription string   `json:"rich_description,omitempty"`
	Version         string   `json:"version,omitempty"`
}
102
// downloadFile fetches url with an HTTP GET and writes the response body to
// dest, creating or truncating the file.
//
// A response outside the 2xx range is returned as an error and nothing is
// written, so an HTTP error page is never stored in place of the payload.
// If copying the body or closing the file fails, the partially written dest
// is removed so that a later existence check does not pick up a truncated
// file.
func downloadFile(url string, dest string) (err error) {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return fmt.Errorf("downloading %s: unexpected status %s", url, resp.Status)
	}

	out, err := os.Create(dest)
	if err != nil {
		return err
	}
	defer func() {
		// Close can report a delayed write failure; surface it unless an
		// earlier error is already being returned.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			// Best-effort cleanup; the original error is the one worth reporting.
			_ = os.Remove(dest)
		}
	}()

	_, err = io.Copy(out, resp.Body)
	return err
}
119
120func saveAll(filename string, metadata []AppStreamData) error {
121 if err := saveJSON(filename, metadata); err != nil {
122 return err
123 }
124 if err := saveCBOR(filename, metadata); err != nil {
125 return err
126 }
127 return saveMsgp(filename, metadata)
128}
129
130func saveCBOR(filename string, metadata []AppStreamData) error {
131 cborData, err := cbor.Marshal(metadata)
132 if err != nil {
133 return err
134 }
135 return os.WriteFile(filename+".cbor", cborData, 0644)
136}
137
138func saveJSON(filename string, metadata []AppStreamData) error {
139 var buffer strings.Builder
140 encoder := json.NewEncoder(&buffer)
141 encoder.SetEscapeHTML(false)
142 encoder.SetIndent("", " ")
143
144 if err := encoder.Encode(metadata); err != nil {
145 return err
146 }
147
148 jsonData := []byte(buffer.String())
149 if err := os.WriteFile(filename+".json", jsonData, 0644); err != nil {
150 return err
151 }
152
153 m := minify.New()
154 m.AddFunc("application/json", mjson.Minify)
155 if minifiedData, err := m.Bytes("application/json", jsonData); err != nil {
156 return err
157 } else if err := os.WriteFile(filename+".min.json", minifiedData, 0644); err != nil {
158 return err
159 }
160 return nil
161}
162
163func saveMsgp(filename string, metadata []AppStreamData) error {
164 msgpData, err := msgpack.Marshal(metadata)
165 if err != nil {
166 return err
167 }
168 return os.WriteFile(filename+".msgp", msgpData, 0644)
169}
170
171func getCategoriesString(categories []Tag) string {
172 var categoryStrings []string
173 for _, cat := range categories {
174 if cat.Content != "" {
175 categoryStrings = append(categoryStrings, cat.Content)
176 }
177 }
178 return strings.Join(categoryStrings, ",")
179}
180
181func getRichDescription(descriptions []struct {
182 Lang string `xml:"lang,attr"`
183 Content string `xml:",innerxml"`
184}) string {
185 return getContentByLang(descriptions)
186}
187
188func getName(names []struct {
189 Lang string `xml:"lang,attr"`
190 Content string `xml:",chardata"`
191}) string {
192 return getContentByLang(names)
193}
194
195func getSummary(summaries []struct {
196 Lang string `xml:"lang,attr"`
197 Content string `xml:",chardata"`
198}) string {
199 return getContentByLang(summaries)
200}
201
202func getContentRating(ratings []Tag) string {
203 var ratingStrings []string
204 for _, rating := range ratings {
205 decoder := xml.NewDecoder(strings.NewReader(rating.Content))
206 for {
207 tok, err := decoder.Token()
208 if err == io.EOF {
209 break
210 }
211 if err != nil {
212 continue
213 }
214 switch se := tok.(type) {
215 case xml.StartElement:
216 if se.Name.Local == "content_attribute" {
217 var id string
218 for _, attr := range se.Attr {
219 if attr.Name.Local == "id" {
220 id = attr.Value
221 break
222 }
223 }
224 var value string
225 decoder.DecodeElement(&value, &se)
226 if id != "" && value != "" {
227 ratingStrings = append(ratingStrings, fmt.Sprintf("%s:%s", id, strings.TrimSpace(value)))
228 }
229 }
230 }
231 }
232 }
233 return strings.Join(ratingStrings, ",")
234}
235
// langText is the tag-free shape shared by the localized element structs;
// struct tags are ignored by conversions, so both variants convert to it.
type langText struct {
	Lang    string
	Content string
}

// asLangText extracts the language and content from one of the two localized
// element struct types (character-data or inner-XML content). It reports
// false for any other dynamic type.
func asLangText(elem any) (langText, bool) {
	switch v := elem.(type) {
	case struct {
		Lang    string `xml:"lang,attr"`
		Content string `xml:",chardata"`
	}:
		return langText(v), true
	case struct {
		Lang    string `xml:"lang,attr"`
		Content string `xml:",innerxml"`
	}:
		return langText(v), true
	}
	return langText{}, false
}

// getContentByLang picks the content to display from a list of localized
// elements and returns it with surrounding whitespace trimmed.
//
// The choice is made in this order of preference:
//  1. the first element whose Lang is "en", "en_US" or "en_GB";
//  2. the first element with an empty Lang;
//  3. the first element of the slice.
//
// Elements of an unsupported type are ignored; the result is "" when the
// slice is empty or no rule yields a supported element.
func getContentByLang[T any](elements []T) string {
	var (
		untagged     string // content of the first element without a language
		haveUntagged bool
		fallback     string // content of elements[0], when it is a supported type
	)
	for i, elem := range elements {
		entry, ok := asLangText(elem)
		if !ok {
			continue
		}
		switch entry.Lang {
		case "en", "en_US", "en_GB":
			// English wins outright, wherever it appears in the list.
			return strings.TrimSpace(entry.Content)
		case "":
			if !haveUntagged {
				untagged, haveUntagged = entry.Content, true
			}
		}
		if i == 0 {
			fallback = entry.Content
		}
	}
	if haveUntagged {
		return strings.TrimSpace(untagged)
	}
	return strings.TrimSpace(fallback)
}
292
293func main() {
294 tmpDir := os.TempDir()
295 xmlFilePath := filepath.Join(tmpDir, "FLATPAK_APPSTREAM.xml")
296
297 if _, err := os.Stat(xmlFilePath); os.IsNotExist(err) {
298 url := "https://github.com/Azathothas/pkgcache/raw/refs/heads/main/FLATPAK_APPSTREAM.xml"
299 if err := downloadFile(url, xmlFilePath); err != nil {
300 fmt.Printf("Error downloading file: %v\n", err)
301 return
302 }
303 }
304
305 xmlData, err := os.ReadFile(xmlFilePath)
306 if err != nil {
307 fmt.Printf("Error reading XML file: %v\n", err)
308 return
309 }
310
311 var components Components
312 if err := xml.Unmarshal(xmlData, &components); err != nil {
313 fmt.Printf("Error parsing XML file: %v\n", err)
314 return
315 }
316
317 var metadata []AppStreamData
318 for _, component := range components.Components {
319 var icons []string
320 var screenshots []string
321
322 for _, icon := range component.Icons {
323 if icon.Type == "remote" {
324 width, err1 := strconv.Atoi(icon.Width)
325 height, err2 := strconv.Atoi(icon.Height)
326 if err1 == nil && err2 == nil && width >= 128 && height >= 128 {
327 icons = append(icons, icon.Url)
328 }
329 }
330 }
331
332 for _, screenshot := range component.Screenshots {
333 sort.Slice(screenshot.Images, func(i, j int) bool {
334 widthI, _ := strconv.Atoi(screenshot.Images[i].Width)
335 heightI, _ := strconv.Atoi(screenshot.Images[i].Height)
336 widthJ, _ := strconv.Atoi(screenshot.Images[j].Width)
337 heightJ, _ := strconv.Atoi(screenshot.Images[j].Height)
338 areaI := widthI * heightI
339 areaJ := widthJ * heightJ
340 return areaI > areaJ
341 })
342
343 for _, image := range screenshot.Images {
344 if image.Type == "source" || image.Type == "default" {
345 screenshots = append(screenshots, image.Url)
346 }
347 }
348 }
349
350 categories := getCategoriesString(component.Categories)
351 richDescription := getRichDescription(component.Descriptions)
352 name := getName(component.Names)
353 summary := getSummary(component.Summaries)
354 contentRating := getContentRating(component.ContentRating)
355 version := ""
356 if len(component.Releases.Release) > 0 {
357 version = component.Releases.Release[0].Version
358 }
359
360 metadata = append(metadata, AppStreamData{
361 AppId: component.Id,
362 Name: name,
363 Summary: html2text.HTML2Text(summary),
364 ContentRating: contentRating,
365 Icons: icons,
366 Screenshots: screenshots,
367 Categories: categories,
368 RichDescription: html2text.HTML2Text(richDescription),
369 Version: version,
370 })
371 }
372
373 if err := saveAll("appstream_metadata", metadata); err != nil {
374 fmt.Printf("Error saving metadata: %v\n", err)
375 } else {
376 fmt.Println("Metadata saved successfully.")
377 }
378}
379
380
381