repos / dbin

📦 Poor man's package manager.
git clone https://github.com/xplshn/dbin.git

dbin / misc / cmd / flatpakAppStreamScrapper
xplshn  ·  2025-08-13

scrapper.go

Go
  1package main
  2
  3import (
  4	"encoding/xml"
  5	"fmt"
  6	"io"
  7	"net/http"
  8	"os"
  9	"path/filepath"
 10	"sort"
 11	"strconv"
 12	"strings"
 13
 14	"github.com/k3a/html2text"
 15	"github.com/fxamacker/cbor/v2"
 16	"github.com/goccy/go-json"
 17	"github.com/shamaton/msgpack/v2"
 18	minify "github.com/tdewolff/minify/v2"
 19	mjson "github.com/tdewolff/minify/v2/json"
 20)
 21
// Tag is a catch-all holder for a decoded XML element: it keeps the element's
// name, its raw inner XML, and its "lang" attribute. It is used for
// categories, keywords and content_rating entries of a Component.
type Tag struct {
	// XMLName receives the name of the element that was decoded.
	XMLName xml.Name
	// Content is the element's inner XML, unparsed (child markup included).
	Content string `xml:",innerxml"`
	// Lang is the value of the element's "lang" attribute, empty when absent.
	Lang    string `xml:"lang,attr"`
}
 27
// Components maps the <components> document root and collects every
// <component> child found directly under it.
type Components struct {
	XMLName    xml.Name    `xml:"components"`
	Components []Component `xml:"component"`
}
 32
// ScreenshotImage is one <image> entry of a screenshot. Width and Height are
// kept as the raw attribute strings; callers convert them with strconv when
// they need numbers.
type ScreenshotImage struct {
	Type   string `xml:"type,attr"`
	Width  string `xml:"width,attr"`
	Height string `xml:"height,attr"`
	// Url is the element's inner XML, taken verbatim (not trimmed).
	Url    string `xml:",innerxml"`
}
 39
// Screenshot is one <screenshot> entry: its "type" attribute, the text of its
// <caption> child, and all of its <image> children.
type Screenshot struct {
	Type    string            `xml:"type,attr"`
	Caption string            `xml:"caption"`
	Images  []ScreenshotImage `xml:"image"`
}
 45
// Release carries the "version" and "date" attributes of a <release> element,
// both kept as raw strings.
type Release struct {
	Version string `xml:"version,attr"`
	Date    string `xml:"date,attr"`
}
 50
// Releases wraps the <release> children of a <releases> element, in document
// order. main takes the first entry as the component's version.
type Releases struct {
	Release []Release `xml:"release"`
}
 54
// Component is the decoded form of one <component> element.
//
// The anonymous struct types used for Names, Summaries and Descriptions are
// repeated verbatim in the signatures of getName, getSummary and
// getRichDescription and in the type switches of getContentByLang. Go treats
// struct tags as part of type identity, so any change to these field
// declarations must be mirrored there.
type Component struct {
	// Names holds every <name> child with its "lang" attribute and text.
	Names       []struct {
		Lang    string `xml:"lang,attr"`
		Content string `xml:",chardata"`
	} `xml:"name"`
	// Summaries holds every <summary> child with its "lang" attribute and text.
	Summaries   []struct {
		Lang    string `xml:"lang,attr"`
		Content string `xml:",chardata"`
	} `xml:"summary"`
	// Descriptions holds every <description> child; Content is the raw inner
	// XML, so any markup inside the description is preserved.
	Descriptions []struct {
		Lang    string `xml:"lang,attr"`
		Content string `xml:",innerxml"`
	} `xml:"description"`
	// Categories and Keywords are read from the nested
	// <categories><category> and <keywords><keyword> elements.
	Categories []Tag `xml:"categories>category"`
	Keywords   []Tag `xml:"keywords>keyword"`
	// Icons holds every <icon> child; main keeps only "remote" icons whose
	// width and height both parse to at least 128.
	Icons      []struct {
		Type   string `xml:"type,attr"`
		Width  string `xml:"width,attr"`
		Height string `xml:"height,attr"`
		Url    string `xml:",innerxml"`
	} `xml:"icon"`
	// Url holds every <url> child with its "type" attribute and text.
	Url []struct {
		Type string `xml:"type,attr"`
		Url  string `xml:",chardata"`
	} `xml:"url"`
	// Type is the "type" attribute of the <component> element itself.
	Type           string `xml:"type,attr"`
	Id             string `xml:"id"`
	ProjectLicense string `xml:"project_license"`
	// NOTE(review): this decodes a <desktop-id> child element of <launchable>.
	// Nothing in the visible code reads it; confirm the tag matches the feed
	// before relying on it.
	Launchable     struct {
		DesktopId string `xml:"desktop-id"`
	} `xml:"launchable"`
	// ContentRating keeps each <content_rating> element's inner XML so that
	// getContentRating can parse the nested content_attribute entries.
	ContentRating []Tag    `xml:"content_rating"`
	Releases      Releases `xml:"releases"`
	Screenshots   []Screenshot `xml:"screenshots>screenshot"`
}
 90
// AppStreamData is the flattened per-application record that main builds from
// a Component and that saveAll serializes. Every field is omitted from the
// JSON output when empty.
type AppStreamData struct {
	// AppId is the component's <id>.
	AppId           string   `json:"app_id,omitempty"`
	// Name and Summary are the language-selected values (see getContentByLang);
	// Summary is additionally passed through html2text.
	Name            string   `json:"name,omitempty"`
	Summary         string   `json:"summary,omitempty"`
	// ContentRating is a comma-separated list of "id:value" pairs.
	ContentRating   string   `json:"content_rating,omitempty"`
	// Icons lists the URLs of remote icons that are at least 128x128.
	Icons           []string `json:"icons,omitempty"`
	// Screenshots lists image URLs whose type is "source" or "default",
	// largest area first within each screenshot.
	Screenshots     []string `json:"screenshots,omitempty"`
	// Categories is a comma-separated list of category names.
	Categories      string   `json:"categories,omitempty"`
	// RichDescription is the language-selected description after html2text.
	RichDescription string   `json:"rich_description,omitempty"`
	// Version is the version attribute of the first <release> entry, if any.
	Version         string   `json:"version,omitempty"`
}
102
// downloadFile fetches url with an HTTP GET and writes the response body to
// dest.
//
// Only a 200 OK response is written to disk: http.Get reports 4xx/5xx
// responses as successful requests, so without the status check an error page
// would be stored as if it were the requested file. If writing fails partway,
// the incomplete file is removed so that a later run does not mistake it for
// a finished download.
//
// It returns a non-nil error when the request fails, the status is not 200,
// or dest cannot be created, written, or closed.
func downloadFile(url string, dest string) (err error) {
	resp, err := http.Get(url)
	if err != nil {
		return fmt.Errorf("fetching %q: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("fetching %q: unexpected status %s", url, resp.Status)
	}

	out, err := os.Create(dest)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close can mean buffered data never reached the disk, so it
		// must surface as an error unless an earlier one is already reported.
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
		if err != nil {
			// Best-effort cleanup; the original error is the one worth returning.
			_ = os.Remove(dest)
		}
	}()

	_, err = io.Copy(out, resp.Body)
	return err
}
119
120func saveAll(filename string, metadata []AppStreamData) error {
121	if err := saveJSON(filename, metadata); err != nil {
122		return err
123	}
124	if err := saveCBOR(filename, metadata); err != nil {
125		return err
126	}
127	return saveMsgp(filename, metadata)
128}
129
130func saveCBOR(filename string, metadata []AppStreamData) error {
131	cborData, err := cbor.Marshal(metadata)
132	if err != nil {
133		return err
134	}
135	return os.WriteFile(filename+".cbor", cborData, 0644)
136}
137
138func saveJSON(filename string, metadata []AppStreamData) error {
139	var buffer strings.Builder
140	encoder := json.NewEncoder(&buffer)
141	encoder.SetEscapeHTML(false)
142	encoder.SetIndent("", " ")
143
144	if err := encoder.Encode(metadata); err != nil {
145		return err
146	}
147
148	jsonData := []byte(buffer.String())
149	if err := os.WriteFile(filename+".json", jsonData, 0644); err != nil {
150		return err
151	}
152
153	m := minify.New()
154	m.AddFunc("application/json", mjson.Minify)
155	if minifiedData, err := m.Bytes("application/json", jsonData); err != nil {
156		return err
157	} else if err := os.WriteFile(filename+".min.json", minifiedData, 0644); err != nil {
158		return err
159	}
160	return nil
161}
162
163func saveMsgp(filename string, metadata []AppStreamData) error {
164	msgpData, err := msgpack.Marshal(metadata)
165	if err != nil {
166		return err
167	}
168	return os.WriteFile(filename+".msgp", msgpData, 0644)
169}
170
171func getCategoriesString(categories []Tag) string {
172	var categoryStrings []string
173	for _, cat := range categories {
174		if cat.Content != "" {
175			categoryStrings = append(categoryStrings, cat.Content)
176		}
177	}
178	return strings.Join(categoryStrings, ",")
179}
180
// getRichDescription picks one description out of descriptions by delegating
// to getContentByLang and returns its raw inner XML with surrounding
// whitespace trimmed, or "" when descriptions is empty.
//
// The parameter's anonymous struct type must stay identical (tags included)
// to the Descriptions field of Component and to the matching case in
// getContentByLang, otherwise the type switch there will not recognise it.
func getRichDescription(descriptions []struct {
	Lang    string `xml:"lang,attr"`
	Content string `xml:",innerxml"`
}) string {
	return getContentByLang(descriptions)
}
187
// getName picks one name out of names by delegating to getContentByLang and
// returns its text with surrounding whitespace trimmed, or "" when names is
// empty.
//
// The parameter's anonymous struct type must stay identical (tags included)
// to the Names field of Component and to the matching case in
// getContentByLang, otherwise the type switch there will not recognise it.
func getName(names []struct {
	Lang    string `xml:"lang,attr"`
	Content string `xml:",chardata"`
}) string {
	return getContentByLang(names)
}
194
// getSummary picks one summary out of summaries by delegating to
// getContentByLang and returns its text with surrounding whitespace trimmed,
// or "" when summaries is empty.
//
// The parameter's anonymous struct type must stay identical (tags included)
// to the Summaries field of Component and to the matching case in
// getContentByLang, otherwise the type switch there will not recognise it.
func getSummary(summaries []struct {
	Lang    string `xml:"lang,attr"`
	Content string `xml:",chardata"`
}) string {
	return getContentByLang(summaries)
}
201
202func getContentRating(ratings []Tag) string {
203	var ratingStrings []string
204	for _, rating := range ratings {
205		decoder := xml.NewDecoder(strings.NewReader(rating.Content))
206		for {
207			tok, err := decoder.Token()
208			if err == io.EOF {
209				break
210			}
211			if err != nil {
212				continue
213			}
214			switch se := tok.(type) {
215			case xml.StartElement:
216				if se.Name.Local == "content_attribute" {
217					var id string
218					for _, attr := range se.Attr {
219						if attr.Name.Local == "id" {
220							id = attr.Value
221							break
222						}
223					}
224					var value string
225					decoder.DecodeElement(&value, &se)
226					if id != "" && value != "" {
227						ratingStrings = append(ratingStrings, fmt.Sprintf("%s:%s", id, strings.TrimSpace(value)))
228					}
229				}
230			}
231		}
232	}
233	return strings.Join(ratingStrings, ",")
234}
235
// getContentByLang chooses one localized entry from elements and returns its
// Content with surrounding whitespace trimmed.
//
// Selection runs in three passes, each in slice order:
//  1. the first entry whose Lang is "en", "en_US" or "en_GB";
//  2. otherwise the first entry with an empty Lang;
//  3. otherwise the first element of the slice.
//
// Only the two anonymous {Lang, Content} struct types used by Component (the
// chardata and the innerxml variant) are understood. Elements of any other
// type are ignored, and "" is returned when nothing is selected.
func getContentByLang[T any](elements []T) string {
	// fields extracts the language and content from a supported element type
	// and reports false for everything else.
	fields := func(elem T) (lang, content string, ok bool) {
		switch v := any(elem).(type) {
		case struct {
			Lang    string `xml:"lang,attr"`
			Content string `xml:",chardata"`
		}:
			return v.Lang, v.Content, true
		case struct {
			Lang    string `xml:"lang,attr"`
			Content string `xml:",innerxml"`
		}:
			return v.Lang, v.Content, true
		}
		return "", "", false
	}

	// Pass 1: an explicitly English entry.
	for _, elem := range elements {
		lang, content, ok := fields(elem)
		if !ok {
			continue
		}
		switch lang {
		case "en", "en_US", "en_GB":
			return strings.TrimSpace(content)
		}
	}

	// Pass 2: an entry carrying no language attribute.
	for _, elem := range elements {
		if lang, content, ok := fields(elem); ok && lang == "" {
			return strings.TrimSpace(content)
		}
	}

	// Pass 3: whatever comes first, provided its type is supported.
	if len(elements) > 0 {
		if _, content, ok := fields(elements[0]); ok {
			return strings.TrimSpace(content)
		}
	}

	return ""
}
292
293func main() {
294	tmpDir := os.TempDir()
295	xmlFilePath := filepath.Join(tmpDir, "FLATPAK_APPSTREAM.xml")
296
297	if _, err := os.Stat(xmlFilePath); os.IsNotExist(err) {
298		url := "https://github.com/Azathothas/pkgcache/raw/refs/heads/main/FLATPAK_APPSTREAM.xml"
299		if err := downloadFile(url, xmlFilePath); err != nil {
300			fmt.Printf("Error downloading file: %v\n", err)
301			return
302		}
303	}
304
305	xmlData, err := os.ReadFile(xmlFilePath)
306	if err != nil {
307		fmt.Printf("Error reading XML file: %v\n", err)
308		return
309	}
310
311	var components Components
312	if err := xml.Unmarshal(xmlData, &components); err != nil {
313		fmt.Printf("Error parsing XML file: %v\n", err)
314		return
315	}
316
317	var metadata []AppStreamData
318	for _, component := range components.Components {
319		var icons []string
320		var screenshots []string
321
322		for _, icon := range component.Icons {
323			if icon.Type == "remote" {
324				width, err1 := strconv.Atoi(icon.Width)
325				height, err2 := strconv.Atoi(icon.Height)
326				if err1 == nil && err2 == nil && width >= 128 && height >= 128 {
327					icons = append(icons, icon.Url)
328				}
329			}
330		}
331
332		for _, screenshot := range component.Screenshots {
333			sort.Slice(screenshot.Images, func(i, j int) bool {
334				widthI, _ := strconv.Atoi(screenshot.Images[i].Width)
335				heightI, _ := strconv.Atoi(screenshot.Images[i].Height)
336				widthJ, _ := strconv.Atoi(screenshot.Images[j].Width)
337				heightJ, _ := strconv.Atoi(screenshot.Images[j].Height)
338				areaI := widthI * heightI
339				areaJ := widthJ * heightJ
340				return areaI > areaJ
341			})
342
343			for _, image := range screenshot.Images {
344				if image.Type == "source" || image.Type == "default" {
345					screenshots = append(screenshots, image.Url)
346				}
347			}
348		}
349
350		categories := getCategoriesString(component.Categories)
351		richDescription := getRichDescription(component.Descriptions)
352		name := getName(component.Names)
353		summary := getSummary(component.Summaries)
354		contentRating := getContentRating(component.ContentRating)
355		version := ""
356		if len(component.Releases.Release) > 0 {
357			version = component.Releases.Release[0].Version
358		}
359
360		metadata = append(metadata, AppStreamData{
361			AppId:           component.Id,
362			Name:            name,
363			Summary:         html2text.HTML2Text(summary),
364			ContentRating:   contentRating,
365			Icons:           icons,
366			Screenshots:     screenshots,
367			Categories:      categories,
368			RichDescription: html2text.HTML2Text(richDescription),
369			Version:         version,
370		})
371	}
372
373	if err := saveAll("appstream_metadata", metadata); err != nil {
374		fmt.Printf("Error saving metadata: %v\n", err)
375	} else {
376		fmt.Println("Metadata saved successfully.")
377	}
378}
379
380
381