apps/feeds/feeds.go 8.9 K raw
1
package main
2
3
import (
4
	"context"
5
	"errors"
6
	"fmt"
7
	"io"
8
	"log/slog"
9
	"net/http"
10
	"net/url"
11
	"slices"
12
	"strings"
13
	"sync"
14
	"time"
15
	"unicode/utf8"
16
17
	"github.com/mmcdole/gofeed"
18
	"golang.org/x/net/html"
19
)
20
21
// ParsedEntry is a single feed item reduced to the fields fetchFeed extracts.
type ParsedEntry struct {
	// GUID falls back to Link when the item carries no GUID of its own.
	// Title may be derived from the item's HTML body when the item is untitled.
	// Author is the item author's name, or empty when the item has none.
	GUID, Title, Link, Author string
	// PublishedAt is a Unix timestamp in seconds; it stays 0 when the item
	// has neither a parsed published date nor a parsed updated date.
	PublishedAt int64
}
28
29
type FetchResult struct {
30
	Status       int
31
	ETag         string
32
	LastModified string
33
	Title        string
34
	SiteURL      string
35
	Entries      []ParsedEntry
36
}
37
38
// FeedPreviewItem is one row of the merged preview list built by previewURLs.
type FeedPreviewItem struct {
	// Title and Link are copied from the parsed entry. Author combines the
	// feed title and the entry author ("feed - author") when both are known,
	// otherwise whichever of the two is non-empty.
	Title, Link, Author string
	// Published is the entry's Unix timestamp in seconds (0 when unknown).
	Published int64
}
44
45
// appUserAgent is the User-Agent header value that newRequest sets on every
// outbound request.
const appUserAgent = "andromeda-feeds/0.1 (+https://github.com/stevedylandev/andromeda)"
46
47
// buildHTTPClient returns a fresh HTTP client whose only customisation is a
// 15-second overall timeout per request; everything else is the zero value.
func buildHTTPClient() *http.Client {
	const requestTimeout = 15 * time.Second

	client := new(http.Client)
	client.Timeout = requestTimeout
	return client
}
50
51
func newRequest(ctx context.Context, method, rawURL string) (*http.Request, error) {
52
	req, err := http.NewRequestWithContext(ctx, method, rawURL, nil)
53
	if err != nil {
54
		return nil, err
55
	}
56
	req.Header.Set("User-Agent", appUserAgent)
57
	return req, nil
58
}
59
60
func fetchFeed(ctx context.Context, feedURL, etag, lastModified string) (*FetchResult, error) {
61
	client := buildHTTPClient()
62
	req, err := newRequest(ctx, http.MethodGet, feedURL)
63
	if err != nil {
64
		return nil, err
65
	}
66
	if etag != "" {
67
		req.Header.Set("If-None-Match", etag)
68
	}
69
	if lastModified != "" {
70
		req.Header.Set("If-Modified-Since", lastModified)
71
	}
72
	resp, err := client.Do(req)
73
	if err != nil {
74
		return nil, fmt.Errorf("fetch failed: %w", err)
75
	}
76
	defer resp.Body.Close()
77
	result := &FetchResult{
78
		Status:       resp.StatusCode,
79
		ETag:         resp.Header.Get("ETag"),
80
		LastModified: resp.Header.Get("Last-Modified"),
81
	}
82
	if resp.StatusCode == http.StatusNotModified {
83
		if result.ETag == "" {
84
			result.ETag = etag
85
		}
86
		if result.LastModified == "" {
87
			result.LastModified = lastModified
88
		}
89
		return result, nil
90
	}
91
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
92
		return nil, fmt.Errorf("upstream returned %d", resp.StatusCode)
93
	}
94
	parser := gofeed.NewParser()
95
	feed, err := parser.Parse(resp.Body)
96
	if err != nil {
97
		return nil, fmt.Errorf("feed parse failed: %w", err)
98
	}
99
	result.Title = strings.TrimSpace(feed.Title)
100
	result.SiteURL = firstNonEmpty(feed.Link, firstFeedAltLink(feed))
101
	for _, item := range feed.Items {
102
		link := strings.TrimSpace(item.Link)
103
		if link == "" {
104
			continue
105
		}
106
		title := strings.TrimSpace(item.Title)
107
		if title == "" {
108
			title = deriveTitleFromHTML(firstNonEmpty(item.Description, item.Content))
109
		}
110
		author := ""
111
		if item.Author != nil {
112
			author = strings.TrimSpace(item.Author.Name)
113
		}
114
		guid := strings.TrimSpace(item.GUID)
115
		if guid == "" {
116
			guid = link
117
		}
118
		published := int64(0)
119
		switch {
120
		case item.PublishedParsed != nil:
121
			published = item.PublishedParsed.Unix()
122
		case item.UpdatedParsed != nil:
123
			published = item.UpdatedParsed.Unix()
124
		}
125
		result.Entries = append(result.Entries, ParsedEntry{
126
			GUID:        guid,
127
			Title:       title,
128
			Link:        link,
129
			Author:      author,
130
			PublishedAt: published,
131
		})
132
	}
133
	return result, nil
134
}
135
136
func deriveTitleFromHTML(src string) string {
137
	txt := strings.Join(strings.Fields(htmlToText(src)), " ")
138
	if txt == "" {
139
		return ""
140
	}
141
	const maxChars = 80
142
	if utf8.RuneCountInString(txt) <= maxChars {
143
		return txt
144
	}
145
	runes := []rune(txt)
146
	return strings.TrimSpace(string(runes[:maxChars])) + "…"
147
}
148
149
func htmlToText(src string) string {
150
	if strings.TrimSpace(src) == "" {
151
		return ""
152
	}
153
	node, err := html.Parse(strings.NewReader(src))
154
	if err != nil {
155
		return src
156
	}
157
	var b strings.Builder
158
	var walk func(*html.Node)
159
	walk = func(n *html.Node) {
160
		if n.Type == html.TextNode {
161
			b.WriteString(n.Data)
162
			b.WriteByte(' ')
163
		}
164
		for c := n.FirstChild; c != nil; c = c.NextSibling {
165
			walk(c)
166
		}
167
	}
168
	walk(node)
169
	return html.UnescapeString(b.String())
170
}
171
172
func previewURLs(ctx context.Context, urls []string, perFeed int, log *slog.Logger) []FeedPreviewItem {
173
	var wg sync.WaitGroup
174
	var mu sync.Mutex
175
	items := []FeedPreviewItem{}
176
	for _, raw := range urls {
177
		feedURL := strings.TrimSpace(raw)
178
		if feedURL == "" {
179
			continue
180
		}
181
		wg.Add(1)
182
		go func() {
183
			defer wg.Done()
184
			res, err := fetchFeed(ctx, feedURL, "", "")
185
			if err != nil {
186
				log.Warn("preview fetch failed", "url", feedURL, "err", err)
187
				return
188
			}
189
			feedTitle := res.Title
190
			local := make([]FeedPreviewItem, 0, len(res.Entries))
191
			for _, entry := range res.Entries {
192
				if perFeed > 0 && len(local) >= perFeed {
193
					break
194
				}
195
				author := feedTitle
196
				if entry.Author != "" && feedTitle != "" {
197
					author = feedTitle + " - " + entry.Author
198
				} else if entry.Author != "" {
199
					author = entry.Author
200
				}
201
				local = append(local, FeedPreviewItem{Title: entry.Title, Link: entry.Link, Author: author, Published: entry.PublishedAt})
202
			}
203
			mu.Lock()
204
			items = append(items, local...)
205
			mu.Unlock()
206
		}()
207
	}
208
	wg.Wait()
209
	slices.SortFunc(items, func(a, b FeedPreviewItem) int {
210
		switch {
211
		case a.Published > b.Published:
212
			return -1
213
		case a.Published < b.Published:
214
			return 1
215
		default:
216
			return 0
217
		}
218
	})
219
	return items
220
}
221
222
func discoverFavicon(ctx context.Context, siteURL string) string {
223
	parsed, err := url.Parse(siteURL)
224
	if err != nil {
225
		return ""
226
	}
227
	client := buildHTTPClient()
228
	req, err := newRequest(ctx, http.MethodGet, siteURL)
229
	if err != nil {
230
		return ""
231
	}
232
	resp, err := client.Do(req)
233
	if err == nil {
234
		defer resp.Body.Close()
235
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
236
		if href := findLinkHref(string(body), func(rel, typ string) bool {
237
			rel = strings.ToLower(rel)
238
			return strings.Contains(rel, "icon")
239
		}); href != "" {
240
			if resolved, err := parsed.Parse(href); err == nil {
241
				return resolved.String()
242
			}
243
		}
244
	}
245
	if fallback, err := parsed.Parse("/favicon.ico"); err == nil {
246
		return fallback.String()
247
	}
248
	return ""
249
}
250
251
func discoverFeeds(ctx context.Context, baseURL string) ([]string, error) {
252
	parsed, err := url.Parse(baseURL)
253
	if err != nil {
254
		return nil, fmt.Errorf("invalid URL: %w", err)
255
	}
256
	client := buildHTTPClient()
257
	req, err := newRequest(ctx, http.MethodGet, baseURL)
258
	if err != nil {
259
		return nil, fmt.Errorf("invalid URL: %w", err)
260
	}
261
	feeds := []string{}
262
	resp, err := client.Do(req)
263
	if err == nil {
264
		defer resp.Body.Close()
265
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
266
		links := findAlternateFeedLinks(string(body))
267
		for _, href := range links {
268
			resolved := href
269
			if u, err := parsed.Parse(href); err == nil {
270
				resolved = u.String()
271
			}
272
			if !slices.Contains(feeds, resolved) {
273
				feeds = append(feeds, resolved)
274
			}
275
		}
276
	}
277
	if len(feeds) == 0 {
278
		paths := []string{"/feed", "/feed.xml", "/rss", "/rss.xml", "/atom.xml", "/index.xml", "/feed/rss", "/blog/feed", "/blog/rss"}
279
		for _, path := range paths {
280
			probe, err := parsed.Parse(path)
281
			if err != nil {
282
				continue
283
			}
284
			req, err := newRequest(ctx, http.MethodHead, probe.String())
285
			if err != nil {
286
				continue
287
			}
288
			resp, err := client.Do(req)
289
			if err != nil {
290
				continue
291
			}
292
			_ = resp.Body.Close()
293
			ct := strings.ToLower(resp.Header.Get("Content-Type"))
294
			if resp.StatusCode >= 200 && resp.StatusCode < 300 && (strings.Contains(ct, "xml") || strings.Contains(ct, "rss") || strings.Contains(ct, "atom")) {
295
				feeds = append(feeds, probe.String())
296
			}
297
		}
298
	}
299
	if len(feeds) == 0 {
300
		return nil, errors.New("no feeds found at this URL")
301
	}
302
	return feeds, nil
303
}
304
305
func findAlternateFeedLinks(doc string) []string {
306
	node, err := html.Parse(strings.NewReader(doc))
307
	if err != nil {
308
		return nil
309
	}
310
	links := []string{}
311
	var walk func(*html.Node)
312
	walk = func(n *html.Node) {
313
		if n.Type == html.ElementNode && strings.EqualFold(n.Data, "link") {
314
			attrs := attrsMap(n)
315
			rel := strings.ToLower(attrs["rel"])
316
			typ := strings.ToLower(attrs["type"])
317
			href := attrs["href"]
318
			if strings.Contains(rel, "alternate") && href != "" && (strings.Contains(typ, "rss") || strings.Contains(typ, "atom") || strings.Contains(typ, "xml")) {
319
				links = append(links, href)
320
			}
321
		}
322
		for c := n.FirstChild; c != nil; c = c.NextSibling {
323
			walk(c)
324
		}
325
	}
326
	walk(node)
327
	return links
328
}
329
330
func findLinkHref(doc string, match func(rel, typ string) bool) string {
331
	node, err := html.Parse(strings.NewReader(doc))
332
	if err != nil {
333
		return ""
334
	}
335
	var found string
336
	var walk func(*html.Node)
337
	walk = func(n *html.Node) {
338
		if found != "" {
339
			return
340
		}
341
		if n.Type == html.ElementNode && strings.EqualFold(n.Data, "link") {
342
			attrs := attrsMap(n)
343
			if match(attrs["rel"], attrs["type"]) {
344
				found = attrs["href"]
345
				return
346
			}
347
		}
348
		for c := n.FirstChild; c != nil; c = c.NextSibling {
349
			walk(c)
350
		}
351
	}
352
	walk(node)
353
	return found
354
}
355
356
// attrsMap indexes the attributes of n by lower-cased key. When a key occurs
// more than once, the value of the last occurrence wins.
func attrsMap(n *html.Node) map[string]string {
	attrs := make(map[string]string, len(n.Attr))
	for i := range n.Attr {
		attr := &n.Attr[i]
		attrs[strings.ToLower(attr.Key)] = attr.Val
	}
	return attrs
}
363
364
// firstFeedAltLink returns the first entry of feed.Links that is not blank,
// exactly as stored (it is not trimmed), or "" when there is none.
func firstFeedAltLink(feed *gofeed.Feed) string {
	notBlank := func(link string) bool { return strings.TrimSpace(link) != "" }

	idx := slices.IndexFunc(feed.Links, notBlank)
	if idx < 0 {
		return ""
	}
	return feed.Links[idx]
}
372
373
// firstNonEmpty returns the first of values that still has content after
// surrounding whitespace is removed, in its trimmed form. It returns "" when
// every value is blank or no values are given.
func firstNonEmpty(values ...string) string {
	for _, candidate := range values {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			return trimmed
		}
	}
	return ""
}