// src/lib/feed-operations.ts
1
import { XMLParser } from "fast-xml-parser";
2
import { COMMON_FEED_PATHS } from "./feed-discovery";
3
4
// Options for the module-wide XML parser. The key names chosen here
// ("@_" attribute prefix, "#text", "__cdata") are the ones the extraction
// helpers in this file look up on parsed nodes.
const parserOptions = {
	// Attribute handling
	ignoreAttributes: false,
	parseAttributeValue: true,
	attributeNamePrefix: "@_",
	// Text / CDATA handling
	trimValues: true,
	textNodeName: "#text",
	cdataPropName: "__cdata",
};

const parser = new XMLParser(parserOptions);
12
13
// Named entities understood by the non-browser fallback of decodeHtmlEntities.
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
	["amp", "&"],
	["lt", "<"],
	["gt", ">"],
	["quot", '"'],
	["apos", "'"],
	["nbsp", "\u00A0"],
]);

/**
 * Decodes HTML entities in a string.
 *
 * Handles named entities (e.g. `&amp;`), decimal numeric references
 * (e.g. `&#8217;`) and hexadecimal numeric references (e.g. `&#x27;`).
 *
 * When a DOM is available the browser's own decoder is used. Otherwise a
 * fallback decodes every reference in a single pass, so text that was escaped
 * twice (`&amp;lt;`) is only unescaped once (`&lt;`).
 *
 * @param text - Text that may contain HTML entities.
 * @returns The decoded text. Falsy or non-string input is returned unchanged.
 */
function decodeHtmlEntities(text: string): string {
	if (!text || typeof text !== "string") return text;

	// Use the browser's built-in HTML decoding when it exists. A <textarea>
	// treats its content as text, so no markup from `text` is instantiated.
	if (typeof document !== "undefined") {
		const textarea = document.createElement("textarea");
		textarea.innerHTML = text;
		return textarea.value;
	}

	// Fallback for non-browser environments: one regex pass over all
	// references. Replacing entity-by-entity would decode "&amp;" first and
	// then decode the result again.
	return text.replace(
		/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g,
		(match: string, body: string) => {
			if (!body.startsWith("#")) {
				// Unknown named entities are left exactly as written.
				return NAMED_HTML_ENTITIES.get(body) ?? match;
			}

			const isHex = body[1] === "x" || body[1] === "X";
			const codePoint = isHex
				? parseInt(body.slice(2), 16)
				: parseInt(body.slice(1), 10);

			// String.fromCodePoint throws above U+10FFFF; keep such input as-is.
			if (!Number.isFinite(codePoint) || codePoint > 0x10ffff) return match;

			// fromCodePoint (not fromCharCode) so astral characters such as
			// emoji are not truncated to 16 bits.
			return String.fromCodePoint(codePoint);
		},
	);
}
62
63
/**
 * Shape of the value returned by `parseFeedXml`.
 */
export interface ParsedFeedData {
	/** True when the document's root element was an Atom `feed`. */
	isAtom: boolean;
	/** The parsed channel (RSS / RDF) or feed (Atom) node. */
	feedData: any;
	/** The feed's items or entries, always as an array. */
	posts: any[];
}
68
69
/**
 * Fetches XML data from a URL with CORS fallback.
 *
 * The URL is requested directly first, then through the primary CORS proxy,
 * then through the secondary one. An attempt counts as failed when the
 * request rejects OR when the server answers with a non-2xx status, so an
 * error page is never returned as if it were feed XML.
 *
 * @param url - The feed URL to download.
 * @returns The response body of the first successful attempt.
 * @throws The error of the last attempt when every attempt fails.
 */
export async function fetchFeedWithFallback(url: string): Promise<string> {
	const encodedUrl = encodeURIComponent(url);
	const attempts = [
		url,
		`https://proxy.alcove.tools?url=${encodedUrl}`,
		`https://proxy2.alcove.tools?url=${encodedUrl}`,
	];

	let lastError: unknown;
	for (const attemptUrl of attempts) {
		try {
			const response = await fetch(attemptUrl);
			if (!response.ok) {
				throw new Error(
					`Request to ${attemptUrl} failed with status ${response.status}`,
				);
			}
			return await response.text();
		} catch (error) {
			// Remember the failure and move on to the next source.
			lastError = error;
		}
	}

	throw lastError instanceof Error
		? lastError
		: new Error(`Failed to fetch feed from ${url}`);
}
93
94
/**
 * Parses a feed document and classifies it as RSS, Atom or RDF (RSS 1.0).
 *
 * @param xmlData - Raw XML text of the feed.
 * @returns The channel/feed node, its non-empty posts, and an Atom flag.
 * @throws Error when the XML cannot be parsed, when an RSS document has no
 *   channel element, or when no supported root element is present.
 */
export function parseFeedXml(xmlData: string): ParsedFeedData {
	// The parser yields a bare node for a single child and an array for
	// several; normalise both (and a missing child) to an array.
	const asList = (value: any): any[] => {
		if (!value) return [];
		return Array.isArray(value) ? value : [value];
	};

	let root: any;
	try {
		root = parser.parse(xmlData);
	} catch (error) {
		throw new Error(
			`XML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`,
		);
	}

	let feedData: any;
	let candidates: any[];
	let isAtom = false;

	if (root.rss) {
		// RSS 2.0: items live inside the channel.
		const channel = root.rss.channel;
		if (!channel) {
			throw new Error("RSS feed missing channel element");
		}
		feedData = channel;
		candidates = asList(channel.item);
	} else if (root.feed) {
		// Atom: entries live directly under the feed element.
		feedData = root.feed;
		candidates = asList(feedData.entry);
		isAtom = true;
	} else if (root["rdf:RDF"]) {
		// RDF / RSS 1.0: items are siblings of the channel.
		const rdf = root["rdf:RDF"];
		feedData = rdf.channel;
		candidates = asList(rdf.item);
	} else {
		// Report which root elements were found to ease debugging.
		throw new Error(
			`Unsupported feed format. Found root elements: ${Object.keys(root).join(", ")}`,
		);
	}

	// Drop empty objects that the parser produces for empty elements.
	const posts = candidates.filter(
		(entry) => entry && Object.keys(entry).length > 0,
	);

	return { feedData, posts, isAtom };
}
148
149
/**
 * Quick textual check that a response body looks like a feed document:
 * an XML declaration, or an RSS / Atom / RDF root element.
 */
function looksLikeFeedDocument(text: string): boolean {
	return (
		text.trim().startsWith("<?xml") ||
		text.includes("<rss") ||
		text.includes("<feed") ||
		// RSS 1.0 documents use an rdf:RDF root, which parseFeedXml accepts.
		text.includes("<rdf:RDF")
	);
}

/**
 * Discovers an RSS/Atom/RDF feed URL from a website URL.
 *
 * Each entry of COMMON_FEED_PATHS is appended to the site's origin and
 * requested through the CORS proxy (secondary proxy when the primary request
 * rejects). Paths are probed in order, one at a time, and the first response
 * that is OK and looks like a feed wins.
 *
 * @param websiteUrl - Any absolute URL on the site to inspect.
 * @returns The feed URL and its XML text, or null when no path matched.
 * @throws TypeError when `websiteUrl` is not a valid absolute URL.
 */
export async function discoverFeed(websiteUrl: string): Promise<{
	feedUrl: string;
	xmlData: string;
} | null> {
	const origin = new URL(websiteUrl).origin;

	for (const path of COMMON_FEED_PATHS) {
		const testUrl = `${origin}${path}`;
		const proxyQuery = `?url=${encodeURIComponent(testUrl)}`;

		try {
			let response: Response;
			try {
				// Try primary CORS proxy
				response = await fetch(`https://proxy.alcove.tools${proxyQuery}`);
			} catch {
				// Fall back to secondary CORS proxy
				response = await fetch(`https://proxy2.alcove.tools${proxyQuery}`);
			}

			if (!response.ok) continue;

			const text = await response.text();
			if (looksLikeFeedDocument(text)) {
				return { feedUrl: testUrl, xmlData: text };
			}
		} catch {
			// Both proxies failed (or the body was unreadable): try the next path.
		}
	}

	return null;
}
194
195
// Substrings that suggest a URL points straight at a feed document.
const FEED_URL_MARKERS = [
	"/feed",
	"/rss",
	"/atom",
	".xml",
	".rss",
	".atom",
	".rdf",
];

/**
 * Checks if a URL looks like a direct feed URL.
 *
 * This is a heuristic substring test. It is case-insensitive, so `/RSS` or
 * `Feed.XML` are recognised as well as their lowercase forms.
 *
 * @param url - The URL to inspect.
 * @returns True when the URL contains one of the known feed markers.
 */
export function looksLikeFeedUrl(url: string): boolean {
	const normalized = url.toLowerCase();
	return FEED_URL_MARKERS.some((marker) => normalized.includes(marker));
}
206
207
/**
 * Downloads a YouTube page through the CORS proxy, using the secondary proxy
 * when the request to the primary one rejects.
 */
async function fetchYouTubePageHtml(pageUrl: string): Promise<string> {
	const proxyQuery = `?url=${encodeURIComponent(pageUrl)}`;
	let response: Response;
	try {
		response = await fetch(`https://proxy.alcove.tools${proxyQuery}`);
	} catch {
		// Fall back to secondary CORS proxy
		response = await fetch(`https://proxy2.alcove.tools${proxyQuery}`);
	}
	return response.text();
}

/**
 * Searches a YouTube page's HTML for the channel ID: first in embedded JSON,
 * then in the channelId meta tag, then in the canonical link tag.
 */
function findYouTubeChannelIdInHtml(html: string): string | null {
	const patterns = [
		/channelId":"([^"]+)"/,
		/<meta itemprop="channelId" content="([^"]+)">/,
		/<link rel="canonical" href="https:\/\/www\.youtube\.com\/channel\/([^"]+)">/,
	];
	for (const pattern of patterns) {
		const match = html.match(pattern);
		if (match) return match[1];
	}
	return null;
}

/**
 * Extracts YouTube channel ID from various YouTube URL formats.
 * Supports:
 * - https://www.youtube.com/@ChannelHandle
 * - https://www.youtube.com/channel/UC...
 * - https://www.youtube.com/c/ChannelName
 * - https://www.youtube.com/user/Username
 *
 * `/channel/` URLs are resolved from the URL alone (path, query and fragment
 * after the ID are ignored). The other formats require downloading the page
 * and scanning its HTML.
 *
 * @param url - A YouTube channel URL.
 * @returns The channel ID, or null when it cannot be determined. Failures
 *   are logged and reported as null; this function does not throw.
 */
export async function extractYouTubeChannelId(
	url: string,
): Promise<string | null> {
	try {
		// Direct channel ID format
		if (url.includes("/channel/")) {
			const match = url.match(/\/channel\/([^/?#]+)/);
			return match ? match[1] : null;
		}

		// A handle URL must actually carry a handle after "/@".
		const isHandleUrl = url.includes("/@");
		if (isHandleUrl && !/\/@[^/?#]+/.test(url)) return null;

		// Handle, /c/ and /user/ formats all need the page itself.
		if (isHandleUrl || url.includes("/c/") || url.includes("/user/")) {
			try {
				const html = await fetchYouTubePageHtml(url);
				return findYouTubeChannelIdInHtml(html);
			} catch (error) {
				console.error("Failed to fetch YouTube page for channel ID:", error);
				return null;
			}
		}

		return null;
	} catch (error) {
		console.error("Error extracting YouTube channel ID:", error);
		return null;
	}
}
304
305
/**
 * Converts YouTube channel URL to RSS feed URL.
 *
 * @param url - A YouTube channel URL in any format that
 *   `extractYouTubeChannelId` accepts.
 * @returns The channel's `videos.xml` feed URL, or null when no channel ID
 *   could be resolved.
 */
export async function convertYouTubeUrlToFeed(
	url: string,
): Promise<string | null> {
	const channelId = await extractYouTubeChannelId(url);
	if (!channelId) return null;

	// The ID may have been scraped from page HTML, so encode it before
	// placing it in the query string.
	return `https://www.youtube.com/feeds/videos.xml?channel_id=${encodeURIComponent(channelId)}`;
}
316
317
/**
 * Checks if a URL is a YouTube URL.
 *
 * The decision is made on the URL's hostname (`youtube.com`, `youtu.be` and
 * their subdomains), not on a substring of the whole URL, so hosts such as
 * `notyoutube.com` or a `?ref=youtube.com` query do not match. URLs written
 * without a scheme (`youtube.com/@handle`) are accepted.
 *
 * @param url - The URL to inspect.
 * @returns True when the URL's host belongs to YouTube.
 */
export function isYouTubeUrl(url: string): boolean {
	const candidate = url.trim();
	// Give scheme-less input a scheme so the URL parser can read its host.
	const absolute = /^[a-z][a-z0-9+.-]*:\/\//i.test(candidate)
		? candidate
		: `https://${candidate}`;

	let hostname: string;
	try {
		hostname = new URL(absolute).hostname.toLowerCase();
	} catch {
		return false;
	}

	return ["youtube.com", "youtu.be"].some(
		(domain) => hostname === domain || hostname.endsWith(`.${domain}`),
	);
}
323
324
// Patterns tried in order; each captures the video ID up to the next URL
// delimiter so trailing paths, extra parameters and #fragments are excluded.
const YOUTUBE_VIDEO_ID_PATTERNS = [
	// Standard watch URL
	/[?&]v=([^&#]+)/,
	// Short URL format
	/youtu\.be\/([^/?&#]+)/,
	// Embed URL format
	/youtube\.com\/embed\/([^/?&#]+)/,
];

/**
 * Extracts YouTube video ID from a video URL.
 * Supports:
 * - https://www.youtube.com/watch?v=VIDEO_ID
 * - https://youtu.be/VIDEO_ID
 * - https://www.youtube.com/embed/VIDEO_ID
 *
 * @param url - The video URL.
 * @returns The video ID, or null when the URL matches none of the formats
 *   (or is not a string).
 */
export function extractYouTubeVideoId(url: string): string | null {
	try {
		for (const pattern of YOUTUBE_VIDEO_ID_PATTERNS) {
			const match = url.match(pattern);
			if (match) return match[1];
		}
		return null;
	} catch {
		// Non-string input has no match(); treat it as "no ID".
		return null;
	}
}
350
351
/**
 * Tells whether a post belongs to a YouTube feed, judged by its feed URL.
 *
 * @param feedUrl - URL of the feed the post came from, or null.
 * @returns True when the URL contains YouTube's `videos.xml` feed path.
 */
export function isYouTubePost(feedUrl: string | null): boolean {
	return feedUrl ? feedUrl.includes("youtube.com/feeds/videos.xml") : false;
}
358
359
/**
 * Reduces a parsed link-like value (string, number, text/CDATA node, node
 * with an href attribute, or an array of those) to a plain string.
 * Returns "" when nothing usable is found.
 */
function postLinkValueToString(value: any): string {
	if (!value) return "";
	if (Array.isArray(value)) {
		for (const item of value) {
			const text = postLinkValueToString(item);
			if (text) return text;
		}
		return "";
	}
	if (typeof value === "object") {
		return (
			postLinkValueToString(value["#text"]) ||
			postLinkValueToString(value.__cdata) ||
			postLinkValueToString(value["@_href"])
		);
	}
	return String(value).trim();
}

/**
 * Picks the href of an Atom entry's link, which can be a string, a single
 * link node, or an array of link nodes. Prefers rel="alternate" (or a link
 * without rel), then any link that has an href.
 */
function atomPostLinkHref(link: any): string {
	if (typeof link === "string") return link;

	const nodes = (Array.isArray(link) ? link : [link]).filter(
		(node: any) => node && typeof node === "object",
	);
	const preferred = nodes.find(
		(node: any) => node["@_rel"] === "alternate" || !node["@_rel"],
	);
	const withHref = nodes.find((node: any) => node["@_href"]);

	const href = preferred?.["@_href"] || withHref?.["@_href"];
	return href ? String(href) : "";
}

/**
 * Extracts post link from RSS or Atom post entry.
 *
 * Atom: the entry's link href, else its id. RSS: the first of link, guid and
 * id that yields text. The result is always a string; "#" is returned when
 * the entry offers no usable link.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the entry comes from an Atom feed.
 * @returns The post's link, or "#".
 */
export function extractPostLink(post: any, isAtom: boolean): string {
	if (isAtom) {
		return atomPostLinkHref(post.link) || postLinkValueToString(post.id) || "#";
	}

	// RSS feed: a candidate that is present but carries no text (for example
	// an empty link element) must not hide a later usable one.
	for (const candidate of [post.link, post.guid, post.id]) {
		const text = postLinkValueToString(candidate);
		if (text) return text;
	}
	return "#";
}
392
393
/**
 * Reduces a parsed author value to a display string. Accepts a plain value,
 * a node with a `name` child, a text/CDATA node, or an array of those
 * (several authors are joined with ", "). Returns "" when nothing is found.
 */
function postAuthorToString(value: any): string {
	if (!value) return "";
	if (Array.isArray(value)) {
		return value
			.map((entry: any) => postAuthorToString(entry))
			.filter((name: string) => name.length > 0)
			.join(", ");
	}
	if (typeof value === "object") {
		return (
			postAuthorToString(value.name) ||
			postAuthorToString(value["#text"]) ||
			postAuthorToString(value.__cdata)
		);
	}
	return String(value).trim();
}

/**
 * Extracts author from RSS or Atom post entry.
 *
 * Atom entries are read from `author`; RSS items from `author`, then
 * `dc:creator`, then `creator`. The first candidate that yields text wins.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the entry comes from an Atom feed.
 * @param feedTitle - Fallback used when the entry names no author.
 * @returns The author name(s) as a string, or `feedTitle`.
 */
export function extractPostAuthor(
	post: any,
	isAtom: boolean,
	feedTitle: string,
): string {
	const candidates = isAtom
		? [post.author]
		: [post.author, post["dc:creator"], post.creator];

	for (const candidate of candidates) {
		const name = postAuthorToString(candidate);
		if (name) return name;
	}
	return feedTitle;
}
421
422
/**
 * Reduces a parsed content value (string, number, boolean, text/CDATA node,
 * or an array of those) to trimmed text. Returns "" when there is no text.
 */
function postContentToString(value: any): string {
	if (value === null || value === undefined) return "";
	if (Array.isArray(value)) {
		for (const part of value) {
			const text = postContentToString(part);
			if (text) return text;
		}
		return "";
	}
	if (typeof value === "object") {
		// Handle CDATA or nested text
		return (
			postContentToString(value.__cdata) || postContentToString(value["#text"])
		);
	}
	return String(value).trim();
}

/** Escapes a value for use inside a double-quoted HTML attribute. */
function escapePostLinkAttribute(value: string): string {
	return value
		.replace(/&/g, "&amp;")
		.replace(/"/g, "&quot;")
		.replace(/</g, "&lt;")
		.replace(/>/g, "&gt;");
}

/**
 * Extracts content from RSS or Atom post entry.
 *
 * Fields are tried in order of preference: `content:encoded`, `content`,
 * `description`, `summary`. The first one that yields non-empty text wins.
 * When none does, a "View post" link (if `postLink` is given) or a plain
 * message is returned instead.
 *
 * @param post - Parsed item/entry node.
 * @param postLink - Optional link used to build the fallback markup. It is
 *   HTML-escaped, because it originates from the feed.
 * @returns The post's content, or the fallback message.
 */
export function extractPostContent(post: any, postLink?: string): string {
	const fields = ["content:encoded", "content", "description", "summary"];
	for (const field of fields) {
		const text = postContentToString(post[field]);
		if (text) return text;
	}

	// No content found - this is fine for link-only feeds
	return postLink
		? `<p><a href="${escapePostLinkAttribute(postLink)}" target="_blank" rel="noopener noreferrer">View post</a></p>`
		: "Please open on the web";
}
449
450
/**
 * Extracts published date from RSS or Atom post entry.
 *
 * The fields `pubDate`, `updated`, `published` and `dc:date` (used by
 * RDF / RSS 1.0 items) are tried in that order. A field wrapped in a
 * text/CDATA node is unwrapped first. The first value that parses to a valid
 * date wins.
 *
 * @param post - Parsed item/entry node.
 * @returns The date as an ISO-8601 string. The current time is used when the
 *   entry has no parseable date; this function does not throw.
 */
export function extractPostDate(post: any): string {
	try {
		for (const field of ["pubDate", "updated", "published", "dc:date"]) {
			let raw = post[field];
			if (raw && typeof raw === "object" && !(raw instanceof Date)) {
				raw = raw.__cdata || raw["#text"];
			}
			if (!raw) continue;

			const parsed = new Date(typeof raw === "string" ? raw.trim() : raw);
			// An invalid Date has a NaN timestamp; skip it and try the next field.
			if (!isNaN(parsed.getTime())) {
				return parsed.toISOString();
			}
		}
	} catch {
		// Fall through to the current date below.
	}

	// Use current date if no date found
	return new Date().toISOString();
}
469
470
/**
 * Extract string value from various data types and decode HTML entities.
 *
 * Accepted inputs:
 * - strings, returned decoded;
 * - numbers and booleans (including `0` and `false`), stringified;
 * - objects, read from their `__cdata`, `#text` or `text` property;
 * - arrays, reduced to their first element that yields text.
 *
 * @param value - Any parsed value.
 * @returns The decoded text, or "" when the value carries no text.
 */
function extractStringValue(value: any): string {
	// Collects the raw text first so entities are decoded exactly once.
	const toRawText = (input: any): string => {
		if (input === null || input === undefined) return "";
		if (typeof input === "string") return input;
		if (Array.isArray(input)) {
			for (const item of input) {
				const text = toRawText(item);
				if (text) return text;
			}
			return "";
		}
		if (typeof input === "object") {
			// Try common text properties
			for (const key of ["__cdata", "#text", "text"]) {
				const text = toRawText(input[key]);
				if (text) return text;
			}
			// Last resort: return empty string
			return "";
		}
		// For numbers, booleans, etc.
		return String(input);
	};

	const strValue = toRawText(value);
	if (!strValue) return "";

	// Decode HTML entities before returning
	return decodeHtmlEntities(strValue);
}
496
497
/**
 * Safely truncate a string to a maximum length.
 *
 * The value is first converted with `extractStringValue` and trimmed. Text
 * longer than `maxLength` is cut and given a "..." suffix so that the result
 * is at most `maxLength` UTF-16 code units. When `maxLength` is below 3 the
 * suffix would not fit, so the text is cut without it. A cut never splits a
 * surrogate pair.
 *
 * @param str - Any value that can be turned into text.
 * @param maxLength - Maximum length of the returned string.
 * @returns The trimmed, possibly truncated text ("" for empty input).
 */
export function truncateString(str: any, maxLength: number): string {
	const strValue = extractStringValue(str);
	if (!strValue) return "";

	const trimmed = strValue.trim();
	if (trimmed.length <= maxLength) return trimmed;

	// Cuts at `end`, backing off one unit if that would leave a lone high
	// surrogate (the first half of a character such as an emoji).
	const cutAt = (end: number): string => {
		let safeEnd = Math.max(0, end);
		if (safeEnd > 0) {
			const lastUnit = trimmed.charCodeAt(safeEnd - 1);
			if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) safeEnd -= 1;
		}
		return trimmed.substring(0, safeEnd);
	};

	if (maxLength < 3) return cutAt(maxLength);
	return cutAt(maxLength - 3) + "...";
}
507
508
/**
 * Validate and sanitize feed data for insertion.
 *
 * Title candidates are `feedData.title`, then `feed.title`, then the literal
 * "Untitled Feed". Description candidates are `feedData.description`,
 * `feedData.subtitle`, then `feed.description`. A candidate is used only if
 * it still has text after extraction and truncation, so a present but
 * textless node does not suppress the next candidate.
 *
 * @param feedData - Parsed channel/feed node (may be undefined).
 * @param feed - Optional existing feed record used as a fallback.
 * @returns `title` (max 200 chars) and `description` (max 1000 chars).
 */
export function sanitizeFeedData(feedData: any, feed?: any) {
	// Returns the first candidate that produces non-empty text.
	const firstText = (candidates: any[], maxLength: number): string => {
		for (const candidate of candidates) {
			const text = truncateString(candidate, maxLength);
			if (text) return text;
		}
		return "";
	};

	return {
		title: firstText([feedData?.title, feed?.title], 200) || "Untitled Feed",
		description: firstText(
			[feedData?.description, feedData?.subtitle, feed?.description],
			1000,
		),
	};
}
522
523
/**
 * Validate and sanitize post data for insertion.
 *
 * Each field falls back to a default when it has no text after extraction
 * and truncation: "Untitled" for the title, the feed title for the author,
 * and "#" for the link.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the entry comes from an Atom feed.
 * @param feedTitle - Title of the owning feed, used as the author fallback.
 * @returns `title` (max 1000 chars), `author` (max 200) and `link` (max 1000).
 */
export function sanitizePostData(
	post: any,
	isAtom: boolean,
	feedTitle: string,
) {
	const title = truncateString(post.title, 1000) || "Untitled";
	const author =
		truncateString(extractPostAuthor(post, isAtom, feedTitle), 200) ||
		truncateString(feedTitle, 200);
	const link = truncateString(extractPostLink(post, isAtom), 1000) || "#";

	return { title, author, link };
}