src/lib/feed-operations.ts 14.9 K raw
1
import { XMLParser } from "fast-xml-parser";
2
import { COMMON_FEED_PATHS } from "./feed-discovery";
3
4
// Shared XML parser used by parseFeedXml. Attributes are kept and exposed
// under an "@_" prefix, element text under "#text" and CDATA sections under
// "__cdata" — the keys the extract* helpers in this file read.
const parser = new XMLParser({
	// Value handling
	trimValues: true,
	parseAttributeValue: true,
	// Attribute handling
	ignoreAttributes: false,
	attributeNamePrefix: "@_",
	// Text / CDATA property names
	textNodeName: "#text",
	cdataPropName: "__cdata",
});
12
13
/**
 * Decodes HTML entities in a string.
 *
 * Handles named entities (e.g. `&amp;`), decimal numeric entities
 * (e.g. `&#8217;`) and hexadecimal numeric entities (e.g. `&#x27;`).
 *
 * When a DOM is available the browser's own decoder is used. Otherwise a
 * single-pass fallback decodes the five XML named entities plus every numeric
 * entity. Decoding in one pass matters: replacing `&amp;` first and the other
 * entities afterwards would turn `&amp;lt;` into `<` instead of `&lt;`.
 *
 * @param text - Text that may contain HTML entities.
 * @returns The decoded text. Falsy or non-string input is returned unchanged.
 */
function decodeHtmlEntities(text: string): string {
	if (!text || typeof text !== "string") return text;

	// Let the browser decode when a DOM exists: a textarea's value is the
	// entity-decoded form of whatever is assigned to its innerHTML.
	if (typeof document !== "undefined") {
		const textarea = document.createElement("textarea");
		textarea.innerHTML = text;
		return textarea.value;
	}

	// Fallback for non-browser environments.
	const namedEntities = new Map<string, string>([
		["amp", "&"],
		["lt", "<"],
		["gt", ">"],
		["quot", '"'],
		["apos", "'"],
	]);

	return text.replace(
		/&(?:#x([0-9a-f]+)|#(\d+)|([a-z][a-z0-9]*));/gi,
		(match: string, hex?: string, dec?: string, name?: string): string => {
			if (name !== undefined) {
				// Unknown names are left untouched rather than guessed at.
				return namedEntities.get(name) ?? match;
			}

			const codePoint =
				hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);

			// fromCodePoint throws outside the Unicode range; keep such
			// references verbatim instead.
			if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return match;

			// fromCodePoint (unlike fromCharCode) handles astral characters
			// such as emoji, which need a surrogate pair.
			return String.fromCodePoint(codePoint);
		},
	);
}
62
63
/**
 * Result of parsing a feed document (the return shape of parseFeedXml).
 */
export interface ParsedFeedData {
	/** Feed-level node: the RSS/RDF `channel` element or the Atom `feed` element. */
	feedData: any;
	/** Item/entry nodes of the feed, always as an array. */
	posts: any[];
	/** True when the document root was an Atom `feed` element. */
	isAtom: boolean;
}
68
69
/**
 * Fetches feed XML from a URL, falling back to CORS proxies.
 *
 * The URL is requested directly first, then through the primary proxy and
 * finally through the secondary proxy. An attempt counts as failed when the
 * request rejects (network/CORS error) or when it resolves with a non-2xx
 * status: `fetch` does not reject on HTTP errors, and the body of an error
 * response is an error page rather than feed XML.
 *
 * @param url - Absolute URL of the feed document.
 * @returns The response body of the first successful attempt.
 * @throws The error of the last attempt when every attempt fails.
 */
export async function fetchFeedWithFallback(url: string): Promise<string> {
	const encodedUrl = encodeURIComponent(url);
	const attempts = [
		url,
		`https://proxy.alcove.tools?url=${encodedUrl}`,
		`https://proxy2.alcove.tools?url=${encodedUrl}`,
	];

	let lastError: unknown = new Error(`Failed to fetch feed: ${url}`);

	for (const attemptUrl of attempts) {
		try {
			const response = await fetch(attemptUrl);
			if (!response.ok) {
				throw new Error(
					`Request failed with status ${response.status}: ${attemptUrl}`,
				);
			}
			return await response.text();
		} catch (error) {
			// Remember why this attempt failed and move on to the next one.
			lastError = error;
		}
	}

	throw lastError;
}
93
94
/**
 * Parses feed XML and classifies it as RSS, Atom, or RDF/RSS 1.0.
 *
 * @param xmlData - Source text of the feed document.
 * @returns The feed-level node, the posts as an array with empty objects
 *   removed, and whether the document is an Atom feed.
 * @throws Error when the XML cannot be parsed, when an `rss` root has no
 *   `channel`, or when the root is none of `rss`, `feed`, `rdf:RDF`.
 */
export function parseFeedXml(xmlData: string): ParsedFeedData {
	// A single child is parsed as a bare value and repeated children as an
	// array; normalise both (and a missing child) to an array.
	const asList = (node: any): any[] => {
		const value = node || [];
		return Array.isArray(value) ? value : [value];
	};

	let root: any;
	try {
		root = parser.parse(xmlData);
	} catch (error) {
		const reason = error instanceof Error ? error.message : "Unknown error";
		throw new Error(`XML parsing failed: ${reason}`);
	}

	let channel: any;
	let entries: any[];
	let atom = false;

	if (root.rss) {
		// RSS 2.0: items live inside the channel element.
		channel = root.rss.channel;
		if (!channel) {
			throw new Error("RSS feed missing channel element");
		}
		entries = asList(channel.item);
	} else if (root.feed) {
		// Atom: the feed element is both the metadata node and the entry parent.
		channel = root.feed;
		entries = asList(channel.entry);
		atom = true;
	} else if (root["rdf:RDF"]) {
		// RDF/RSS 1.0: items are siblings of the channel element.
		const rdf = root["rdf:RDF"];
		channel = rdf.channel;
		entries = asList(rdf.item);
	} else {
		// Report the root elements that were found to ease debugging.
		const found = Object.keys(root).join(", ");
		throw new Error(`Unsupported feed format. Found root elements: ${found}`);
	}

	// Drop empty placeholders such as `{}` produced by empty elements.
	const nonEmpty = entries.filter(
		(entry) => entry && Object.keys(entry).length > 0,
	);

	return { feedData: channel, posts: nonEmpty, isAtom: atom };
}
148
149
/**
 * Discovers an RSS/Atom/RDF feed for a website by probing common feed paths.
 *
 * Each path in COMMON_FEED_PATHS is appended to the site's origin and
 * requested through the primary CORS proxy, with the secondary proxy used
 * when the primary request rejects. The first successful response whose body
 * looks like a feed wins.
 *
 * @param websiteUrl - Any absolute URL on the website; only its origin is used.
 * @returns The feed URL and its XML text, or `null` when no path yields a feed.
 * @throws TypeError when `websiteUrl` is not a valid absolute URL.
 */
export async function discoverFeed(websiteUrl: string): Promise<{
	feedUrl: string;
	xmlData: string;
} | null> {
	const { origin } = new URL(websiteUrl);

	for (const path of COMMON_FEED_PATHS) {
		const testUrl = `${origin}${path}`;
		const proxyQuery = `?url=${encodeURIComponent(testUrl)}`;

		try {
			let response: Response;
			try {
				// Try primary CORS proxy
				response = await fetch(`https://proxy.alcove.tools${proxyQuery}`);
			} catch {
				// Fall back to secondary CORS proxy
				response = await fetch(`https://proxy2.alcove.tools${proxyQuery}`);
			}

			if (!response.ok) continue;

			const text = await response.text();

			// Cheap sniff test. `<rdf:RDF` is included because parseFeedXml
			// accepts RDF/RSS 1.0 documents, which may lack an XML declaration.
			const looksLikeFeed =
				text.trim().startsWith("<?xml") ||
				text.includes("<rss") ||
				text.includes("<feed") ||
				text.includes("<rdf:RDF");

			if (looksLikeFeed) {
				return { feedUrl: testUrl, xmlData: text };
			}
		} catch {
			// Both proxies failed for this path; keep probing the remaining ones.
		}
	}

	return null;
}
194
195
/**
 * Downloads a YouTube page through the CORS proxies and returns its HTML.
 * The secondary proxy is used only when the primary request rejects.
 */
async function fetchYouTubePageHtml(pageUrl: string): Promise<string> {
	const proxyQuery = `?url=${encodeURIComponent(pageUrl)}`;
	let response: Response;
	try {
		response = await fetch(`https://proxy.alcove.tools${proxyQuery}`);
	} catch {
		// Fall back to secondary CORS proxy
		response = await fetch(`https://proxy2.alcove.tools${proxyQuery}`);
	}
	return response.text();
}

/**
 * Searches YouTube page HTML for a channel ID: first the embedded JSON
 * `channelId` field, then the `channelId` meta tag, then the canonical link.
 */
function findChannelIdInHtml(html: string): string | null {
	const patterns = [
		/channelId":"([^"]+)"/,
		/<meta itemprop="channelId" content="([^"]+)">/,
		/<link rel="canonical" href="https:\/\/www\.youtube\.com\/channel\/([^"]+)">/,
	];

	for (const pattern of patterns) {
		const id = html.match(pattern)?.[1];
		if (id) return id;
	}
	return null;
}

/**
 * Extracts YouTube channel ID from various YouTube URL formats
 * Supports:
 * - https://www.youtube.com/@ChannelHandle
 * - https://www.youtube.com/channel/UC...
 * - https://www.youtube.com/c/ChannelName
 * - https://www.youtube.com/user/Username
 *
 * `/channel/` URLs are resolved from the URL alone. The other formats require
 * downloading the page (via the CORS proxies) and searching its HTML; all
 * three extraction patterns are applied to every fetched page.
 *
 * @param url - A YouTube channel URL.
 * @returns The channel ID, or `null` when the URL format is not recognised,
 *   the page cannot be fetched, or no ID is found in it. Never rejects;
 *   failures are logged with `console.error`.
 */
export async function extractYouTubeChannelId(
	url: string,
): Promise<string | null> {
	try {
		// Direct channel ID format: the ID ends at the next path, query or
		// fragment delimiter.
		if (url.includes("/channel/")) {
			return url.match(/\/channel\/([^/?#]+)/)?.[1] ?? null;
		}

		const isHandleUrl = url.includes("/@");
		const isLegacyUrl = url.includes("/c/") || url.includes("/user/");
		if (!isHandleUrl && !isLegacyUrl) return null;

		// "/@" with nothing after it is not a usable handle; skip the fetch.
		if (isHandleUrl && !/\/@[^/?]+/.test(url)) return null;

		try {
			const html = await fetchYouTubePageHtml(url);
			return findChannelIdInHtml(html);
		} catch (error) {
			console.error("Failed to fetch YouTube page for channel ID:", error);
			return null;
		}
	} catch (error) {
		console.error("Error extracting YouTube channel ID:", error);
		return null;
	}
}
292
293
/**
 * Converts a YouTube channel URL into the URL of that channel's videos feed.
 *
 * @param url - A YouTube channel URL in any format extractYouTubeChannelId accepts.
 * @returns The feed URL, or `null` when no channel ID could be determined.
 */
export async function convertYouTubeUrlToFeed(
	url: string,
): Promise<string | null> {
	const resolvedId = await extractYouTubeChannelId(url);

	return resolvedId
		? `https://www.youtube.com/feeds/videos.xml?channel_id=${resolvedId}`
		: null;
}
304
305
/**
 * Checks if a URL is a YouTube URL.
 *
 * The decision is based on the URL's hostname (`youtube.com`, `youtu.be` or
 * any of their subdomains), not on a substring match, so look-alike hosts
 * such as `notyoutube.com` and URLs that merely mention YouTube in their path
 * or query are rejected. Scheme-less input like `youtube.com/@name` is
 * accepted.
 *
 * @param url - URL to classify.
 * @returns `true` when the URL points at a YouTube host; `false` otherwise,
 *   including for empty or unparseable input.
 */
export function isYouTubeUrl(url: string): boolean {
	if (!url || typeof url !== "string") return false;

	const trimmed = url.trim();

	// The URL parser needs a scheme; supply one for protocol-relative and
	// scheme-less input. "://" is required so "host:port/..." is not mistaken
	// for a scheme.
	let candidate = trimmed;
	if (trimmed.startsWith("//")) {
		candidate = `https:${trimmed}`;
	} else if (!/^[a-z][a-z\d+.-]*:\/\//i.test(trimmed)) {
		candidate = `https://${trimmed}`;
	}

	let hostname: string;
	try {
		hostname = new URL(candidate).hostname.toLowerCase();
	} catch {
		return false;
	}

	return ["youtube.com", "youtu.be"].some(
		(domain) => hostname === domain || hostname.endsWith(`.${domain}`),
	);
}
311
312
/**
 * Extracts YouTube video ID from a video URL
 * Supports:
 * - https://www.youtube.com/watch?v=VIDEO_ID
 * - https://youtu.be/VIDEO_ID
 * - https://www.youtube.com/embed/VIDEO_ID
 *
 * The ID ends at the next path, query or fragment delimiter, so trailing
 * parts such as `&t=30`, `?t=5`, `#t=30` or `/` are never included.
 *
 * @param url - Video URL.
 * @returns The video ID, or `null` when the URL matches none of the formats.
 */
export function extractYouTubeVideoId(url: string): string | null {
	if (!url || typeof url !== "string") return null;

	const patterns = [
		// Standard watch URL
		/[?&]v=([^&#]+)/,
		// Short URL format
		/youtu\.be\/([^/?&#]+)/,
		// Embed URL format
		/youtube\.com\/embed\/([^/?&#]+)/,
	];

	for (const pattern of patterns) {
		const id = url.match(pattern)?.[1];
		if (id) return id;
	}

	return null;
}
338
339
/**
 * Tells whether a post belongs to a YouTube feed, judged by its feed URL.
 *
 * @param feedUrl - URL of the feed the post came from, if known.
 * @returns `true` when the feed URL contains the YouTube videos-feed path.
 */
export function isYouTubePost(feedUrl: string | null): boolean {
	return feedUrl ? feedUrl.includes("youtube.com/feeds/videos.xml") : false;
}
346
347
/**
 * Resolves a parsed link/guid/id node to a string: plain values are
 * stringified, objects yield their text, CDATA or `href` attribute, and
 * arrays yield the first element that resolves. Returns "" when nothing
 * usable is found.
 */
function linkNodeToString(node: unknown): string {
	if (!node) return "";

	if (Array.isArray(node)) {
		for (const item of node) {
			const resolved = linkNodeToString(item);
			if (resolved) return resolved;
		}
		return "";
	}

	if (typeof node === "object") {
		const record = node as Record<string, unknown>;
		const value = record["#text"] || record.__cdata || record["@_href"];
		// A nested object would stringify to "[object Object]"; treat as empty.
		return value && typeof value !== "object" ? String(value) : "";
	}

	return String(node);
}

/**
 * Extracts post link from RSS or Atom post entry.
 *
 * Atom: `link` may be a string, an object with an `href` attribute, or an
 * array of such objects (the `alternate` or rel-less link is preferred);
 * `id` is the fallback.
 *
 * RSS: `link`, `guid` and `id` are tried in that order and the first one
 * that yields a non-empty value is used, so an empty or attribute-only
 * `<link>` no longer hides a usable `<guid>`.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the post comes from an Atom feed.
 * @returns The link as a string, or "#" when none can be determined.
 */
export function extractPostLink(post: any, isAtom: boolean): string {
	if (isAtom) {
		const link = post.link;
		const idFallback = linkNodeToString(post.id) || "#";

		if (typeof link === "string") {
			return link || idFallback;
		}

		if (Array.isArray(link)) {
			// Find 'alternate' link or use first link
			const alternate = link.find(
				(l: any) => l?.["@_rel"] === "alternate" || !l?.["@_rel"],
			);
			const href = alternate?.["@_href"] || link[0]?.["@_href"];
			return href ? String(href) : idFallback;
		}

		if (link && typeof link === "object") {
			const href = link["@_href"];
			return href ? String(href) : idFallback;
		}

		return idFallback;
	}

	// RSS feed
	for (const candidate of [post.link, post.guid, post.id]) {
		const resolved = linkNodeToString(candidate);
		if (resolved) return resolved;
	}

	return "#";
}
380
381
/**
 * Resolves a parsed author node to a display string: plain values are
 * stringified, objects yield their `name`, text or CDATA (recursively, since
 * `name` may itself be a CDATA wrapper), and arrays are joined with ", ".
 * Returns "" when nothing usable is found.
 */
function authorNodeToString(node: unknown): string {
	if (!node) return "";

	if (Array.isArray(node)) {
		return node
			.map((item) => authorNodeToString(item))
			.filter((name) => name.length > 0)
			.join(", ");
	}

	if (typeof node === "object") {
		const record = node as Record<string, unknown>;
		return (
			authorNodeToString(record.name) ||
			authorNodeToString(record["#text"]) ||
			authorNodeToString(record.__cdata)
		);
	}

	return String(node);
}

/**
 * Extracts author from RSS or Atom post entry.
 *
 * Atom entries use `author`; RSS items try `author`, `dc:creator` and
 * `creator` in that order, using the first one that yields a non-empty name.
 * Multiple authors are joined with ", ".
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the post comes from an Atom feed.
 * @param feedTitle - Fallback used when the post names no author.
 * @returns The author as a string, or `feedTitle` when none is found.
 */
export function extractPostAuthor(
	post: any,
	isAtom: boolean,
	feedTitle: string,
): string {
	const candidates = isAtom
		? [post.author]
		: [post.author, post["dc:creator"], post.creator];

	for (const candidate of candidates) {
		const name = authorNodeToString(candidate);
		if (name) return name;
	}

	return feedTitle;
}
409
410
/**
 * Resolves a parsed content node to trimmed text: plain values are
 * stringified, objects yield their CDATA or text, and arrays yield the first
 * element that resolves. Returns "" when nothing usable is found.
 */
function contentNodeToText(node: unknown): string {
	if (node === null || node === undefined || node === false) return "";

	if (Array.isArray(node)) {
		for (const item of node) {
			const text = contentNodeToText(item);
			if (text) return text;
		}
		return "";
	}

	if (typeof node === "object") {
		const record = node as Record<string, unknown>;
		const inner = record.__cdata || record["#text"] || "";
		// A nested object would stringify to "[object Object]"; treat as empty.
		return typeof inner === "object" ? "" : String(inner).trim();
	}

	return String(node).trim();
}

/** Escapes a value for use inside a double-quoted HTML attribute. */
function escapeHtmlAttribute(value: string): string {
	return value
		.replace(/&/g, "&amp;")
		.replace(/"/g, "&quot;")
		.replace(/</g, "&lt;")
		.replace(/>/g, "&gt;");
}

/**
 * Extracts content from RSS or Atom post entry.
 *
 * `content:encoded`, `content`, `description` and `summary` are tried in
 * that order; the first one that yields non-empty text wins, so an empty or
 * attribute-only node does not hide a later field.
 *
 * When the post has no content (fine for link-only feeds) a fallback is
 * returned: a "View post" anchor when `postLink` is given, otherwise a plain
 * message. The link comes from the feed, so it is escaped before being
 * placed in the `href` attribute.
 * NOTE(review): the URL scheme is not validated here — confirm that the
 * rendering layer sanitises `javascript:` links.
 *
 * @param post - Parsed item/entry node.
 * @param postLink - Optional link to the post, used for the fallback.
 * @returns The trimmed content (typically HTML) or the fallback.
 */
export function extractPostContent(post: any, postLink?: string): string {
	// Try various content fields in order of preference
	const candidates = [
		post["content:encoded"],
		post.content,
		post.description,
		post.summary,
	];

	for (const candidate of candidates) {
		const text = contentNodeToText(candidate);
		if (text) return text;
	}

	if (!postLink) return "Please open on the web";

	const href = escapeHtmlAttribute(String(postLink));
	return `<p><a href="${href}" target="_blank" rel="noopener noreferrer">View post</a></p>`;
}
437
438
/**
 * Normalizes date strings to handle problematic formats like "24:00:00"
 * which causes errors in WebKit browsers.
 *
 * "24:00:00" denotes midnight at the end of the given day, so it is rewritten
 * to "00:00:00" and the result moved forward by exactly 24 hours. The shift
 * is applied to the timestamp rather than with local-time `setDate`, so the
 * result does not change when the runtime's time zone has a daylight-saving
 * transition on that day.
 *
 * @param dateString - Date string from a feed.
 * @returns An ISO 8601 string for the following midnight when "24:00:00" was
 *   present and the rewritten string parses; the rewritten string when it
 *   does not parse; the input unchanged in all other cases.
 */
function normalizeDateString(dateString: string): string {
	if (!dateString || typeof dateString !== "string") return dateString;
	if (!dateString.includes("24:00:00")) return dateString;

	const MS_PER_DAY = 24 * 60 * 60 * 1000;

	// Only the first occurrence is rewritten.
	const rewritten = dateString.replace("24:00:00", "00:00:00");

	const startOfDay = new Date(rewritten).getTime();
	if (Number.isNaN(startOfDay)) return rewritten;

	const nextMidnight = new Date(startOfDay + MS_PER_DAY);
	// Guard the edge of the representable range, where toISOString would throw.
	if (Number.isNaN(nextMidnight.getTime())) return rewritten;

	return nextMidnight.toISOString();
}
466
467
/**
 * Extracts published date from RSS or Atom post entry.
 *
 * Atom entries try `published`, `updated`, then `dc:date`. Other feeds try
 * `pubDate`, `published`, `updated`, then `dc:date` (the date element used by
 * RDF/RSS 1.0 items). Values wrapped in an object (CDATA or an element with
 * attributes) are unwrapped before parsing. The first candidate that parses
 * to a valid date is used.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the post comes from an Atom feed.
 * @returns The date as an ISO 8601 string; the current time when the post has
 *   no parseable date.
 */
export function extractPostDate(post: any, isAtom?: boolean): string {
	// Unwraps `{ __cdata }` / `{ "#text" }` nodes; "" means "no usable value".
	const toText = (node: unknown): string => {
		if (!node) return "";
		if (typeof node === "object") {
			const record = node as Record<string, unknown>;
			const inner = record.__cdata || record["#text"];
			return inner && typeof inner !== "object" ? String(inner) : "";
		}
		return String(node);
	};

	try {
		const candidates = isAtom
			? // For Atom feeds, prioritize published date over updated date
				[post.published, post.updated, post["dc:date"]]
			: // For RSS feeds, use pubDate first, then fall back to other fields
				[post.pubDate, post.published, post.updated, post["dc:date"]];

		for (const candidate of candidates) {
			const text = toText(candidate);
			if (!text) continue;

			// Normalize the date string to handle problematic formats
			const parsed = new Date(normalizeDateString(text));
			if (!Number.isNaN(parsed.getTime())) {
				return parsed.toISOString();
			}
		}
	} catch {
		// Fall through to the current-time fallback below.
	}

	// Use current date if no valid date was found
	return new Date().toISOString();
}
499
500
/**
 * Extract string value from various data types and decode HTML entities.
 *
 * - `null`, `undefined` and `NaN` become "".
 * - Strings are used as-is; other primitives are stringified, so a value the
 *   XML parser converted to `0` or `false` keeps its text.
 * - Objects yield the first non-empty primitive among `__cdata`, `#text` and
 *   `text`.
 * - Arrays (repeated elements) yield the first element that produces text.
 *
 * @param value - Parsed XML value of any shape.
 * @returns The entity-decoded text, or "" when no text can be extracted.
 */
function extractStringValue(value: any): string {
	if (value === null || value === undefined) return "";

	let text: string;

	if (typeof value === "string") {
		text = value;
	} else if (Array.isArray(value)) {
		for (const item of value) {
			const extracted = extractStringValue(item);
			if (extracted) return extracted;
		}
		return "";
	} else if (typeof value === "object") {
		// Try common text properties, skipping empty and nested-object values.
		const inner = [value.__cdata, value["#text"], value.text].find(
			(candidate) =>
				candidate !== undefined &&
				candidate !== null &&
				candidate !== "" &&
				typeof candidate !== "object",
		);
		if (inner === undefined) return "";
		text = String(inner);
	} else {
		// For numbers, booleans, etc.
		if (typeof value === "number" && Number.isNaN(value)) return "";
		text = String(value);
	}

	if (!text) return "";

	// Decode HTML entities before returning
	return decodeHtmlEntities(text);
}
526
527
/**
 * Safely truncate a string to a maximum length.
 *
 * The value is first converted with extractStringValue and trimmed. Text
 * longer than `maxLength` is cut and suffixed with "..." so that the result
 * is at most `maxLength` UTF-16 code units. When `maxLength` is too small to
 * hold the ellipsis the text is cut without it, and a cut never separates
 * the two halves of a surrogate pair (e.g. an emoji).
 *
 * @param str - Value of any shape accepted by extractStringValue.
 * @param maxLength - Maximum length of the result in UTF-16 code units.
 * @returns The trimmed, possibly truncated text; "" when there is no text or
 *   `maxLength` is not positive.
 */
export function truncateString(str: any, maxLength: number): string {
	const ELLIPSIS = "...";

	const extracted = extractStringValue(str);
	if (!extracted) return "";

	const trimmed = extracted.trim();
	if (maxLength <= 0) return "";
	if (trimmed.length <= maxLength) return trimmed;

	// Cuts at `end`, backing off by one unit if that would leave a lone high
	// surrogate at the end of the result.
	const cut = (end: number): string => {
		const lastUnit = trimmed.charCodeAt(end - 1);
		const splitsPair = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
		return trimmed.slice(0, splitsPair ? end - 1 : end);
	};

	if (maxLength <= ELLIPSIS.length) return cut(maxLength);

	return cut(maxLength - ELLIPSIS.length) + ELLIPSIS;
}
537
538
/**
 * Builds the sanitized title and description of a feed for insertion.
 *
 * The title comes from the parsed feed node, then from the optional `feed`
 * record, then defaults to "Untitled Feed". The description comes from the
 * parsed node's `description` or `subtitle`, then from `feed`, then "".
 * Both values go through truncateString (200 and 1000 characters).
 *
 * @param feedData - Feed-level node from the parsed document.
 * @param feed - Optional secondary source of title/description.
 * @returns An object with the truncated `title` and `description`.
 */
export function sanitizeFeedData(feedData: any, feed?: any) {
	const rawTitle = feedData?.title || feed?.title || "Untitled Feed";
	const rawDescription =
		feedData?.description || feedData?.subtitle || feed?.description || "";

	const title = truncateString(rawTitle, 200);
	const description = truncateString(rawDescription, 1000);

	return { title, description };
}
552
553
/**
 * Builds the sanitized title, author and link of a post for insertion.
 *
 * The title defaults to "Untitled"; author and link are resolved with
 * extractPostAuthor and extractPostLink. Every value goes through
 * truncateString (1000, 200 and 1000 characters respectively).
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the post comes from an Atom feed.
 * @param feedTitle - Fallback author name.
 * @returns An object with the truncated `title`, `author` and `link`.
 */
export function sanitizePostData(
	post: any,
	isAtom: boolean,
	feedTitle: string,
) {
	const title = truncateString(post.title || "Untitled", 1000);

	const rawAuthor = extractPostAuthor(post, isAtom, feedTitle);
	const author = truncateString(rawAuthor, 200);

	const rawLink = extractPostLink(post, isAtom);
	const link = truncateString(rawLink, 1000);

	return { title, author, link };
}