import { XMLParser } from "fast-xml-parser";
import { COMMON_FEED_PATHS } from "./feed-discovery";

const parser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  textNodeName: "#text",
  cdataPropName: "__cdata",
  parseAttributeValue: true,
  trimValues: true,
});

/** CORS proxy endpoints, tried in order whenever a request has to be proxied. */
const CORS_PROXY_BASES = [
  "https://proxy.alcove.tools",
  "https://proxy2.alcove.tools",
] as const;

/**
 * Named entities decoded by the non-browser fallback of `decodeHtmlEntities`.
 * Curly quotes are deliberately normalised to their ASCII equivalents.
 *
 * NOTE(review): the entity names were reconstructed from a garbled table;
 * confirm the list against the intended set.
 */
const NAMED_ENTITIES = new Map<string, string>([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["rsquo", "'"],
  ["lsquo", "'"],
  ["ldquo", '"'],
  ["rdquo", '"'],
  ["ndash", "–"],
  ["mdash", "—"],
]);

/** Highest valid Unicode code point; larger values make `String.fromCodePoint` throw. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decodes HTML entities in a string.
 * Handles named entities (`&amp;`), decimal entities (`&#8217;`) and
 * hexadecimal entities (`&#x27;`).
 *
 * @param text - Text that may contain HTML entities.
 * @returns The decoded text; falsy or non-string input is returned unchanged.
 */
function decodeHtmlEntities(text: string): string {
  if (!text || typeof text !== "string") return text;

  // In a browser, let the HTML parser do the decoding. A <textarea> treats
  // its content as text, so nothing assigned here is executed.
  if (typeof document !== "undefined") {
    const textarea = document.createElement("textarea");
    textarea.innerHTML = text;
    return textarea.value;
  }

  // Non-browser fallback. Everything is decoded in ONE pass so that the
  // output of one replacement is never decoded again ("&amp;lt;" must become
  // "&lt;", not "<").
  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/g,
    (
      match: string,
      dec: string | undefined,
      hex: string | undefined,
      name: string | undefined,
    ) => {
      if (name !== undefined) {
        // Unknown names are left untouched.
        return NAMED_ENTITIES.get(name) ?? match;
      }
      const codePoint =
        dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      if (!Number.isInteger(codePoint) || codePoint > MAX_CODE_POINT) {
        return match;
      }
      // fromCodePoint (unlike fromCharCode) handles code points above 0xFFFF.
      return String.fromCodePoint(codePoint);
    },
  );
}

export interface ParsedFeedData {
  feedData: any;
  posts: any[];
  isAtom: boolean;
}

/**
 * Requests `targetUrl` through the CORS proxies, moving on to the next proxy
 * only when the request itself fails (network error). The HTTP status is NOT
 * inspected here; callers decide what to do with a non-2xx response.
 *
 * @throws The error of the last proxy attempt when every proxy request fails.
 */
async function fetchViaProxy(targetUrl: string): Promise<Response> {
  let lastError: unknown;
  for (const base of CORS_PROXY_BASES) {
    try {
      return await fetch(`${base}?url=${encodeURIComponent(targetUrl)}`);
    } catch (error) {
      lastError = error;
    }
  }
  throw lastError;
}

/**
 * Fetches XML data from a URL with CORS fallback.
 *
 * Tries the URL directly, then each CORS proxy in turn. An attempt counts as
 * failed when the request rejects OR when the server answers with a non-2xx
 * status, so an HTTP error page is never returned as if it were feed XML.
 *
 * @param url - Feed URL to download.
 * @returns The response body as text.
 * @throws The error of the last failed attempt when every attempt fails.
 */
export async function fetchFeedWithFallback(url: string): Promise<string> {
  const attempts = [
    url,
    ...CORS_PROXY_BASES.map(
      (base) => `${base}?url=${encodeURIComponent(url)}`,
    ),
  ];

  let lastError: unknown;
  for (const attempt of attempts) {
    try {
      const response = await fetch(attempt);
      // fetch() only rejects on network failure; HTTP errors must be checked.
      if (!response.ok) {
        throw new Error(`Request failed with status ${response.status}`);
      }
      return await response.text();
    } catch (error) {
      lastError = error;
    }
  }

  throw lastError instanceof Error
    ? lastError
    : new Error(`Failed to fetch feed: ${url}`);
}

/**
 * Normalises a parsed XML child to an array. The parser yields a bare value
 * for a single element and an array for repeated elements.
 */
function toArray(value: unknown): any[] {
  if (Array.isArray(value)) return value;
  return value ? [value] : [];
}

/**
 * Parses XML data and determines if it's an RSS, Atom or RDF (RSS 1.0) feed.
 *
 * @param xmlData - Raw XML text of the feed.
 * @returns The channel/feed node, its non-empty entries, and whether the feed is Atom.
 * @throws Error when the XML cannot be parsed, when an RSS feed has no
 *   channel element, or when no supported root element is present.
 */
export function parseFeedXml(xmlData: string): ParsedFeedData {
  let parsedXmlData: any;
  try {
    parsedXmlData = parser.parse(xmlData);
  } catch (error) {
    throw new Error(
      `XML parsing failed: ${error instanceof Error ? error.message : "Unknown error"}`,
    );
  }

  let feedData: any;
  let posts: any[];
  let isAtom = false;

  if (parsedXmlData.rss) {
    // RSS 2.0: items live inside <channel>.
    feedData = parsedXmlData.rss.channel;
    if (!feedData) {
      throw new Error("RSS feed missing channel element");
    }
    posts = toArray(feedData.item);
  } else if (parsedXmlData.feed) {
    // Atom: entries are direct children of <feed>.
    feedData = parsedXmlData.feed;
    posts = toArray(feedData.entry);
    isAtom = true;
  } else if (parsedXmlData["rdf:RDF"]) {
    // RDF / RSS 1.0: items are siblings of <channel>, not children.
    feedData = parsedXmlData["rdf:RDF"].channel;
    posts = toArray(parsedXmlData["rdf:RDF"].item);
  } else {
    // Report the root elements that were found to ease debugging.
    const rootKeys = Object.keys(parsedXmlData);
    throw new Error(
      `Unsupported feed format. Found root elements: ${rootKeys.join(", ")}`,
    );
  }

  // Drop empty entries (e.g. self-closing <item/> elements).
  posts = posts.filter((post) => post && Object.keys(post).length > 0);

  return { feedData, posts, isAtom };
}

/**
 * Discovers an RSS/Atom feed URL from a website URL by probing
 * `COMMON_FEED_PATHS` on the site's origin through the CORS proxies.
 *
 * NOTE(review): the tail of this function (the content sniffing and the
 * return statements) was reconstructed from a truncated source; confirm the
 * exact markers against the intended implementation.
 *
 * @param websiteUrl - Any absolute URL on the site to probe.
 * @returns The first path that answers with feed-looking XML, or `null`.
 * @throws TypeError when `websiteUrl` is not a valid absolute URL.
 */
export async function discoverFeed(websiteUrl: string): Promise<{
  feedUrl: string;
  xmlData: string;
} | null> {
  const origin = new URL(websiteUrl).origin;

  for (const path of COMMON_FEED_PATHS) {
    const testUrl = `${origin}${path}`;
    try {
      const response = await fetchViaProxy(testUrl);
      if (!response.ok) continue;

      const text = await response.text();
      // Cheap sniff: does the body look like a feed document at all?
      const looksLikeFeed =
        text.trim().startsWith("<?xml") ||
        text.includes("<rss") ||
        text.includes("<feed") ||
        text.includes("<rdf:RDF");
      if (looksLikeFeed) {
        return { feedUrl: testUrl, xmlData: text };
      }
    } catch {
      // This candidate is unreachable; try the next path.
      continue;
    }
  }

  return null;
}

/**
 * Patterns that reveal a channel ID inside a YouTube page, in order of preference.
 *
 * NOTE(review): only the first pattern is verbatim; the <meta> and <link>
 * patterns were reconstructed from stripped markup and should be confirmed.
 */
const YOUTUBE_CHANNEL_ID_PATTERNS: readonly RegExp[] = [
  /channelId":"([^"]+)"/,
  /<meta\s+itemprop="(?:channelId|identifier)"\s+content="([^"]+)"/,
  /<link\s+rel="canonical"\s+href="https:\/\/www\.youtube\.com\/channel\/([^"/?]+)"/,
];

/**
 * Extracts a YouTube channel ID from a channel URL.
 *
 * `/channel/<id>` URLs are resolved locally. `/@handle`, `/c/<name>` and
 * `/user/<name>` URLs require downloading the page through the CORS proxies
 * and scanning it for the ID.
 *
 * @param url - YouTube channel URL.
 * @returns The channel ID, or `null` when it cannot be determined. Never throws.
 */
export async function extractYouTubeChannelId(
  url: string,
): Promise<string | null> {
  try {
    // Direct channel ID format.
    if (url.includes("/channel/")) {
      return url.match(/\/channel\/([^/?#]+)/)?.[1] ?? null;
    }

    const isHandleUrl = url.includes("/@");
    const isLegacyUrl = url.includes("/c/") || url.includes("/user/");
    if (!isHandleUrl && !isLegacyUrl) return null;

    // A handle URL without an actual handle cannot be resolved.
    if (isHandleUrl && !isLegacyUrl && !/\/@[^/?]+/.test(url)) return null;

    try {
      const response = await fetchViaProxy(url);
      const html = await response.text();
      for (const pattern of YOUTUBE_CHANNEL_ID_PATTERNS) {
        const channelId = html.match(pattern)?.[1];
        if (channelId) return channelId;
      }
    } catch (error) {
      console.error("Failed to fetch YouTube page for channel ID:", error);
    }

    return null;
  } catch (error) {
    console.error("Error extracting YouTube channel ID:", error);
    return null;
  }
}

/**
 * Converts a YouTube channel URL to its RSS feed URL.
 *
 * @param url - YouTube channel URL in any format `extractYouTubeChannelId` accepts.
 * @returns The feed URL, or `null` when the channel ID cannot be determined.
 */
export async function convertYouTubeUrlToFeed(
  url: string,
): Promise<string | null> {
  const channelId = await extractYouTubeChannelId(url);
  if (!channelId) return null;
  // The ID may have been scraped from a page, so encode it before it becomes
  // part of a URL. Well-formed channel IDs are unaffected by the encoding.
  return `https://www.youtube.com/feeds/videos.xml?channel_id=${encodeURIComponent(channelId)}`;
}

/**
 * Returns the lower-cased hostname of `url`, also accepting scheme-less input
 * such as "youtube.com/watch?v=...". Returns `null` when no hostname can be parsed.
 */
function hostnameOf(url: string): string | null {
  const trimmed = url.trim();
  for (const candidate of [trimmed, `https://${trimmed}`]) {
    try {
      const host = new URL(candidate).hostname.toLowerCase();
      if (host) return host;
    } catch {
      // Not parseable in this form; try the next one.
    }
  }
  return null;
}

/**
 * Checks if a URL is a YouTube URL.
 *
 * The hostname is compared, not the whole string, so a URL that merely
 * mentions "youtube.com" in its path or query is not a match.
 */
export function isYouTubeUrl(url: string): boolean {
  const host = hostnameOf(url);
  if (!host) return false;
  return ["youtube.com", "youtu.be"].some(
    (domain) => host === domain || host.endsWith(`.${domain}`),
  );
}

/**
 * Extracts YouTube video ID from a video URL
 * Supports:
 * - https://www.youtube.com/watch?v=VIDEO_ID
 * - https://youtu.be/VIDEO_ID
 * - https://www.youtube.com/embed/VIDEO_ID
 *
 * Trailing query parameters, fragments and path segments are not part of the
 * returned ID.
 *
 * @returns The video ID, or `null` when the URL matches none of the formats.
 */
export function extractYouTubeVideoId(url: string): string | null {
  try {
    const patterns = [
      /[?&]v=([^&#]+)/, // standard watch URL
      /youtu\.be\/([^?&#/]+)/, // short URL
      /youtube\.com\/embed\/([^?&#/]+)/, // embed URL
    ];
    for (const pattern of patterns) {
      const videoId = url.match(pattern)?.[1];
      if (videoId) return videoId;
    }
    return null;
  } catch {
    // Non-string input at runtime: treat as "no video ID".
    return null;
  }
}

/**
 * Checks if a post is from a YouTube feed
 */
export function isYouTubePost(feedUrl: string | null): boolean {
  if (!feedUrl) return false;
  return feedUrl.includes("youtube.com/feeds/videos.xml");
}

/**
 * Resolves a parsed link-like node (string, number, attribute object, or an
 * array of those) to a URL string. Returns "" when nothing usable is found.
 */
function linkToString(value: unknown): string {
  if (value === null || value === undefined) return "";
  if (typeof value === "string") return value;
  if (typeof value === "number") return String(value);
  if (Array.isArray(value)) {
    for (const item of value) {
      const text = linkToString(item);
      if (text) return text;
    }
    return "";
  }
  if (typeof value === "object") {
    const node = value as Record<string, unknown>;
    for (const key of ["@_href", "#text", "__cdata"]) {
      const text = linkToString(node[key]);
      if (text) return text;
    }
  }
  return "";
}

/**
 * Extracts post link from RSS or Atom post entry.
 *
 * Atom: prefers the link with rel="alternate" (or no rel), then the first
 * link, then the entry id. RSS: tries link, guid and id in that order.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the node comes from an Atom feed.
 * @returns The link as a string, or "#" when the entry has no usable link.
 */
export function extractPostLink(post: any, isAtom: boolean): string {
  let candidates: unknown[];

  if (isAtom) {
    // An Atom link can be a string, a single object, or an array of objects.
    const links: unknown[] = toArray(post.link);
    const alternate = links.find((link) => {
      if (typeof link !== "object" || link === null) return false;
      const rel = (link as Record<string, unknown>)["@_rel"];
      return rel === "alternate" || !rel;
    });
    candidates = [alternate, links[0], post.id];
  } else {
    candidates = [post.link, post.guid, post.id];
  }

  for (const candidate of candidates) {
    const link = linkToString(candidate);
    if (link) return link;
  }
  return "#";
}

/**
 * Extracts author from
// RSS or Atom post entry */

/**
 * Reads the text payload of a parsed XML value.
 *
 * Strings are returned as-is, numbers and booleans are stringified, arrays
 * yield their first non-empty element, and objects are probed at `keys` in order.
 *
 * @param value - A value produced by the XML parser.
 * @param keys - Object properties to probe, in order of preference.
 * @returns The text, or "" when the value carries none.
 */
function readNodeText(value: unknown, keys: readonly string[]): string {
  if (value === null || value === undefined) return "";
  if (typeof value === "string") return value;
  if (typeof value === "number" || typeof value === "boolean") {
    return String(value);
  }
  if (Array.isArray(value)) {
    for (const item of value) {
      const text = readNodeText(item, keys);
      if (text) return text;
    }
    return "";
  }
  if (typeof value === "object") {
    const node = value as Record<string, unknown>;
    for (const key of keys) {
      const text = readNodeText(node[key], keys);
      if (text) return text;
    }
  }
  return "";
}

/**
 * Extracts author from RSS or Atom post entry.
 *
 * Atom reads `author` (object with `name`, plain string, or an array when the
 * entry has several authors, in which case the first named author wins).
 * RSS tries `author`, `dc:creator` and `creator` in that order.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the node comes from an Atom feed.
 * @param feedTitle - Fallback used when the entry names no author.
 * @returns The author name, always as a string.
 */
export function extractPostAuthor(
  post: any,
  isAtom: boolean,
  feedTitle: string,
): string {
  const candidates: unknown[] = isAtom
    ? [post.author]
    : [post.author, post["dc:creator"], post.creator];

  for (const candidate of candidates) {
    const name = readNodeText(candidate, ["name", "#text", "__cdata"]);
    if (name) return name;
  }
  return feedTitle;
}

/** Escapes a string for use inside a double-quoted HTML attribute. */
function escapeHtmlAttribute(value: string): string {
  return value
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Builds the placeholder shown for entries without content: a link to the
 * post when a usable link is known, a plain message otherwise.
 *
 * NOTE(review): the exact markup of the link variant was reconstructed (only
 * its "View post" text was legible); confirm it matches the intended template.
 */
function buildFallbackContent(postLink?: string): string {
  const href = typeof postLink === "string" ? postLink.trim() : "";
  // The link comes from the feed, i.e. it is untrusted. Browsers ignore
  // whitespace and control characters inside the scheme, so strip them
  // before checking for script-like schemes.
  const scheme = href.replace(/[\u0000-\u0020]/g, "");
  if (!href || /^(?:javascript|data|vbscript):/i.test(scheme)) {
    return "Please open on the web";
  }
  return `<p><a href="${escapeHtmlAttribute(href)}" target="_blank" rel="noopener noreferrer">View post</a></p>`;
}

/**
 * Extracts content from RSS or Atom post entry.
 *
 * Tries `content:encoded`, `content`, `description` and `summary` in that
 * order and returns the first one that carries non-blank text, so an empty
 * preferred field no longer hides a populated later one.
 *
 * @param post - Parsed item/entry node.
 * @param postLink - Link to the post, used for the placeholder of link-only entries.
 * @returns The trimmed content, or a placeholder when the entry has none.
 */
export function extractPostContent(post: any, postLink?: string): string {
  const candidates: unknown[] = [
    post["content:encoded"],
    post.content,
    post.description,
    post.summary,
  ];

  for (const candidate of candidates) {
    const text = readNodeText(candidate, ["__cdata", "#text"]).trim();
    if (text.length > 0) return text;
  }

  // No content found - this is fine for link-only feeds.
  return buildFallbackContent(postLink);
}

/**
 * Extracts published date from RSS or Atom post entry.
 *
 * Tries `pubDate`, `updated`, `published`, `dc:date` (RDF / RSS 1.0) and
 * `date` in that order and uses the first value that parses to a valid date.
 *
 * @param post - Parsed item/entry node.
 * @returns An ISO 8601 timestamp; the current time when the entry has no valid date.
 */
export function extractPostDate(post: any): string {
  const candidates: unknown[] = [
    post?.pubDate,
    post?.updated,
    post?.published,
    post?.["dc:date"],
    post?.date,
  ];

  for (const candidate of candidates) {
    let parsed: Date;
    if (typeof candidate === "number" || candidate instanceof Date) {
      // Numeric timestamps and Date objects go to the Date constructor
      // unchanged instead of round-tripping through a string.
      parsed = new Date(candidate);
    } else {
      const raw = readNodeText(candidate, ["#text", "__cdata"]).trim();
      if (!raw) continue;
      parsed = new Date(raw);
    }
    if (!Number.isNaN(parsed.getTime())) return parsed.toISOString();
  }

  // Use the current date when no valid date is found.
  return new Date().toISOString();
}

/**
 * Extract string value from various data types and decode HTML entities.
 *
 * Objects are probed for `__cdata`, `#text` and `text`. Only `null`,
 * `undefined` and text-less objects yield ""; `0` and `false` are stringified
 * rather than dropped.
 */
function extractStringValue(value: any): string {
  const text = readNodeText(value, ["__cdata", "#text", "text"]);
  if (!text) return "";
  // Decode HTML entities before returning.
  return decodeHtmlEntities(text);
}

/**
 * Safely truncate a string to a maximum length.
 *
 * Longer text is cut and suffixed with "..." so that the result never
 * exceeds `maxLength`; when `maxLength` leaves no room for the ellipsis the
 * text is cut without it. A surrogate pair is never split in half.
 *
 * @param str - Value to stringify (see `extractStringValue`) and truncate.
 * @param maxLength - Maximum length of the result in UTF-16 code units.
 */
export function truncateString(str: any, maxLength: number): string {
  const text = extractStringValue(str).trim();
  if (!text || text.length <= maxLength) return text;
  if (maxLength <= 0) return "";

  const ellipsis = "...";
  const suffix = maxLength > ellipsis.length ? ellipsis : "";
  let cut = maxLength - suffix.length;

  // Do not leave a lone high surrogate at the end of the kept text.
  const lastUnit = text.charCodeAt(cut - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;

  return text.substring(0, cut) + suffix;
}

/**
 * Validate and sanitize feed data for insertion.
 *
 * @param feedData - Parsed channel/feed node.
 * @param feed - Optional existing feed record used as a fallback for missing fields.
 * @returns Title (max 200 chars) and description (max 1000 chars).
 */
export function sanitizeFeedData(
  feedData: any,
  feed?: any,
): { title: string; description: string } {
  // Title and description may be strings or parsed nodes; truncateString
  // resolves either form.
  const titleValue = feedData?.title || feed?.title || "Untitled Feed";
  const descValue =
    feedData?.description || feedData?.subtitle || feed?.description || "";

  return {
    title: truncateString(titleValue, 200),
    description: truncateString(descValue, 1000),
  };
}

/**
 * Validate and sanitize post data for insertion.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - Whether the node comes from an Atom feed.
 * @param feedTitle - Author fallback when the entry names none.
 * @returns Title (max 1000 chars), author (max 200) and link (max 1000).
 */
export function sanitizePostData(
  post: any,
  isAtom: boolean,
  feedTitle: string,
): { title: string; author: string; link: string } {
  return {
    title: truncateString(post.title || "Untitled", 1000),
    author: truncateString(extractPostAuthor(post, isAtom, feedTitle), 200),
    link: truncateString(extractPostLink(post, isAtom), 1000),
  };
}