| 1 | import { XMLParser } from "fast-xml-parser"; |
| 2 | import { COMMON_FEED_PATHS } from "./feed-discovery"; |
| 3 | |
// Options for the single XMLParser instance shared by every feed parse in this module.
const parserOptions = {
  // Keep XML attributes; they are exposed as properties prefixed with "@_"
  // (for example an Atom link's href is read as link["@_href"]).
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  // Property names under which element text and CDATA sections are stored.
  textNodeName: "#text",
  cdataPropName: "__cdata",
  // Attribute values are converted from strings to primitive values where possible.
  parseAttributeValue: true,
  // Surrounding whitespace is stripped from values.
  trimValues: true,
};

const parser = new XMLParser(parserOptions);
| 12 | |
/**
 * Named entities understood by the non-browser fallback of
 * `decodeHtmlEntities`, keyed by entity name without the surrounding
 * "&" and ";". Typographic quotes are mapped to their ASCII counterparts.
 */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["rsquo", "'"],
  ["lsquo", "'"],
  ["ldquo", '"'],
  ["rdquo", '"'],
  ["ndash", "–"],
  ["mdash", "—"],
]);

/**
 * Matches one entity reference. Capture groups:
 * 1 = decimal digits, 2 = hexadecimal digits, 3 = entity name.
 */
const HTML_ENTITY_PATTERN =
  /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/g;

/**
 * Decodes HTML entities in a string.
 *
 * Handles named entities (e.g. `&amp;`), decimal numeric entities
 * (e.g. `&#8217;`) and hexadecimal numeric entities (e.g. `&#x27;`).
 *
 * @param text - Text that may contain entity references.
 * @returns The decoded text. Falsy or non-string input is returned unchanged.
 */
function decodeHtmlEntities(text: string): string {
  if (!text || typeof text !== "string") return text;

  // In a browser, let the HTML parser do the decoding via a detached textarea.
  if (typeof document !== "undefined") {
    const textarea = document.createElement("textarea");
    textarea.innerHTML = text;
    return textarea.value;
  }

  // Fallback for non-browser environments. Everything is decoded in a single
  // pass so that the output of one replacement is never decoded a second time
  // (e.g. "&amp;lt;" becomes "&lt;", not "<").
  return text.replace(
    HTML_ENTITY_PATTERN,
    (match: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        // Unknown names are left exactly as written.
        return NAMED_HTML_ENTITIES.get(name) ?? match;
      }

      const codePoint =
        dec !== undefined
          ? Number.parseInt(dec, 10)
          : Number.parseInt(hex ?? "", 16);

      // fromCodePoint (unlike fromCharCode) handles astral characters such as
      // emoji, but throws on out-of-range values, so validate first.
      const isValidCodePoint =
        Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
      return isValidCodePoint ? String.fromCodePoint(codePoint) : match;
    },
  );
}
| 62 | |
/**
 * Result of parsing a feed document with `parseFeedXml`.
 */
export interface ParsedFeedData {
  /** Feed-level node: the `channel` element for RSS/RDF feeds, the `feed` element for Atom. */
  feedData: any;
  /** Item (RSS/RDF) or entry (Atom) nodes, always as an array with empty objects removed. */
  posts: any[];
  /** True when the document root was an Atom `feed`; false for RSS and RDF. */
  isAtom: boolean;
}
| 68 | |
/**
 * Fetches XML data from a URL with CORS fallback.
 *
 * The URL is requested directly first, then through the primary CORS proxy,
 * then through the secondary one. An attempt counts as failed when the
 * request rejects (network/CORS error) or when the server answers with a
 * non-2xx status, so an HTTP error page is never returned as feed content.
 *
 * @param url - Absolute URL of the feed.
 * @returns The response body of the first successful attempt.
 * @throws The error of the last attempt when every attempt fails.
 */
export async function fetchFeedWithFallback(url: string): Promise<string> {
  const encodedUrl = encodeURIComponent(url);
  const attempts = [
    url,
    `https://proxy.alcove.tools?url=${encodedUrl}`,
    `https://proxy2.alcove.tools?url=${encodedUrl}`,
  ];

  let lastError: unknown;
  for (const attemptUrl of attempts) {
    try {
      const response = await fetch(attemptUrl);
      if (!response.ok) {
        throw new Error(
          `Request to ${attemptUrl} failed with HTTP ${response.status}`,
        );
      }
      return await response.text();
    } catch (error) {
      // Remember the failure and move on to the next fallback.
      lastError = error;
    }
  }

  throw lastError instanceof Error
    ? lastError
    : new Error(`Failed to fetch feed from ${url}`);
}
| 93 | |
/**
 * Parses a feed document and classifies it as RSS, Atom or RDF (RSS 1.0).
 *
 * @param xmlData - Raw XML text of the feed.
 * @returns The feed-level node, its posts as an array, and whether the feed is Atom.
 * @throws Error when the XML cannot be parsed, when an RSS document has no
 *   channel element, or when the root element is not a supported feed type.
 */
export function parseFeedXml(xmlData: string): ParsedFeedData {
  // A lone item/entry may come back as a bare value rather than an array,
  // and a missing one as undefined; normalise all three shapes to an array.
  const toPostArray = (node: any): any[] => {
    if (Array.isArray(node)) return node;
    return node ? [node] : [];
  };

  let root: any;
  try {
    root = parser.parse(xmlData);
  } catch (error) {
    const reason = error instanceof Error ? error.message : "Unknown error";
    throw new Error(`XML parsing failed: ${reason}`);
  }

  const rss = root.rss;
  const atom = root.feed;
  const rdf = root["rdf:RDF"];

  let feedData: any;
  let rawPosts: any[];
  let isAtom = false;

  if (rss) {
    feedData = rss.channel;
    if (!feedData) {
      throw new Error("RSS feed missing channel element");
    }
    rawPosts = toPostArray(feedData.item);
  } else if (atom) {
    feedData = atom;
    rawPosts = toPostArray(atom.entry);
    isAtom = true;
  } else if (rdf) {
    // In RDF the items are siblings of the channel, not children of it.
    feedData = rdf.channel;
    rawPosts = toPostArray(rdf.item);
  } else {
    // Report the root elements that were found to help with debugging.
    const rootNames = Object.keys(root).join(", ");
    throw new Error(`Unsupported feed format. Found root elements: ${rootNames}`);
  }

  // Drop empty placeholders so callers only see posts with at least one field.
  const posts = rawPosts.filter(
    (entry) => entry && Object.keys(entry).length > 0,
  );

  return { feedData, posts, isAtom };
}
| 148 | |
/**
 * Discovers an RSS/Atom/RDF feed URL from a website URL.
 *
 * Each entry of COMMON_FEED_PATHS is appended to the site's origin and
 * requested through the CORS proxies (the secondary proxy is used only when
 * the request to the primary one rejects). The first OK response whose body
 * looks like a feed wins. Failures for an individual path are ignored so the
 * remaining paths are still probed.
 *
 * @param websiteUrl - Absolute URL of any page on the site.
 * @returns The discovered feed URL with its XML, or null when no path matched.
 * @throws TypeError when `websiteUrl` cannot be parsed by the URL constructor.
 */
export async function discoverFeed(websiteUrl: string): Promise<{
  feedUrl: string;
  xmlData: string;
} | null> {
  const { origin } = new URL(websiteUrl);

  for (const path of COMMON_FEED_PATHS) {
    const testUrl = `${origin}${path}`;
    const proxyQuery = `?url=${encodeURIComponent(testUrl)}`;

    try {
      let response: Response;
      try {
        // Try primary CORS proxy
        response = await fetch(`https://proxy.alcove.tools${proxyQuery}`);
      } catch {
        // Fall back to secondary CORS proxy
        response = await fetch(`https://proxy2.alcove.tools${proxyQuery}`);
      }

      if (!response.ok) continue;

      const text = await response.text();
      // Quick sniff test; "<rdf:RDF" is included because parseFeedXml accepts
      // RDF (RSS 1.0) documents as well.
      const looksLikeFeed =
        text.trim().startsWith("<?xml") ||
        text.includes("<rss") ||
        text.includes("<feed") ||
        text.includes("<rdf:RDF");

      if (looksLikeFeed) {
        return { feedUrl: testUrl, xmlData: text };
      }
    } catch {
      // Best effort: a failing candidate must not abort discovery.
      continue;
    }
  }

  return null;
}
| 194 | |
/**
 * Patterns that locate a channel ID inside a YouTube page, in order of
 * preference: embedded JSON, the itemprop meta tag, the canonical link tag.
 */
const YOUTUBE_CHANNEL_ID_PATTERNS: readonly RegExp[] = [
  /channelId":"([^"]+)"/,
  /<meta itemprop="channelId" content="([^"]+)">/,
  /<link rel="canonical" href="https:\/\/www\.youtube\.com\/channel\/([^"]+)">/,
];

/** Downloads a page through the primary CORS proxy, using the secondary one if that request rejects. */
async function fetchYouTubePageHtml(pageUrl: string): Promise<string> {
  const proxyQuery = `?url=${encodeURIComponent(pageUrl)}`;
  let response: Response;
  try {
    response = await fetch(`https://proxy.alcove.tools${proxyQuery}`);
  } catch {
    // Fall back to secondary CORS proxy
    response = await fetch(`https://proxy2.alcove.tools${proxyQuery}`);
  }
  return response.text();
}

/** Returns the first channel ID found by YOUTUBE_CHANNEL_ID_PATTERNS, or null. */
function findChannelIdInHtml(html: string): string | null {
  for (const pattern of YOUTUBE_CHANNEL_ID_PATTERNS) {
    const channelId = html.match(pattern)?.[1];
    if (channelId) return channelId;
  }
  return null;
}

/**
 * Extracts YouTube channel ID from various YouTube URL formats.
 * Supports:
 * - https://www.youtube.com/@ChannelHandle
 * - https://www.youtube.com/channel/UC...
 * - https://www.youtube.com/c/ChannelName
 * - https://www.youtube.com/user/Username
 *
 * `/channel/` URLs are resolved from the URL alone. The other formats
 * require downloading the page (through the CORS proxies) and searching its
 * HTML; all of them use the same set of patterns.
 *
 * @param url - A YouTube channel URL.
 * @returns The channel ID, or null when it cannot be determined. Errors are
 *   logged with console.error and reported as null rather than thrown.
 */
export async function extractYouTubeChannelId(
  url: string,
): Promise<string | null> {
  try {
    // Direct channel ID format: no network access needed.
    if (url.includes("/channel/")) {
      return url.match(/\/channel\/([^/?#]+)/)?.[1] ?? null;
    }

    const isHandleUrl = url.includes("/@");
    const isLegacyUrl = url.includes("/c/") || url.includes("/user/");
    if (!isHandleUrl && !isLegacyUrl) return null;

    // "/@" with nothing after it is not a usable handle.
    if (isHandleUrl && !/\/@[^/?]+/.test(url)) return null;

    try {
      const html = await fetchYouTubePageHtml(url);
      return findChannelIdInHtml(html);
    } catch (error) {
      console.error("Failed to fetch YouTube page for channel ID:", error);
      return null;
    }
  } catch (error) {
    console.error("Error extracting YouTube channel ID:", error);
    return null;
  }
}
| 292 | |
/**
 * Converts a YouTube channel URL to the URL of that channel's video feed.
 *
 * @param url - A YouTube channel URL in any format `extractYouTubeChannelId` understands.
 * @returns The feed URL, or null when no channel ID could be resolved.
 */
export async function convertYouTubeUrlToFeed(
  url: string,
): Promise<string | null> {
  const resolvedId = await extractYouTubeChannelId(url);

  return resolvedId
    ? `https://www.youtube.com/feeds/videos.xml?channel_id=${resolvedId}`
    : null;
}
| 304 | |
/**
 * Checks if a URL is a YouTube URL.
 *
 * The decision is based on the URL's host name (youtube.com, youtu.be or any
 * of their subdomains), not on a substring search, so look-alike hosts such
 * as "notyoutube.com" or a YouTube address that merely appears in a path or
 * query string are rejected. Input without a scheme (e.g.
 * "youtube.com/@handle") is accepted.
 *
 * @param url - URL or scheme-less address to test.
 * @returns True when the host belongs to YouTube; false otherwise, including
 *   for empty or unparseable input.
 */
export function isYouTubeUrl(url: string): boolean {
  if (typeof url !== "string") return false;

  const candidate = url.trim();
  if (!candidate) return false;

  // The URL constructor needs a scheme; assume https for bare host names.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(candidate);

  let hostname: string;
  try {
    hostname = new URL(hasScheme ? candidate : `https://${candidate}`).hostname
      .toLowerCase();
  } catch {
    return false;
  }

  return ["youtube.com", "youtu.be"].some(
    (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
  );
}
| 311 | |
/**
 * Patterns that capture a video ID, tried in order: watch URL, short URL,
 * embed URL. Each capture stops at the next "/", "?", "#" or "&" so that
 * trailing path segments, query parameters and fragments are not included.
 */
const YOUTUBE_VIDEO_ID_PATTERNS: readonly RegExp[] = [
  /[?&]v=([^&#]+)/,
  /youtu\.be\/([^/?#&]+)/,
  /youtube\.com\/embed\/([^/?#&]+)/,
];

/**
 * Extracts YouTube video ID from a video URL.
 * Supports:
 * - https://www.youtube.com/watch?v=VIDEO_ID
 * - https://youtu.be/VIDEO_ID
 * - https://www.youtube.com/embed/VIDEO_ID
 *
 * @param url - The video URL.
 * @returns The video ID, or null when the URL matches none of the formats
 *   (or is not a string).
 */
export function extractYouTubeVideoId(url: string): string | null {
  if (typeof url !== "string") return null;

  for (const pattern of YOUTUBE_VIDEO_ID_PATTERNS) {
    const videoId = url.match(pattern)?.[1];
    if (videoId) return videoId;
  }

  return null;
}
| 338 | |
/**
 * Tells whether a post belongs to a YouTube channel feed, judged by the
 * URL of the feed it came from.
 *
 * @param feedUrl - URL of the post's feed, or null when unknown.
 * @returns True when the URL contains the YouTube video-feed path.
 */
export function isYouTubePost(feedUrl: string | null): boolean {
  // A missing URL is treated like an empty one, which can never match.
  return (feedUrl || "").includes("youtube.com/feeds/videos.xml");
}
| 346 | |
/**
 * Extracts post link from RSS or Atom post entry.
 *
 * Atom: `link` may be a string, an element with an `@_href` attribute, or an
 * array of such elements. The element with rel="alternate" (or no rel) is
 * preferred, then the first element, then the entry `id`.
 *
 * RSS: `link`, `guid` and `id` are tried in that order; for each, the text
 * (`#text` / `__cdata`) or an `@_href` attribute is used.
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - True when the node comes from an Atom feed.
 * @returns The link as a string, or "#" when none is available.
 */
export function extractPostLink(post: any, isAtom: boolean): string {
  const FALLBACK_LINK = "#";

  const isElement = (node: unknown): node is Record<string, unknown> =>
    typeof node === "object" && node !== null;

  // Primitive -> trimmed string; nullish and objects -> "".
  const scalarText = (value: unknown): string =>
    value === null || value === undefined || typeof value === "object"
      ? ""
      : String(value).trim();

  // Text of a node that is either a primitive or an element wrapping text/CDATA.
  const nodeText = (node: unknown): string =>
    isElement(node)
      ? scalarText(node["#text"]) || scalarText(node.__cdata)
      : scalarText(node);

  const hrefOf = (node: unknown): string =>
    isElement(node) ? scalarText(node["@_href"]) : "";

  if (isAtom) {
    const links: unknown[] = Array.isArray(post.link) ? post.link : [post.link];
    // Filtering first means null/primitive array entries can no longer throw.
    const elements = links.filter(isElement);
    const alternate = elements.find(
      (link) => link["@_rel"] === "alternate" || !link["@_rel"],
    );
    const plainLink = links.find(
      (link) => !isElement(link) && scalarText(link) !== "",
    );

    return (
      hrefOf(alternate) ||
      hrefOf(elements[0]) ||
      scalarText(plainLink) ||
      nodeText(post.id) ||
      FALLBACK_LINK
    );
  }

  // RSS feed
  for (const candidate of [post.link, post.guid, post.id]) {
    const nodes: unknown[] = Array.isArray(candidate) ? candidate : [candidate];
    for (const node of nodes) {
      const value = nodeText(node) || hrefOf(node);
      if (value) return value;
    }
  }

  return FALLBACK_LINK;
}
| 380 | |
/**
 * Extracts author from RSS or Atom post entry.
 *
 * Atom entries are read from `author`; RSS items from `author`, then
 * `dc:creator`, then `creator`. An author node may be a plain value, an
 * element with a `name` child or wrapped text (`#text` / `__cdata`), or an
 * array of those when the post has several authors, in which case the names
 * are joined with ", ".
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - True when the node comes from an Atom feed.
 * @param feedTitle - Used as the author when the post names none.
 * @returns The author name(s), or `feedTitle` when none is found.
 */
export function extractPostAuthor(
  post: any,
  isAtom: boolean,
  feedTitle: string,
): string {
  // Primitive -> trimmed string; nullish and objects -> "".
  const scalarText = (value: unknown): string =>
    value === null || value === undefined || typeof value === "object"
      ? ""
      : String(value).trim();

  const describeAuthor = (node: unknown): string => {
    if (Array.isArray(node)) {
      return node
        .map(describeAuthor)
        .filter((name) => name !== "")
        .join(", ");
    }
    if (typeof node !== "object" || node === null) {
      return scalarText(node);
    }
    const element = node as Record<string, unknown>;
    // `name` may itself be wrapped text, hence the recursive call.
    return (
      describeAuthor(element.name) ||
      scalarText(element["#text"]) ||
      scalarText(element.__cdata)
    );
  };

  const candidates: unknown[] = isAtom
    ? [post.author]
    : [post.author, post["dc:creator"], post.creator];

  for (const candidate of candidates) {
    const author = describeAuthor(candidate);
    if (author) return author;
  }

  return feedTitle;
}
| 409 | |
/**
 * Extracts content from RSS or Atom post entry.
 *
 * The fields `content:encoded`, `content`, `description` and `summary` are
 * tried in that order and the first one that yields non-empty text wins. A
 * field may be a plain value or an element wrapping CDATA / text.
 *
 * When no field has content, a fallback is returned: a "View post" link to
 * `postLink` when one is given, otherwise a plain notice. The link is
 * HTML-escaped before being placed in the `href` attribute because it
 * originates from feed data.
 *
 * @param post - Parsed item/entry node.
 * @param postLink - Optional URL of the post, used only for the fallback.
 * @returns The trimmed content, or the fallback message.
 */
export function extractPostContent(post: any, postLink?: string): string {
  const escapeAttribute = (value: string): string =>
    value
      .replace(/&/g, "&amp;")
      .replace(/"/g, "&quot;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;");

  // Primitive -> trimmed string; nullish and objects -> "".
  const scalarText = (value: unknown): string =>
    value === null || value === undefined || typeof value === "object"
      ? ""
      : String(value).trim();

  const contentText = (node: unknown): string => {
    if (typeof node !== "object" || node === null) return scalarText(node);
    // Handle CDATA or nested text
    const element = node as Record<string, unknown>;
    return scalarText(element.__cdata) || scalarText(element["#text"]);
  };

  // Try various content fields in order of preference
  const candidates: unknown[] = [
    post["content:encoded"],
    post.content,
    post.description,
    post.summary,
  ];
  for (const candidate of candidates) {
    const text = contentText(candidate);
    if (text) return text;
  }

  // No content found - this is fine for link-only feeds
  return postLink
    ? `<p><a href="${escapeAttribute(String(postLink))}" target="_blank" rel="noopener noreferrer">View post</a></p>`
    : "Please open on the web";
}
| 437 | |
/**
 * Extracts published date from RSS or Atom post entry.
 *
 * Fields are tried in this order and the first one that parses to a valid
 * date wins: `pubDate` (RSS), `published` (Atom), `updated` (Atom),
 * `dc:date` (RDF / Dublin Core). `published` is preferred over `updated` so
 * that a later edit of an entry does not change its publication date. A
 * field may be a plain value or an element wrapping text / CDATA.
 *
 * @param post - Parsed item/entry node.
 * @returns The date as an ISO 8601 string; the current time when the post
 *   carries no parseable date.
 */
export function extractPostDate(post: any): string {
  const DATE_FIELDS = ["pubDate", "published", "updated", "dc:date"];

  for (const field of DATE_FIELDS) {
    const raw: unknown = post?.[field];
    const value: unknown =
      typeof raw === "object" && raw !== null
        ? (raw as Record<string, unknown>)["#text"] ??
          (raw as Record<string, unknown>).__cdata
        : raw;

    if (typeof value !== "string" && typeof value !== "number") continue;
    if (value === "") continue;

    const parsed = new Date(value);
    // Check if date is valid
    if (!Number.isNaN(parsed.getTime())) {
      return parsed.toISOString();
    }
  }

  // Use current date if no usable date was found
  return new Date().toISOString();
}
| 457 | |
/**
 * Extract string value from various data types and decode HTML entities.
 *
 * - null / undefined -> ""
 * - string -> used as is
 * - object -> the first non-empty primitive among `__cdata`, `#text` and
 *   `text`; "" when none is present
 * - number, boolean, ... -> `String(value)`; this includes `0` and `false`,
 *   which are real values and not "missing"
 *
 * @param value - Value as found in a parsed feed node.
 * @returns The extracted text with HTML entities decoded.
 */
function extractStringValue(value: any): string {
  if (value === null || value === undefined) return "";

  let strValue: string;

  if (typeof value === "string") {
    strValue = value;
  } else if (typeof value === "object") {
    // Handle objects that might contain text: try common text properties.
    const inner = [value.__cdata, value["#text"], value.text].find(
      (candidate) =>
        candidate !== null &&
        candidate !== undefined &&
        candidate !== "" &&
        typeof candidate !== "object",
    );
    // Last resort: return empty string
    if (inner === undefined) return "";
    strValue = String(inner);
  } else {
    // For numbers, booleans, etc.
    strValue = String(value);
  }

  // Decode HTML entities before returning
  return decodeHtmlEntities(strValue);
}
| 484 | |
/**
 * Safely truncate a string to a maximum length.
 *
 * The value is first converted with `extractStringValue` and trimmed. Text
 * longer than `maxLength` is cut and suffixed with "..." so that the result
 * is exactly `maxLength` UTF-16 code units long. When `maxLength` is smaller
 * than the ellipsis itself, the text is cut hard without a suffix so the
 * result never exceeds the limit. A cut never separates the two halves of a
 * surrogate pair (e.g. an emoji); the result is one unit shorter instead.
 *
 * @param str - Value to convert and truncate.
 * @param maxLength - Maximum length of the result in UTF-16 code units.
 * @returns The trimmed, possibly truncated text; "" for empty input.
 */
export function truncateString(str: any, maxLength: number): string {
  const ELLIPSIS = "...";

  const trimmed = extractStringValue(str).trim();
  if (!trimmed) return "";
  if (trimmed.length <= maxLength) return trimmed;

  const limit = Number.isFinite(maxLength)
    ? Math.max(0, Math.floor(maxLength))
    : 0;
  const useEllipsis = limit >= ELLIPSIS.length;
  let cut = useEllipsis ? limit - ELLIPSIS.length : limit;

  // A high surrogate at the end of the kept part means the cut would fall
  // inside a surrogate pair; drop the dangling half.
  if (cut > 0) {
    const lastUnit = trimmed.charCodeAt(cut - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
  }

  return trimmed.substring(0, cut) + (useEllipsis ? ELLIPSIS : "");
}
| 495 | |
/**
 * Builds the title and description stored for a feed.
 *
 * The title comes from the parsed feed data, then from the existing feed
 * record, then defaults to "Untitled Feed". The description comes from the
 * parsed `description`, then `subtitle`, then the existing record, and
 * defaults to an empty string. Both are passed through `truncateString`
 * (200 and 1000 characters respectively).
 *
 * @param feedData - Feed-level node from the parsed document (may be undefined).
 * @param feed - Optional existing feed record used as a fallback source.
 */
export function sanitizeFeedData(feedData: any, feed?: any) {
  const titleSources = [feedData?.title, feed?.title];
  const descriptionSources = [
    feedData?.description,
    feedData?.subtitle,
    feed?.description,
  ];

  // First truthy source wins; otherwise use the default.
  const rawTitle = titleSources.find(Boolean) ?? "Untitled Feed";
  const rawDescription = descriptionSources.find(Boolean) ?? "";

  const title = truncateString(rawTitle, 200);
  const description = truncateString(rawDescription, 1000);

  return { title, description };
}
| 510 | |
/**
 * Builds the title, author and link stored for a post.
 *
 * The title defaults to "Untitled"; author and link are resolved with
 * `extractPostAuthor` and `extractPostLink`. Every field is passed through
 * `truncateString` (1000, 200 and 1000 characters respectively).
 *
 * @param post - Parsed item/entry node.
 * @param isAtom - True when the node comes from an Atom feed.
 * @param feedTitle - Author fallback handed to `extractPostAuthor`.
 */
export function sanitizePostData(
  post: any,
  isAtom: boolean,
  feedTitle: string,
) {
  const title = truncateString(post.title || "Untitled", 1000);

  const rawAuthor = extractPostAuthor(post, isAtom, feedTitle);
  const author = truncateString(rawAuthor, 200);

  const rawLink = extractPostLink(post, isAtom);
  const link = truncateString(rawLink, 1000);

  return { title, author, link };
}