chore: enable auto discovery and url normalization
a4268d5c
3 file(s) · +147 −1
| 218 | 218 | "feedparser-rs", |
|
| 219 | 219 | "open", |
|
| 220 | 220 | "ratatui", |
|
| 221 | + | "ureq", |
|
| 221 | 222 | ] |
|
| 222 | 223 | ||
| 223 | 224 | [[package]] |
|
| 2143 | 2144 | checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" |
|
| 2144 | 2145 | dependencies = [ |
|
| 2145 | 2146 | "aws-lc-rs", |
|
| 2147 | + | "log", |
|
| 2146 | 2148 | "once_cell", |
|
| 2149 | + | "ring", |
|
| 2147 | 2150 | "rustls-pki-types", |
|
| 2148 | 2151 | "rustls-webpki", |
|
| 2149 | 2152 | "subtle", |
|
| 2895 | 2898 | checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" |
|
| 2896 | 2899 | ||
| 2897 | 2900 | [[package]] |
|
| 2901 | + | name = "ureq" |
|
| 2902 | + | version = "3.3.0" |
|
| 2903 | + | source = "registry+https://github.com/rust-lang/crates.io-index" |
|
| 2904 | + | checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" |
|
| 2905 | + | dependencies = [ |
|
| 2906 | + | "base64", |
|
| 2907 | + | "flate2", |
|
| 2908 | + | "log", |
|
| 2909 | + | "percent-encoding", |
|
| 2910 | + | "rustls", |
|
| 2911 | + | "rustls-pki-types", |
|
| 2912 | + | "ureq-proto", |
|
| 2913 | + | "utf8-zero", |
|
| 2914 | + | "webpki-roots", |
|
| 2915 | + | ] |
|
| 2916 | + | ||
| 2917 | + | [[package]] |
|
| 2918 | + | name = "ureq-proto" |
|
| 2919 | + | version = "0.6.0" |
|
| 2920 | + | source = "registry+https://github.com/rust-lang/crates.io-index" |
|
| 2921 | + | checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" |
|
| 2922 | + | dependencies = [ |
|
| 2923 | + | "base64", |
|
| 2924 | + | "http", |
|
| 2925 | + | "httparse", |
|
| 2926 | + | "log", |
|
| 2927 | + | ] |
|
| 2928 | + | ||
| 2929 | + | [[package]] |
|
| 2898 | 2930 | name = "url" |
|
| 2899 | 2931 | version = "2.5.8" |
|
| 2900 | 2932 | source = "registry+https://github.com/rust-lang/crates.io-index" |
|
| 2917 | 2949 | version = "0.1.8" |
|
| 2918 | 2950 | source = "registry+https://github.com/rust-lang/crates.io-index" |
|
| 2919 | 2951 | checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091" |
|
| 2952 | + | ||
| 2953 | + | [[package]] |
|
| 2954 | + | name = "utf8-zero" |
|
| 2955 | + | version = "0.8.1" |
|
| 2956 | + | source = "registry+https://github.com/rust-lang/crates.io-index" |
|
| 2957 | + | checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" |
|
| 2920 | 2958 | ||
| 2921 | 2959 | [[package]] |
|
| 2922 | 2960 | name = "utf8_iter" |
|
| 3132 | 3170 | version = "1.0.7" |
|
| 3133 | 3171 | source = "registry+https://github.com/rust-lang/crates.io-index" |
|
| 3134 | 3172 | checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" |
|
| 3173 | + | dependencies = [ |
|
| 3174 | + | "rustls-pki-types", |
|
| 3175 | + | ] |
|
| 3176 | + | ||
| 3177 | + | [[package]] |
|
| 3178 | + | name = "webpki-roots" |
|
| 3179 | + | version = "1.0.7" |
|
| 3180 | + | source = "registry+https://github.com/rust-lang/crates.io-index" |
|
| 3181 | + | checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" |
|
| 3135 | 3182 | dependencies = [ |
|
| 3136 | 3183 | "rustls-pki-types", |
|
| 3137 | 3184 | ] |
|
| 8 | 8 | ||
| 9 | 9 | [dependencies] |
|
| 10 | 10 | color-eyre = "0.6.3" |
|
| 11 | + | ureq = "3" |
|
| 11 | 12 | crossterm = "0.29.0" |
|
| 12 | 13 | feedparser-rs = "0.5.3" |
|
| 13 | 14 | open = "5" |
| 9 | 9 | widgets::{Block, List, ListItem, ListState, Padding}, |
|
| 10 | 10 | }; |
|
| 11 | 11 | ||
/// Turns user input into an absolute HTTP(S) URL.
///
/// Surrounding whitespace is ignored. If the input already starts with an
/// `http://` or `https://` scheme (compared case-insensitively, since URL
/// schemes are not case-sensitive) the scheme is rewritten in lowercase and
/// the rest of the input is kept verbatim. Anything else is assumed to be a
/// host (optionally with a path) and gets `https://` prepended.
fn normalize_url(s: &str) -> String {
    let s = s.trim();
    for scheme in ["https://", "http://"] {
        // `get` returns `None` when the input is shorter than the scheme or
        // the cut would land inside a multi-byte character.
        if let Some(head) = s.get(..scheme.len()) {
            if head.eq_ignore_ascii_case(scheme) {
                // Emit the canonical lowercase scheme so later prefix checks
                // on the result can rely on it.
                return format!("{scheme}{}", &s[scheme.len()..]);
            }
        }
    }
    format!("https://{s}")
}
|
| 19 | + | ||
/// Reports whether `url` names only a host, with no meaningful path.
///
/// A leading `https://` or `http://` is ignored. Everything from the first
/// `/` onwards counts as the path, and a path made up solely of slashes is
/// treated as empty.
fn is_bare_domain(url: &str) -> bool {
    let after_scheme = match url.strip_prefix("https://") {
        Some(tail) => tail,
        None => url.strip_prefix("http://").unwrap_or(url),
    };
    match after_scheme.split_once('/') {
        // Only slashes (or nothing) after the host: still a bare domain.
        Some((_, path)) => path.chars().all(|c| c == '/'),
        None => true,
    }
}
|
| 28 | + | ||
/// Scans `html` for the first `<link>` tag advertising an RSS or Atom feed
/// and returns its `href`, resolved against `base_url`.
///
/// A tag qualifies when it contains `application/rss+xml` or
/// `application/atom+xml` (case-insensitively). Tags without a usable `href`
/// are skipped. Returns `None` when no such tag exists or when a `<link` is
/// never closed by `>`.
fn find_feed_link(html: &str, base_url: &str) -> Option<String> {
    let base = base_url.trim_end_matches('/');
    // ASCII-only lowercasing keeps every byte offset identical to `html`, so
    // positions found in `lower` are safe to use for slicing `html`. Full
    // Unicode lowercasing can change byte lengths and break that mapping.
    let lower = html.to_ascii_lowercase();
    let mut pos = 0;
    while let Some(found) = lower[pos..].find("<link") {
        let start = pos + found;
        let end = start + lower[start..].find('>')?;
        pos = end + 1;

        let tag_lower = &lower[start..=end];
        let is_feed = tag_lower.contains("application/rss+xml")
            || tag_lower.contains("application/atom+xml");
        if !is_feed {
            continue;
        }
        let Some(raw) = extract_attr(&html[start..=end], "href") else {
            continue;
        };
        // Attribute values are HTML-escaped; `&amp;` is the escape that
        // commonly shows up in feed URLs with query strings.
        let href = raw.trim().replace("&amp;", "&");
        if href.is_empty() {
            continue;
        }
        return Some(resolve_href(base, &href));
    }
    None
}

/// Resolves `href` against `base` (a URL without trailing slashes).
///
/// Handles absolute `http(s)://` URLs, protocol-relative `//host/...`
/// references (which inherit the scheme of `base`), root-relative `/path`
/// references (resolved against the origin of `base`) and plain relative
/// references (appended to `base`).
fn resolve_href(base: &str, href: &str) -> String {
    let has_prefix = |prefix: &str| {
        href.get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
    };
    if has_prefix("http://") || has_prefix("https://") {
        href.to_string()
    } else if let Some(rest) = href.strip_prefix("//") {
        let scheme = base.split_once("://").map_or("https", |(scheme, _)| scheme);
        format!("{scheme}://{rest}")
    } else if href.starts_with('/') {
        // The origin is the scheme plus authority: everything before the
        // first path, query or fragment delimiter after `://`.
        let authority = base.find("://").map_or(0, |i| i + 3);
        let origin_end = base[authority..]
            .find(&['/', '?', '#'][..])
            .map_or(base.len(), |i| authority + i);
        format!("{}{href}", &base[..origin_end])
    } else {
        format!("{base}/{href}")
    }
}

/// Extracts the value of attribute `attr` from a single HTML `tag`.
///
/// The attribute name is matched case-insensitively and must start the tag
/// text or follow whitespace, so `href` does not match `data-href`. Both
/// quoted (`"` or `'`) and unquoted values are supported. Returns `None` if
/// the attribute is absent, a quoted value is unterminated, or an unquoted
/// value is empty.
fn extract_attr(tag: &str, attr: &str) -> Option<String> {
    let needle = format!("{}=", attr.to_ascii_lowercase());
    // Same-length lowercasing: offsets in `lower` are valid in `tag`.
    let lower = tag.to_ascii_lowercase();
    let mut from = 0;
    while let Some(found) = lower[from..].find(&needle) {
        let at = from + found;
        from = at + needle.len();
        let on_boundary = at == 0 || lower[..at].ends_with(|c: char| c.is_ascii_whitespace());
        if !on_boundary {
            // Matched the tail of a longer attribute name; keep looking.
            continue;
        }
        let rest = &tag[from..];
        return match rest.chars().next() {
            Some(quote @ ('"' | '\'')) => {
                let body = &rest[1..];
                body.find(quote).map(|end| body[..end].to_string())
            }
            _ => {
                // Unquoted value: runs until whitespace or the end of the tag.
                let end = rest
                    .find(|c: char| c.is_ascii_whitespace() || c == '>')
                    .unwrap_or(rest.len());
                (end > 0).then(|| rest[..end].to_string())
            }
        };
    }
    None
}
|
| 72 | + | ||
| 73 | + | fn discover_feed(input: &str) -> color_eyre::Result<String> { |
|
| 74 | + | let url = normalize_url(input); |
|
| 75 | + | if !is_bare_domain(&url) { |
|
| 76 | + | return Ok(url); |
|
| 77 | + | } |
|
| 78 | + | let html = ureq::get(&url).call()?.body_mut().read_to_string()?; |
|
| 79 | + | if let Some(feed_url) = find_feed_link(&html, &url) { |
|
| 80 | + | return Ok(feed_url); |
|
| 81 | + | } |
|
| 82 | + | let base = url.trim_end_matches('/'); |
|
| 83 | + | const PATHS: &[&str] = &[ |
|
| 84 | + | "/feed.xml", |
|
| 85 | + | "/rss.xml", |
|
| 86 | + | "/atom.xml", |
|
| 87 | + | "/feed", |
|
| 88 | + | "/rss", |
|
| 89 | + | "/index.xml", |
|
| 90 | + | "/feeds/posts/default", |
|
| 91 | + | "/blog/feed.xml", |
|
| 92 | + | "/blog/rss.xml", |
|
| 93 | + | ]; |
|
| 94 | + | for path in PATHS { |
|
| 95 | + | let candidate = format!("{base}{path}"); |
|
| 96 | + | if ureq::get(&candidate) |
|
| 97 | + | .call() |
|
| 98 | + | .map(|r| r.status() == 200) |
|
| 99 | + | .unwrap_or(false) |
|
| 100 | + | { |
|
| 101 | + | return Ok(candidate); |
|
| 102 | + | } |
|
| 103 | + | } |
|
| 104 | + | Err(color_eyre::eyre::eyre!("No feed found for: {input}")) |
|
| 105 | + | } |
|
| 106 | + | ||
| 12 | 107 | fn main() -> color_eyre::Result<()> { |
|
| 13 | 108 | color_eyre::install()?; |
|
| 14 | 109 | let urls: Vec<String> = std::env::args().skip(1).collect(); |
|
| 19 | 114 | } |
|
| 20 | 115 | let feeds: Vec<ParsedFeed> = urls |
|
| 21 | 116 | .iter() |
|
| 22 | - | .map(|url| parse_url(url, None, None, None)) |
|
| 117 | + | .map(|url| -> color_eyre::Result<ParsedFeed> { |
|
| 118 | + | let resolved = discover_feed(url)?; |
|
| 119 | + | Ok(parse_url(&resolved, None, None, None)?) |
|
| 120 | + | }) |
|
| 23 | 121 | .collect::<Result<_, _>>()?; |
|
| 24 | 122 | ||
| 25 | 123 | let mut entries: Vec<(&Entry, Option<&str>)> = feeds |
|