chore: refactored document url parsing e6cf1b99
Steve · 2026-01-24 10:05 1 file(s) · +234 −170
packages/server/src/utils/document.ts +234 −170
5 5
6 6
// Raw document record from PDS
7 7
interface DocumentRecord {
8 -
  site?: string;
9 -
  path?: string;
10 -
  title?: string;
11 -
  description?: string;
12 -
  coverImage?: unknown;
13 -
  content?: unknown;
14 -
  textContent?: string;
15 -
  bskyPostRef?: { uri: string; cid: string };
16 -
  tags?: string[];
17 -
  publishedAt?: string;
18 -
  updatedAt?: string;
8 +
	site?: string;
9 +
	path?: string;
10 +
	title?: string;
11 +
	description?: string;
12 +
	coverImage?: unknown;
13 +
	content?: unknown;
14 +
	textContent?: string;
15 +
	bskyPostRef?: { uri: string; cid: string };
16 +
	tags?: string[];
17 +
	publishedAt?: string;
18 +
	updatedAt?: string;
19 19
}
20 20
21 21
// Raw publication record from PDS
22 22
interface PublicationRecord {
23 -
  url?: string;
24 -
  name?: string;
25 -
  description?: string;
26 -
  icon?: unknown;
23 +
	url?: string;
24 +
	name?: string;
25 +
	description?: string;
26 +
	icon?: unknown;
27 27
}
28 28
29 29
// Resolved publication data
30 30
interface ResolvedPublication {
31 -
  url: string;
32 -
  name: string;
33 -
  description: string | null;
34 -
  iconCid: string | null;
35 -
  iconUrl: string | null;
31 +
	url: string;
32 +
	name: string;
33 +
	description: string | null;
34 +
	iconCid: string | null;
35 +
	iconUrl: string | null;
36 36
}
37 37
38 38
/**
39 39
 * Fetches a publication record from an at:// URI
40 40
 */
41 41
async function fetchPublication(
42 -
  db: D1Database,
43 -
  siteUri: string
42 +
	db: D1Database,
43 +
	siteUri: string,
44 44
): Promise<ResolvedPublication | null> {
45 -
  const parsed = parseAtUri(siteUri);
46 -
  if (!parsed) return null;
45 +
	const parsed = parseAtUri(siteUri);
46 +
	if (!parsed) return null;
47 47
48 -
  try {
49 -
    const pds = await resolvePds(db, parsed.did);
50 -
    if (!pds) return null;
48 +
	try {
49 +
		const pds = await resolvePds(db, parsed.did);
50 +
		if (!pds) return null;
51 51
52 -
    const url = `${pds}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(
53 -
      parsed.did
54 -
    )}&collection=${encodeURIComponent(parsed.collection)}&rkey=${encodeURIComponent(
55 -
      parsed.rkey
56 -
    )}`;
52 +
		const url = `${pds}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(
53 +
			parsed.did,
54 +
		)}&collection=${encodeURIComponent(parsed.collection)}&rkey=${encodeURIComponent(
55 +
			parsed.rkey,
56 +
		)}`;
57 57
58 -
    const response = await fetch(url);
59 -
    if (!response.ok) return null;
58 +
		const response = await fetch(url);
59 +
		if (!response.ok) return null;
60 60
61 -
    const data = (await response.json()) as { value?: PublicationRecord };
62 -
    const pub = data.value;
63 -
    if (!pub?.url || !pub?.name) return null;
61 +
		const data = (await response.json()) as { value?: PublicationRecord };
62 +
		const pub = data.value;
63 +
		if (!pub?.url || !pub?.name) return null;
64 64
65 -
    const iconCid = extractBlobCid(pub.icon);
66 -
    const iconUrl = iconCid ? buildBlobUrl(pds, parsed.did, iconCid) : null;
65 +
		const iconCid = extractBlobCid(pub.icon);
66 +
		const iconUrl = iconCid ? buildBlobUrl(pds, parsed.did, iconCid) : null;
67 67
68 -
    return {
69 -
      url: pub.url,
70 -
      name: pub.name,
71 -
      description: pub.description || null,
72 -
      iconCid,
73 -
      iconUrl,
74 -
    };
75 -
  } catch {
76 -
    return null;
77 -
  }
68 +
		return {
69 +
			url: pub.url,
70 +
			name: pub.name,
71 +
			description: pub.description || null,
72 +
			iconCid,
73 +
			iconUrl,
74 +
		};
75 +
	} catch {
76 +
		return null;
77 +
	}
78 78
}
79 79
80 80
/**
83 83
 * If site is an https:// URL, uses it directly.
84 84
 */
85 85
export async function resolveViewUrl(
86 -
  db: D1Database,
87 -
  siteUri: string,
88 -
  path: string
86 +
	db: D1Database,
87 +
	siteUri: string,
88 +
	path: string,
89 89
): Promise<string | null> {
90 -
  // Check if site is an at:// URI or direct URL
91 -
  if (siteUri.startsWith("at://")) {
92 -
    const pub = await fetchPublication(db, siteUri);
93 -
    if (!pub?.url) return null;
94 -
    const baseUrl = pub.url.startsWith("http") ? pub.url : `https://${pub.url}`;
95 -
    return `${baseUrl.replace(/\/$/, "")}${path}`;
96 -
  }
90 +
	// Check if site is an at:// URI or direct URL
91 +
	if (siteUri.startsWith("at://")) {
92 +
		const pub = await fetchPublication(db, siteUri);
93 +
		if (!pub?.url) return null;
94 +
		const baseUrl = pub.url.startsWith("http") ? pub.url : `https://${pub.url}`;
95 +
		return new URL(path, baseUrl).toString();
96 +
	}
97 97
98 -
  // Direct URL
99 -
  const baseUrl = siteUri.startsWith("http") ? siteUri : `https://${siteUri}`;
100 -
  return `${baseUrl.replace(/\/$/, "")}${path}`;
98 +
	// Direct URL
99 +
	const baseUrl = siteUri.startsWith("http") ? siteUri : `https://${siteUri}`;
100 +
	return new URL(path, baseUrl).toString();
101 101
}
102 102
103 103
/**
105 105
 * and stores all fields in resolved_documents table.
106 106
 */
107 107
export async function processDocument(
108 -
  db: D1Database,
109 -
  did: string,
110 -
  collection: string,
111 -
  rkey: string
108 +
	db: D1Database,
109 +
	did: string,
110 +
	collection: string,
111 +
	rkey: string,
112 112
) {
113 -
  try {
114 -
    // 1. Resolve PDS
115 -
    const pds = await resolvePds(db, did);
116 -
    if (!pds) {
117 -
      console.warn(`Could not resolve PDS for ${did}`);
118 -
      return;
119 -
    }
113 +
	try {
114 +
		// 1. Resolve PDS
115 +
		const pds = await resolvePds(db, did);
116 +
		if (!pds) {
117 +
			console.warn(`Could not resolve PDS for ${did}`);
118 +
			return;
119 +
		}
120 120
121 -
    // 2. Fetch Document Record
122 -
    const url = `${pds}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(
123 -
      did
124 -
    )}&collection=${encodeURIComponent(collection)}&rkey=${encodeURIComponent(rkey)}`;
121 +
		// 2. Fetch Document Record
122 +
		const url = `${pds}/xrpc/com.atproto.repo.getRecord?repo=${encodeURIComponent(
123 +
			did,
124 +
		)}&collection=${encodeURIComponent(collection)}&rkey=${encodeURIComponent(rkey)}`;
125 125
126 -
    const response = await fetch(url);
127 -
    if (!response.ok) {
128 -
      if (response.status === 404) {
129 -
        console.warn(`Record not found: ${did}/${collection}/${rkey}`);
130 -
      }
131 -
      return;
132 -
    }
126 +
		const response = await fetch(url);
127 +
		if (!response.ok) {
128 +
			if (response.status === 404) {
129 +
				// Record was deleted from PDS - clean up our local copy
130 +
				console.warn(
131 +
					`Record not found (deleted): ${did}/${collection}/${rkey}`,
132 +
				);
133 +
				const uri = `at://${did}/${collection}/${rkey}`;
134 +
				await db
135 +
					.prepare("DELETE FROM resolved_documents WHERE uri = ?")
136 +
					.bind(uri)
137 +
					.run();
138 +
				await db
139 +
					.prepare(
140 +
						"DELETE FROM repo_records WHERE did = ? AND collection = ? AND rkey = ?",
141 +
					)
142 +
					.bind(did, collection, rkey)
143 +
					.run();
144 +
				return; // Not an error, just cleanup
145 +
			}
146 +
			// Other errors (5xx, rate limits, etc.) should be retried
147 +
			throw new Error(
148 +
				`Failed to fetch record: ${response.status} ${response.statusText}`,
149 +
			);
150 +
		}
133 151
134 -
    const data = (await response.json()) as {
135 -
      uri: string;
136 -
      cid?: string;
137 -
      value: DocumentRecord;
138 -
    };
152 +
		const data = (await response.json()) as {
153 +
			uri: string;
154 +
			cid?: string;
155 +
			value: DocumentRecord;
156 +
		};
139 157
140 -
    const { value, cid } = data;
158 +
		const { value, cid } = data;
141 159
142 -
    // 3. Update repo_records
143 -
    await db
144 -
      .prepare(
145 -
        `INSERT INTO repo_records (did, rkey, collection, cid, synced_at)
160 +
		// 3. Update repo_records
161 +
		await db
162 +
			.prepare(
163 +
				`INSERT INTO repo_records (did, rkey, collection, cid, synced_at)
146 164
         VALUES (?, ?, ?, ?, datetime('now'))
147 165
         ON CONFLICT(did, collection, rkey) DO UPDATE SET
148 166
           cid = ?,
149 -
           synced_at = datetime('now')`
150 -
      )
151 -
      .bind(did, rkey, collection, cid || null, cid || null)
152 -
      .run();
167 +
           synced_at = datetime('now')`,
168 +
			)
169 +
			.bind(did, rkey, collection, cid || null, cid || null)
170 +
			.run();
153 171
154 -
    // 4. Extract document fields
155 -
    const title = value.title || null;
156 -
    const description = value.description || null;
157 -
    const path = value.path || null;
158 -
    const site = value.site || null;
159 -
    const content = value.content ? JSON.stringify(value.content) : null;
160 -
    const textContent = value.textContent || null;
161 -
    const coverImageCid = extractBlobCid(value.coverImage);
162 -
    const coverImageUrl = coverImageCid ? buildBlobUrl(pds, did, coverImageCid) : null;
163 -
    const bskyPostRef = value.bskyPostRef ? JSON.stringify(value.bskyPostRef) : null;
164 -
    const tags = value.tags ? JSON.stringify(value.tags) : null;
165 -
    const publishedAt = value.publishedAt || null;
166 -
    const updatedAt = value.updatedAt || null;
172 +
		// 4. Extract document fields
173 +
		const title = value.title || null;
174 +
		const description = value.description || null;
175 +
		const path = value.path || null;
176 +
		const site = value.site || null;
177 +
		const content = value.content ? JSON.stringify(value.content) : null;
178 +
		const textContent = value.textContent || null;
179 +
		const coverImageCid = extractBlobCid(value.coverImage);
180 +
		const coverImageUrl = coverImageCid
181 +
			? buildBlobUrl(pds, did, coverImageCid)
182 +
			: null;
183 +
		const bskyPostRef = value.bskyPostRef
184 +
			? JSON.stringify(value.bskyPostRef)
185 +
			: null;
186 +
		const tags = value.tags ? JSON.stringify(value.tags) : null;
187 +
		const publishedAt = value.publishedAt || null;
188 +
		const updatedAt = value.updatedAt || null;
167 189
168 -
    // 5. Resolve publication if site is at:// URI
169 -
    let pubUrl: string | null = null;
170 -
    let pubName: string | null = null;
171 -
    let pubDescription: string | null = null;
172 -
    let pubIconCid: string | null = null;
173 -
    let pubIconUrl: string | null = null;
174 -
    let viewUrl: string | null = null;
190 +
		// 5. Resolve publication if site is at:// URI
191 +
		let pubUrl: string | null = null;
192 +
		let pubName: string | null = null;
193 +
		let pubDescription: string | null = null;
194 +
		let pubIconCid: string | null = null;
195 +
		let pubIconUrl: string | null = null;
196 +
		let viewUrl: string | null = null;
175 197
176 -
    if (site) {
177 -
      if (site.startsWith("at://")) {
178 -
        // Fetch publication record
179 -
        const pub = await fetchPublication(db, site);
180 -
        if (pub) {
181 -
          pubUrl = pub.url;
182 -
          pubName = pub.name;
183 -
          pubDescription = pub.description;
184 -
          pubIconCid = pub.iconCid;
185 -
          pubIconUrl = pub.iconUrl;
186 -
          // Construct view URL
187 -
          if (pubUrl && path) {
188 -
            const baseUrl = pubUrl.startsWith("http") ? pubUrl : `https://${pubUrl}`;
189 -
            viewUrl = `${baseUrl.replace(/\/$/, "")}${path}`;
190 -
          }
191 -
        }
192 -
      } else {
193 -
        // Site is a direct URL (loose document)
194 -
        pubUrl = site;
195 -
        if (path) {
196 -
          const baseUrl = site.startsWith("http") ? site : `https://${site}`;
197 -
          viewUrl = `${baseUrl.replace(/\/$/, "")}${path}`;
198 -
        }
199 -
      }
200 -
    }
198 +
		if (site) {
199 +
			if (site.startsWith("at://")) {
200 +
				// Fetch publication record
201 +
				const pub = await fetchPublication(db, site);
202 +
				if (pub) {
203 +
					pubUrl = pub.url;
204 +
					pubName = pub.name;
205 +
					pubDescription = pub.description;
206 +
					pubIconCid = pub.iconCid;
207 +
					pubIconUrl = pub.iconUrl;
208 +
					// Construct view URL
209 +
					if (pubUrl && path) {
210 +
						const baseUrl = pubUrl.startsWith("http")
211 +
							? pubUrl
212 +
							: `https://${pubUrl}`;
213 +
						viewUrl = new URL(path, baseUrl).toString();
214 +
					}
215 +
				}
216 +
			} else {
217 +
				// Site is a direct URL (loose document)
218 +
				pubUrl = site;
219 +
				if (path) {
220 +
					const baseUrl = site.startsWith("http") ? site : `https://${site}`;
221 +
					viewUrl = new URL(path, baseUrl).toString();
222 +
				}
223 +
			}
224 +
		}
201 225
202 -
    // 6. Verify the document
203 -
    const uri = `at://${did}/${collection}/${rkey}`;
204 -
    const verified = await verifyDocumentRecord(pubUrl, site, viewUrl, uri);
226 +
		// 6. Verify the document
227 +
		const uri = `at://${did}/${collection}/${rkey}`;
228 +
		const verified = await verifyDocumentRecord(pubUrl, site, viewUrl, uri);
205 229
206 -
    // 7. Insert/update resolved_documents
207 -
    const STALE_OFFSET_HOURS = 24;
230 +
		// 7. Insert/update resolved_documents
231 +
		const STALE_OFFSET_HOURS = 24;
208 232
209 -
    await db
210 -
      .prepare(
211 -
        `INSERT INTO resolved_documents (
233 +
		await db
234 +
			.prepare(
235 +
				`INSERT INTO resolved_documents (
212 236
          uri, did, rkey, title, description, path, site, content, text_content,
213 237
          cover_image_cid, cover_image_url, bsky_post_ref, tags,
214 238
          published_at, updated_at, pub_url, pub_name, pub_description,
220 244
          cover_image_cid = ?, cover_image_url = ?, bsky_post_ref = ?, tags = ?,
221 245
          published_at = ?, updated_at = ?, pub_url = ?, pub_name = ?, pub_description = ?,
222 246
          pub_icon_cid = ?, pub_icon_url = ?, view_url = ?, pds_endpoint = ?,
223 -
          resolved_at = datetime('now'), stale_at = datetime('now', '+${STALE_OFFSET_HOURS} hours'), verified = ?`
224 -
      )
225 -
      .bind(
226 -
        // INSERT values
227 -
        uri, did, rkey, title, description, path, site, content, textContent,
228 -
        coverImageCid, coverImageUrl, bskyPostRef, tags,
229 -
        publishedAt, updatedAt, pubUrl, pubName, pubDescription,
230 -
        pubIconCid, pubIconUrl, viewUrl, pds, verified ? 1 : 0,
231 -
        // UPDATE values
232 -
        title, description, path, site, content, textContent,
233 -
        coverImageCid, coverImageUrl, bskyPostRef, tags,
234 -
        publishedAt, updatedAt, pubUrl, pubName, pubDescription,
235 -
        pubIconCid, pubIconUrl, viewUrl, pds, verified ? 1 : 0
236 -
      )
237 -
      .run();
247 +
          resolved_at = datetime('now'), stale_at = datetime('now', '+${STALE_OFFSET_HOURS} hours'), verified = ?`,
248 +
			)
249 +
			.bind(
250 +
				// INSERT values
251 +
				uri,
252 +
				did,
253 +
				rkey,
254 +
				title,
255 +
				description,
256 +
				path,
257 +
				site,
258 +
				content,
259 +
				textContent,
260 +
				coverImageCid,
261 +
				coverImageUrl,
262 +
				bskyPostRef,
263 +
				tags,
264 +
				publishedAt,
265 +
				updatedAt,
266 +
				pubUrl,
267 +
				pubName,
268 +
				pubDescription,
269 +
				pubIconCid,
270 +
				pubIconUrl,
271 +
				viewUrl,
272 +
				pds,
273 +
				verified ? 1 : 0,
274 +
				// UPDATE values
275 +
				title,
276 +
				description,
277 +
				path,
278 +
				site,
279 +
				content,
280 +
				textContent,
281 +
				coverImageCid,
282 +
				coverImageUrl,
283 +
				bskyPostRef,
284 +
				tags,
285 +
				publishedAt,
286 +
				updatedAt,
287 +
				pubUrl,
288 +
				pubName,
289 +
				pubDescription,
290 +
				pubIconCid,
291 +
				pubIconUrl,
292 +
				viewUrl,
293 +
				pds,
294 +
				verified ? 1 : 0,
295 +
			)
296 +
			.run();
238 297
239 -
    console.log(`Processed document: ${uri}`);
240 -
  } catch (error) {
241 -
    console.error(`Error processing document ${did}/${collection}/${rkey}:`, error);
242 -
  }
298 +
		console.log(`Processed document: ${uri}`);
299 +
	} catch (error) {
300 +
		console.error(
301 +
			`Error processing document ${did}/${collection}/${rkey}:`,
302 +
			error,
303 +
		);
304 +
		// Re-throw so the queue handler can retry
305 +
		throw error;
306 +
	}
243 307
}