feat: block any requests that are disallowed by robots txt

Author: dusk, 2024-11-26 21:24:27 +03:00
Parent: c9765abf6c
Commit: a67eb0a376
Signed by: dusk (SSH key fingerprint SHA256:Abmvag+juovVufZTxyWY8KcVgrznxvBjQpJesv071Aw)
6 changed files with 65 additions and 38 deletions

bun.lockb (binary lockfile updated, contents not shown)

@@ -47,6 +47,7 @@
     "nanoid": "^5.0.9",
     "rehype-autolink-headings": "^7.1.0",
     "rehype-slug": "^6.0.0",
+    "robots-parser": "^3.0.1",
     "steamgriddb": "^2.2.0",
     "typescript-svelte-plugin": "^0.3.43"
   },

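The only dependency added is robots-parser, which does the actual robots.txt matching used further down. As a rough illustration of the API this commit relies on (not part of the commit; the URL and user agents below are placeholders):

import robotsParser from 'robots-parser'

const robots = robotsParser(
    'https://example.com/robots.txt',   // placeholder origin
    'User-agent: GPTBot\nDisallow: /'   // placeholder rules
)

robots.isAllowed('https://example.com/page/', 'GPTBot')   // false: matching group disallows everything
robots.isAllowed('https://example.com/page/', 'Mozilla')  // true: no matching group, so nothing is disallowed
robots.isAllowed('https://other.site/page/', 'GPTBot')    // undefined: URL is outside this robots.txt's origin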

@@ -15,7 +15,7 @@ export const lastFmGetNowPlaying: () => Promise<LastTrack | null> = async () =>
     try {
         var resp = await (await fetch(GET_RECENT_TRACKS_ENDPOINT)).json()
         var track = resp.recenttracks.track[0] ?? null
-        if (!(track['@attr'].nowplaying ?? null)) {
+        if (!((track['@attr'] ?? {}).nowplaying ?? null)) {
            throw "no nowplaying track found"
         }
         var data = {

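The last.fm change is a defensive fix: when nothing is currently playing, the most recent track has no '@attr' object at all, so track['@attr'].nowplaying threw a TypeError instead of the intended "no nowplaying track found". A hypothetical equivalent of the new guard written with optional chaining (not part of the commit):

// same guard, expressed with optional chaining
if (!track?.['@attr']?.nowplaying) {
    throw "no nowplaying track found"
}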
src/lib/robots.ts (new file, 53 lines)

@@ -0,0 +1,53 @@
import { env } from '$env/dynamic/private'
import { get, writable } from 'svelte/store'
import { type Robot } from 'robots-parser'
import robotsParser from 'robots-parser'
import { PUBLIC_BASE_URL } from '$env/static/public'

const cachedParsedRobots = writable<Robot | null>(null)
const cachedRobots = writable<string>("")
const lastFetched = writable<number>(Date.now())

const fetchRobotsTxt = async () => {
    const robotsTxtResp = await fetch(
        "https://api.darkvisitors.com/robots-txts",
        {
            method: "POST",
            headers: {
                "Authorization": `Bearer ${env.DARK_VISITORS_TOKEN}`,
                "Content-Type": "application/json"
            },
            body: JSON.stringify({
                agent_types: [
                    "AI Assistant",
                    "AI Data Scraper",
                    "AI Search Crawler",
                    "Undocumented AI Agent",
                ],
                disallow: "/"
            })
        }
    )
    const robotsTxt = await robotsTxtResp.text()
    lastFetched.set(Date.now())
    return robotsTxt
}

export const getRobotsTxt = async () => {
    let robotsTxt = get(cachedRobots)
    if (robotsTxt.length === 0 || Date.now() - get(lastFetched) > 1000 * 60 * 60 * 24) {
        robotsTxt = await fetchRobotsTxt()
        cachedRobots.set(robotsTxt)
        cachedParsedRobots.set(robotsParser(`${PUBLIC_BASE_URL}/robots.txt`, robotsTxt))
    }
    return robotsTxt
}

export const testUa = async (url: string, ua: string) => {
    let parsedRobots = get(cachedParsedRobots)
    if (parsedRobots === null) {
        parsedRobots = robotsParser(`${PUBLIC_BASE_URL}/robots.txt`, await getRobotsTxt())
        cachedParsedRobots.set(parsedRobots)
    }
    return parsedRobots.isAllowed(url, ua)
}

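src/lib/robots.ts keeps both the raw robots.txt body and the parsed Robot instance in module-level Svelte stores, so on the server the Dark Visitors response is fetched at most once every 24 hours and shared by all requests. A sketch of how the two exports are meant to be called from server code (the path and user agent below are made up):

import { PUBLIC_BASE_URL } from '$env/static/public'
import { getRobotsTxt, testUa } from '$lib/robots'

const body = await getRobotsTxt()                                        // cached for up to 24h
const allowed = await testUa(`${PUBLIC_BASE_URL}/some/page/`, 'GPTBot')  // placeholder UA

// allowed is false for agents the generated robots.txt disallows,
// true when no rule matches, and undefined if the URL is outside PUBLIC_BASE_URL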

@@ -1,13 +1,20 @@
+import { testUa } from '$lib/robots.js';
 import { incrementVisitCount, notifyDarkVisitors } from '$lib/visits.js';
+import { error } from '@sveltejs/kit';
 
 export const csr = true;
 export const ssr = true;
 export const prerender = false;
 export const trailingSlash = 'always';
 
-export async function load({ request, cookies, url, setHeaders }) {
+export async function load({ request, cookies, url }) {
     notifyDarkVisitors(url, request) // no await so it doesnt block load
 
+    // block any requests if the user agent is disallowed by our robots txt
+    if (await testUa(url.toString(), request.headers.get('user-agent') ?? "unknown user agent") === false) {
+        throw error(403, "get a better user agent silly")
+    }
+
     return {
         route: url.pathname,
         visitCount: incrementVisitCount(request, cookies),

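Note that the guard only rejects when testUa returns exactly false, so an undefined result (a URL outside PUBLIC_BASE_URL) still loads normally. A rough way to verify the 403 locally, assuming the default Vite dev server on port 5173 and that GPTBot ends up in the generated robots.txt:

const resp = await fetch('http://localhost:5173/', {
    headers: { 'user-agent': 'GPTBot' },   // placeholder blocked agent
})
console.log(resp.status)                   // expected: 403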

@@ -1,39 +1,5 @@
-import { env } from '$env/dynamic/private';
-import { get, writable } from 'svelte/store';
+import { getRobotsTxt } from "$lib/robots"
 
-const cachedRobots = writable<string>("")
-const lastFetched = writable<number>(Date.now())
-
-const fetchRobotsTxt = async () => {
-    const robotsTxtResp = await fetch(
-        "https://api.darkvisitors.com/robots-txts",
-        {
-            method: "POST",
-            headers: {
-                "Authorization": `Bearer ${env.DARK_VISITORS_TOKEN}`,
-                "Content-Type": "application/json"
-            },
-            body: JSON.stringify({
-                agent_types: [
-                    "AI Assistant",
-                    "AI Data Scraper",
-                    "AI Search Crawler",
-                    "Undocumented AI Agent",
-                ],
-                disallow: "/"
-            })
-        }
-    )
-    const robotsTxt = await robotsTxtResp.text()
-    lastFetched.set(Date.now())
-    return robotsTxt
-}
-
 export const GET = async ({ }) => {
-    let robotsTxt = get(cachedRobots)
-    if (robotsTxt.length === 0 || Date.now() - get(lastFetched) > 1000 * 60 * 60 * 24) {
-        robotsTxt = await fetchRobotsTxt()
-        cachedRobots.set(robotsTxt)
-    }
-    return new Response(robotsTxt)
+    return new Response(await getRobotsTxt())
 }
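With the fetching and caching moved into $lib/robots, this GET handler (presumably the /robots.txt route) and the per-request user-agent guard now share one cache instead of each hitting the Dark Visitors API. A quick local check that the endpoint serves the generated rules (hypothetical dev URL):

const txt = await (await fetch('http://localhost:5173/robots.txt')).text()
console.log(txt.includes('Disallow: /'))   // expected: true if the generated rules disallow everything for the listed agents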