Files
davisfe.cz/scripts/scrapeLakes.ts
T

244 lines
8.6 KiB
TypeScript

import * as fs from 'fs';
import * as path from 'path';
import * as cheerio from 'cheerio';
import axios from 'axios';
import https from 'https';
import { lakesConfig } from './lakesConfig';
/**
 * One measurement row as stored in a station's JSON history file.
 *
 * Units are whatever the scraped page / weather API publishes; this file does
 * not convert them.
 */
interface DataRecord {
  /** ISO-8601 timestamp string; also the key used to de-duplicate rows on merge. */
  timestamp: string;

  /** Numeric value of the level column of the measurement table (0 when unparseable). */
  level: number;

  /** Numeric value of the flow column of the measurement table (0 when unparseable). */
  flow: number;

  /** Inflow from the page's current-values table; set on the first scraped row, forward-filled elsewhere. */
  inflow?: number;

  /** Volume from the page's current-values table; set on the first scraped row, forward-filled elsewhere. */
  volume?: number;

  /** Open-Meteo `temperature_2m` reading; may be `null` or absent when unknown. */
  temperature?: number | null;

  /** Open-Meteo `precipitation` reading; may be `null` or absent when unknown. */
  precipitation?: number | null;

  /** Raw text of the table column whose header contains "qn", when present and non-empty. */
  qn?: string;
}
/**
 * Converts a `DD.MM.YYYY HH:MM` timestamp into an ISO-8601 string.
 *
 * The date and time fields are handed to the `Date` constructor as separate
 * components, so they are interpreted in the local time zone of the running
 * process; the result is then rendered in UTC by `toISOString()`.
 * NOTE(review): the scraped page presumably publishes Czech local time —
 * confirm the scraper runs in that zone, otherwise timestamps will be shifted.
 *
 * @param dateStr - Raw text such as `"05.03.2024 14:30"`. Date and time may be
 *   separated by any run of whitespace (including a non-breaking space). Any
 *   extra whitespace-separated tokens after the time, and any `:`-separated
 *   parts after the minutes, are ignored.
 * @returns The ISO string, or `null` when the input is not a string, a field
 *   is missing or non-numeric, the calendar date does not exist
 *   (e.g. `31.02.2024`), or the time is outside `00:00`–`23:59`.
 */
export function parseDateString(dateStr: string): string | null {
  if (typeof dateStr !== 'string') return null;

  // \s also matches U+00A0, which HTML tables frequently use between cells' tokens.
  const [datePart, timePart] = dateStr.trim().split(/\s+/);
  if (!datePart || !timePart) return null;

  const [day, month, year] = datePart.split('.');
  const [hours, minutes] = timePart.split(':');
  if (!year || !hours) return null;

  const y = parseInt(year, 10);
  const m = parseInt(month ?? '', 10) - 1;
  const dDay = parseInt(day ?? '', 10);
  const hh = parseInt(hours, 10);
  const mm = parseInt(minutes ?? '', 10);

  // Validate the time numerically: `Date` silently rolls 10:60 over to 11:00,
  // and the date-equality check below cannot detect a roll-over within a day.
  if (!Number.isInteger(hh) || hh < 0 || hh > 23) return null;
  if (!Number.isInteger(mm) || mm < 0 || mm > 59) return null;

  const d = new Date(y, m, dDay, hh, mm);
  if (isNaN(d.getTime())) return null;

  // Reject calendar roll-overs such as 31.02 -> 02.03 (also rejects years < 100,
  // which `Date` maps into the 1900s).
  if (d.getFullYear() !== y || d.getMonth() !== m || d.getDate() !== dDay) return null;

  return d.toISOString();
}
/** Fields that are carried forward from the previous row when a row lacks them. */
const FORWARD_FILLED_FIELDS = ['temperature', 'precipitation', 'inflow', 'volume'] as const;

/** Upper bound for the measurement-page request so one hung connection cannot stall a bulk run. */
const PAGE_REQUEST_TIMEOUT_MS = 30000;

/**
 * Parses a number written with a decimal comma and optional whitespace
 * grouping (e.g. `"1 234,5"`). Returns `NaN` when the text is not numeric.
 */
function parseLocaleNumber(text: string): number {
  return parseFloat(text.trim().replace(/\s/g, '').replace(',', '.'));
}

/**
 * Loads the previously stored history for a station.
 * Returns an empty list when the file does not exist; throws when the file
 * exists but is not valid JSON or is not a JSON array, so the caller never
 * overwrites a file it could not understand.
 */
function readExistingRecords(file: string): DataRecord[] {
  if (!fs.existsSync(file)) return [];
  const parsed: unknown = JSON.parse(fs.readFileSync(file, 'utf-8'));
  if (!Array.isArray(parsed)) {
    throw new Error(`${file} does not contain a JSON array`);
  }
  return parsed as DataRecord[];
}

/**
 * Merges freshly scraped rows into the stored ones, keyed by timestamp, and
 * returns the result sorted by ascending time. On a timestamp clash the
 * scraped row wins, except that a missing inflow/volume/qn keeps the stored value.
 */
function mergeByTimestamp(existing: DataRecord[], scraped: DataRecord[]): DataRecord[] {
  const byTimestamp = new Map<string, DataRecord>();
  for (const item of existing) {
    byTimestamp.set(item.timestamp, item);
  }
  for (const item of scraped) {
    const previous = byTimestamp.get(item.timestamp);
    if (!previous) {
      byTimestamp.set(item.timestamp, item);
      continue;
    }
    byTimestamp.set(item.timestamp, {
      ...previous,
      ...item,
      inflow: item.inflow !== undefined ? item.inflow : previous.inflow,
      volume: item.volume !== undefined ? item.volume : previous.volume,
      qn: item.qn !== undefined ? item.qn : previous.qn
    });
  }
  return Array.from(byTimestamp.values()).sort(
    (a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime()
  );
}

/**
 * Walks the rows in order and, for each forward-filled field, copies the most
 * recent known value into rows where the field is `undefined` or `null`.
 * Mutates the rows in place.
 */
function forwardFillGaps(rows: DataRecord[]): void {
  const lastKnown: Partial<Record<(typeof FORWARD_FILLED_FIELDS)[number], number>> = {};
  for (const row of rows) {
    for (const field of FORWARD_FILLED_FIELDS) {
      const value = row[field];
      if (value !== undefined && value !== null) {
        lastKnown[field] = value;
        continue;
      }
      const carried = lastKnown[field];
      if (carried !== undefined) {
        row[field] = carried;
      }
    }
  }
}

/**
 * Scrapes one station's measurement page, enriches the first scraped row with
 * current inflow/volume and Open-Meteo weather, merges everything into
 * `public/data/<internalId>.json`, and rewrites that file.
 *
 * Every failure inside the scrape is logged and swallowed so that a bulk run
 * continues with the next station.
 *
 * @param lakeId - Full config id, used to look up the station type.
 * @param oid - Value of the `oid` query parameter of the measurement page.
 * @param internalId - Value of the `id` query parameter; also the JSON file name.
 */
async function scrapeLake(lakeId: string, oid: string, internalId: string): Promise<void> {
  const config = lakesConfig.find(l => l.id === lakeId);
  const isRiver = config?.type === 'river';
  const pageUrl = isRiver
    ? `https://www.pvl.cz/portal/sap/cz/pc/Mereni.aspx?oid=${oid}&id=${internalId}`
    : `https://www.pvl.cz/portal/nadrze/cz/pc/Mereni.aspx?oid=${oid}&id=${internalId}`;
  const DATA_FILE = path.resolve(`public/data/${internalId}.json`);

  try {
    // NOTE(review): TLS certificate verification is disabled for this request.
    // That permits man-in-the-middle tampering with the scraped page; kept
    // as-is because it looks deliberate — confirm it is still required.
    const agent = new https.Agent({ rejectUnauthorized: false });
    const response = await axios.get(pageUrl, {
      httpsAgent: agent,
      timeout: PAGE_REQUEST_TIMEOUT_MS,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
      }
    });
    const $ = cheerio.load(response.data);

    // --- Current values ("Aktuální hodnoty") table: inflow ("Přítok") and volume ("Objem").
    // Both stay 0 when the table or the row is missing or unparseable.
    let currentInflow = 0;
    let currentVolume = 0;
    $('table').each((_i, tbl) => {
      const text = $(tbl).text();
      if (!text.includes('Aktuální hodnoty') || !text.includes('Přítok')) return;
      $(tbl).find('tr').each((_j, r) => {
        const cells = $(r).find('td');
        const label = cells.eq(0).text().trim();
        const value = parseLocaleNumber(cells.eq(1).text());
        if (label.includes('Přítok')) currentInflow = value || 0;
        if (label.includes('Objem')) currentVolume = value || 0;
      });
    });

    // --- Measurement history table: the last table whose id matches wins.
    const records: DataRecord[] = [];
    const dataTable = $('table')
      .filter((_i, tbl) => {
        const id = ($(tbl).attr('id') || '').toLowerCase();
        return id.includes('datamereni24hgv') || id.includes('datamerenigv');
      })
      .last();

    if (dataTable.length > 0) {
      // Defaults apply when the header row does not name a column.
      let qnColIndex = -1;
      let flowColIndex = 2;
      let levelColIndex = 1;

      // Locate columns by header text: "hladina"/"stav" = level, "odtok"/"průtok" = flow.
      dataTable.find('tr').first().find('th, td').each((idx, cell) => {
        const headerText = $(cell).text().trim().toLowerCase();
        if (headerText.includes('qn')) {
          qnColIndex = idx;
        } else if (headerText.includes('hladina') || headerText.includes('stav')) {
          levelColIndex = idx;
        } else if (headerText.includes('odtok') || headerText.includes('průtok') || headerText.includes('prutok') || headerText.includes('flow')) {
          flowColIndex = idx;
        }
      });

      dataTable.find('tr').each((i, row) => {
        if (i === 0) return; // skip header
        const cols = $(row).find('td');
        if (cols.length <= Math.max(levelColIndex, flowColIndex)) return;

        const timestamp = parseDateString($(cols[0]).text().trim());
        if (!timestamp) return;

        const newRecord: DataRecord = {
          timestamp,
          // Whitespace is stripped so grouped numbers such as "1 234,5" are not truncated to 1.
          level: parseLocaleNumber($(cols[levelColIndex]).text()) || 0,
          flow: parseLocaleNumber($(cols[flowColIndex]).text()) || 0
        };
        const qn = qnColIndex !== -1 && cols.length > qnColIndex ? $(cols[qnColIndex]).text().trim() : '';
        if (qn) {
          newRecord.qn = qn;
        }
        records.push(newRecord);
      });
    }

    // --- Attach current values and weather to the first scraped row only.
    const first = records[0];
    if (first) {
      first.inflow = currentInflow;
      first.volume = currentVolume;

      // Weather comes exclusively from Open-Meteo, looked up by the station's coordinates.
      const weatherConfig = lakesConfig.find(l => l.id.split('|')[0] === internalId);
      if (weatherConfig && weatherConfig.coords) {
        try {
          const lat = weatherConfig.coords[0];
          const lon = weatherConfig.coords[1];
          const url = `https://api.open-meteo.com/v1/forecast?latitude=${lat}&longitude=${lon}&current=temperature_2m,precipitation`;
          const weatherRes = await axios.get(url, { timeout: 5000 });
          const current = weatherRes.data && weatherRes.data.current;
          if (current) {
            // Only accept real numbers so a null/missing reading cannot
            // overwrite a value already stored for this timestamp.
            if (typeof current.temperature_2m === 'number') {
              first.temperature = current.temperature_2m;
            }
            if (typeof current.precipitation === 'number') {
              first.precipitation = current.precipitation;
            }
          }
          // Small delay to prevent API rate limits
          await new Promise(resolve => setTimeout(resolve, 200));
        } catch (err: unknown) {
          console.error(`Failed to fetch weather for ${internalId}:`, err instanceof Error ? err.message : String(err));
        }
      }
    }

    // --- Merge with stored history, fill gaps, persist.
    const mergedData = mergeByTimestamp(readExistingRecords(DATA_FILE), records);
    forwardFillGaps(mergedData);

    fs.mkdirSync(path.dirname(DATA_FILE), { recursive: true });
    fs.writeFileSync(DATA_FILE, JSON.stringify(mergedData, null, 2), 'utf-8');
    console.log(`[${internalId}] Scraped ${records.length} records. DB total: ${mergedData.length}`);
  } catch (error: unknown) {
    console.error(`[${internalId}] Error scraping data:`, error instanceof Error ? error.message : String(error));
  }
}
/**
 * Scrapes every entry of `lakesConfig` one after another, pausing 500 ms
 * after each station so the source server is not hit in a burst.
 */
async function runScraper() {
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  console.log(`Starting bulk scraper for ${lakesConfig.length} lakes...`);

  for (const { id } of lakesConfig) {
    // Config ids look like "VLL1|1": first segment is the internal id, second the oid.
    const segments = id.split('|');
    await scrapeLake(id, segments[1], segments[0]);
    await pause(500);
  }

  console.log('Bulk scraping finished.');
}
// Start the bulk run. The promise is handled explicitly so that an unexpected
// rejection is logged and reflected in the process exit code instead of being
// left as a floating promise.
runScraper().catch((error: unknown) => {
  console.error('Bulk scraper failed:', error);
  process.exitCode = 1;
});