99 lines
3.2 KiB
TypeScript
99 lines
3.2 KiB
TypeScript
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as cheerio from 'cheerio';
|
|
import axios from 'axios';
|
|
import https from 'https';
|
|
import { lakesConfig } from './lakesConfig';
|
|
|
|
/**
 * One measurement row as stored in a lake's JSON data file.
 */
interface DataRecord {
  /** ISO 8601 timestamp string, as returned by `parseDateString`; used as the de-duplication key when merging. */
  timestamp: string;

  /** Value parsed from the second cell of a measurement table row. */
  level: number;

  /** Value parsed from the third cell of a measurement table row. */
  flow: number;
}
|
|
|
|
/**
 * Converts a timestamp written as `DD.MM.YYYY HH:MM` into an ISO 8601 string.
 *
 * The components are handed to the multi-argument `Date` constructor, so they
 * are interpreted in the local time zone of the running process; the returned
 * string is the UTC form produced by `Date.prototype.toISOString`.
 *
 * @param dateStr - Timestamp text. Leading/trailing whitespace is ignored and
 *   any run of whitespace (space, tab, non-breaking space) may separate the
 *   date part from the time part.
 * @returns The same instant as an ISO 8601 string.
 * @throws Error if the text does not match `DD.MM.YYYY HH:MM` or if a
 *   component is out of range (e.g. `31.02.2024`, month 13, hour 24).
 */
function parseDateString(dateStr: string): string {
  const match = /^(\d{1,2})\.(\d{1,2})\.(\d{4})\s+(\d{1,2}):(\d{1,2})$/.exec(dateStr.trim());
  if (match === null) {
    throw new Error(`Invalid date string (expected DD.MM.YYYY HH:MM): "${dateStr}"`);
  }

  // The capture groups contain digits only, so Number() is an exact base-10 parse.
  const day = Number(match[1]);
  const month = Number(match[2]);
  const year = Number(match[3]);
  const hours = Number(match[4]);
  const minutes = Number(match[5]);

  // Day 0 of the following month is the last day of `month`. Computed in UTC so
  // the answer does not depend on the local zone's DST transitions.
  const daysInMonth =
    month >= 1 && month <= 12 ? new Date(Date.UTC(year, month, 0)).getUTCDate() : 0;

  // Without this check `new Date(...)` would silently roll an out-of-range
  // component over into the next month/day/hour and return a different instant.
  if (day < 1 || day > daysInMonth || hours > 23 || minutes > 59) {
    throw new Error(`Invalid date string (component out of range): "${dateStr}"`);
  }

  return new Date(year, month - 1, day, hours, minutes).toISOString();
}
|
|
|
|
/**
 * Downloads the measurement page for one lake, extracts its table rows and
 * merges them into that lake's JSON data file.
 *
 * Rows are taken from every `table tr` that has at least three cells and whose
 * first cell looks like `DD.MM.YYYY HH:MM`. Records are keyed by timestamp;
 * freshly scraped records replace stored ones with the same timestamp, and the
 * merged list is written back sorted by time.
 *
 * @param lakeId - Full configured lake identifier. Not used by the body; kept
 *   so existing callers keep working.
 * @param oid - Value sent as the `oid` query parameter.
 * @param internalId - Value sent as the `id` query parameter; also the base
 *   name of `public/data/<internalId>.json` and the prefix of log lines.
 * @returns A promise that resolves once the lake has been processed. It does
 *   not reject: any failure is logged with `console.error` and swallowed so a
 *   bulk run can continue with the next lake.
 */
async function scrapeLake(lakeId: string, oid: string, internalId: string): Promise<void> {
  const url = `https://www.pvl.cz/portal/nadrze/cz/pc/Mereni.aspx?oid=${oid}&id=${internalId}`;
  const dataFile = path.resolve(`public/data/${internalId}.json`);
  const timestampPattern = /^\d{2}\.\d{2}\.\d{4}\s\d{2}:\d{2}$/;
  // Upper bound for the HTTP request so one stalled connection cannot block the run.
  const requestTimeoutMs = 30000;
  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  try {
    // NOTE(review): rejectUnauthorized: false turns off TLS certificate
    // verification for this request. Left as-is because removing it may break
    // the scrape if the server's certificate chain is incomplete — confirm and
    // remove if verification succeeds.
    const agent = new https.Agent({ rejectUnauthorized: false });
    const response = await axios.get(url, {
      httpsAgent: agent,
      timeout: requestTimeoutMs,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
      }
    });

    const $ = cheerio.load(response.data);
    const newData: DataRecord[] = [];

    $('table tr').each((_index, row) => {
      const cells = $(row).find('td');
      if (cells.length < 3) {
        return;
      }

      const datetimeText = $(cells[0]).text().trim();
      if (!timestampPattern.test(datetimeText)) {
        return;
      }

      let timestamp: string;
      try {
        timestamp = parseDateString(datetimeText);
      } catch (rowError: unknown) {
        // A single bad row must not discard every other row of this lake.
        console.warn(`[${internalId}] Skipping row with unparseable timestamp "${datetimeText}":`, describeError(rowError));
        return;
      }

      // The page uses a decimal comma; parseFloat needs a decimal point.
      const levelText = $(cells[1]).text().trim().replace(',', '.');
      const flowText = $(cells[2]).text().trim().replace(',', '.');

      // parseFloat yields NaN for non-numeric cells; JSON.stringify later
      // writes such values as null.
      newData.push({
        timestamp,
        level: parseFloat(levelText),
        flow: parseFloat(flowText)
      });
    });

    let existingData: DataRecord[] = [];
    if (fs.existsSync(dataFile)) {
      const parsed: unknown = JSON.parse(fs.readFileSync(dataFile, 'utf-8'));
      if (!Array.isArray(parsed)) {
        // Fail before writing so an unexpected file is never overwritten.
        throw new Error(`Existing data file ${dataFile} does not contain a JSON array`);
      }
      existingData = parsed;
    }

    // Insert stored records first so scraped records win on equal timestamps.
    const dataMap = new Map<string, DataRecord>();
    for (const item of existingData) {
      dataMap.set(item.timestamp, item);
    }
    for (const item of newData) {
      dataMap.set(item.timestamp, item);
    }

    const mergedData = Array.from(dataMap.values()).sort(
      (a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime()
    );

    fs.mkdirSync(path.dirname(dataFile), { recursive: true });
    fs.writeFileSync(dataFile, JSON.stringify(mergedData, null, 2), 'utf-8');

    console.log(`[${internalId}] Scraped ${newData.length} records. DB total: ${mergedData.length}`);
  } catch (error: unknown) {
    console.error(`[${internalId}] Error scraping data:`, describeError(error));
  }
}
|
|
|
|
/**
 * Scrapes every lake listed in `lakesConfig`, one after another, pausing
 * 500 ms after each lake.
 *
 * Each `lake.id` is expected to look like `<internalId>|<oid>` (for example
 * `VLL1|1`). Entries that do not contain both parts are logged and skipped
 * instead of being sent to the server with an `undefined` parameter.
 *
 * @returns A promise that resolves when all lakes have been processed.
 */
async function runScraper(): Promise<void> {
  console.log(`Starting bulk scraper for ${lakesConfig.length} lakes...`);

  for (const lake of lakesConfig) {
    // ID format: VLL1|1 -> internalId=VLL1, oid=1
    const [internalId, oid] = lake.id.split('|');
    if (!internalId || !oid) {
      console.error(`Skipping lake with malformed id "${lake.id}" (expected "<internalId>|<oid>")`);
      continue;
    }

    await scrapeLake(lake.id, oid, internalId);

    // Add small delay to not hammer the server
    await new Promise<void>((resolve) => setTimeout(resolve, 500));
  }

  console.log('Bulk scraping finished.');
}
|
|
|
|
// Script entry point. The promise is handled explicitly so an unexpected
// failure is reported and reflected in the process exit code.
runScraper().catch((error: unknown) => {
  console.error('Bulk scraper failed:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});
|