new-infofer-scraper/scraper/src/Scrapers/Train.cs
2025-01-03 06:50:58 +01:00

268 lines
11 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using Flurl;
using InfoferScraper.Models.Train;
using NodaTime;
using NodaTime.Extensions;
using scraper.Exceptions;
namespace InfoferScraper.Scrapers {
public class TrainScraper {
/// <summary>Base address that relative request URLs are resolved against.</summary>
private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";

/// <summary>Train heading line. Groups: 1 = rank, 2 = number, 3 = date (digits and dots).</summary>
private static readonly Regex TrainInfoRegex = new(@"^([A-Z-]+)\s([0-9]+)\sîn\s([0-9.]+)$");

/// <summary>Operator line. Group 1 = operator name.</summary>
private static readonly Regex OperatorRegex = new(@"^Operat\sde\s(.+)$");

/// <summary>Route heading of a train group. Groups: 1 = origin, 2 = destination.</summary>
private static readonly Regex RouteRegex =
    new(@$"^Parcurs\stren\s([{Utils.RoLetters} ]+)[-]([{Utils.RoLetters}\s]+)$");

/// <summary>
/// Status line of a train group. Groups: 1 = delay in minutes (empty for the "Fără" form),
/// 2 = late/early wording, 3 = event wording (passing / arrival / departure), 4 = station.
/// </summary>
private static readonly Regex SlRegex =
    new(
        @"^(?:Fără|([0-9]+)\smin)\s(întârziere|mai\sdevreme)\sla\s(trecerea\sfără\soprire\sprin|sosirea\sîn|plecarea\sdin)\s(.+)\.$");

/// <summary>Maps the first letter of <see cref="SlRegex"/> group 3 to the status kind.</summary>
private static readonly Dictionary<char, StatusKind> SlStateMap = new() {
    { 't', StatusKind.Passing },
    { 's', StatusKind.Arrival },
    { 'p', StatusKind.Departure },
};

/// <summary>Kilometre cell of a stop. Group 1 = kilometre value.</summary>
private static readonly Regex KmRegex = new(@"^km\s([0-9]+)$");

/// <summary>Stopping-time cell of a stop. Groups: 1 = amount, 2 = unit ("min" or "sec").</summary>
private static readonly Regex StoppingTimeRegex = new(@"^([0-9]+)\s(min|sec)\soprire$");

/// <summary>Platform cell of a stop. Group 1 = platform text.</summary>
private static readonly Regex PlatformRegex = new(@"^linia\s(.+)$");

/// <summary>
/// Arrival/departure status of a stop. Groups: 1 = on-time wording, 2 = signed delay in minutes,
/// 3 = trailing "*" marker (empty when absent).
/// </summary>
private static readonly Regex StationArrdepStatusRegex =
    new(@"^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$");

/// <summary>Train-number-change note. Groups: 1 = new rank, 2 = new number.</summary>
private static readonly Regex TrainNumberChangeNoteRegex =
    new(@"^Trenul își schimbă numărul în\s([A-Z-]+)\s([0-9]+)$");

/// <summary>
/// "Departs as" note. Groups: 1 = rank, 2 = number, 3 = day, 4 = month, 5 = year.
/// The date separators are literal dots (escaped so that they do not match arbitrary characters).
/// </summary>
private static readonly Regex DepartsAsNoteRegex =
    new(@"^Trenul pleacă cu numărul\s([A-Z-]+)\s([0-9]+)\sîn\s([0-9]{2})\.([0-9]{2})\.([0-9]{4})$");

/// <summary>Receiving-wagons note. Group 1 = station text.</summary>
private static readonly Regex ReceivingWagonsNoteRegex =
    new(@"^Trenul primește vagoane de la\s(.+)\.$");

/// <summary>Detaching-wagons note. Group 1 = station text.</summary>
private static readonly Regex DetachingWagonsNoteRegex =
    new(@"^Trenul detașează vagoane pentru stația\s(.+)\.$");

/// <summary>Time zone used to interpret the requested date and the scraped times.</summary>
private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];

/// <summary>Cookie container installed on the HTTP handler by the constructor.</summary>
private readonly CookieContainer cookieContainer = new();

/// <summary>HTTP client used for every request made by this scraper instance.</summary>
private readonly HttpClient httpClient;
/// <summary>
/// Creates a scraper whose HTTP client sends requests relative to <see cref="BaseUrl"/>.
/// </summary>
/// <param name="httpClientHandler">
/// Optional handler to build the client on. Whether supplied or created here, its
/// <c>CookieContainer</c> is replaced with this instance's container and cookies are enabled.
/// </param>
public TrainScraper(HttpClientHandler? httpClientHandler = null)
{
    // Fall back to a fresh handler, then apply the same cookie settings in both cases.
    var handler = httpClientHandler ?? new HttpClientHandler();
    handler.CookieContainer = cookieContainer;
    handler.UseCookies = true;

    var client = new HttpClient(handler);
    client.BaseAddress = new Uri(BaseUrl);
    client.DefaultRequestVersion = new Version(2, 0);

    // User-Agent: product token followed by a contact comment.
    var userAgent = client.DefaultRequestHeaders.UserAgent;
    userAgent.Add(new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"));
    userAgent.Add(new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmaster<at>dcdev.ro for any issues)"));

    httpClient = client;
}
/// <summary>
/// Downloads the page for one train, submits the search form found on it and parses the
/// resulting document into a train scrape result.
/// </summary>
/// <param name="trainNumber">Train number; appended as a path segment to the <c>Tren</c> page URL.</param>
/// <param name="dateOverride">
/// Optional date to query. It is converted to the Europe/Bucharest zone and sent as the
/// <c>Date</c> query parameter in <c>d.MM.yyyy</c> form; when <c>null</c> no date is sent.
/// </param>
/// <returns>
/// The parsed result, or <c>null</c> when the result document has no top-level train-info element.
/// </returns>
/// <exception cref="TrainNotThisDayException">
/// The result document has a train-info element but no results element.
/// </exception>
public async Task<ITrainScrapeResult?> Scrape(string trainNumber, DateTimeOffset? dateOverride = null) {
    // Values exchanged with the site are machine-formatted; never let the ambient culture
    // (calendar, sign characters) influence how they are written or parsed.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
    dateOverride = dateOverrideInstant?.ToDateTimeOffset();
    TrainScrapeResult result = new();

    var asConfig = Configuration.Default;
    var asContext = BrowsingContext.New(asConfig);

    // Step 1: fetch the train page; it contains the form that must be posted to get results.
    var firstUrl = "Tren"
        .AppendPathSegment(trainNumber);
    if (dateOverride != null) {
        firstUrl = firstUrl.SetQueryParam("Date", dateOverride.Value.ToString("d.MM.yyyy", invariant));
    }
    var firstResponse = await httpClient.GetStringAsync(firstUrl);
    var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
    var firstForm = firstDocument.GetElementById("form-search")!;
    var firstResult = firstForm
        .QuerySelectorAll<IHtmlInputElement>("input")
        .Where(elem => elem.Name != null)
        .ToDictionary(elem => elem.Name!, elem => elem.Value);

    // Step 2: post every named input of that form back to the results endpoint.
    var secondUrl = "".AppendPathSegments("Trains", "TrainsResult");
    using var secondResponse = await httpClient.PostAsync(
        secondUrl,
#pragma warning disable CS8620
        new FormUrlEncodedContent(firstResult)
#pragma warning restore CS8620
    );
    var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
    var secondDocument = await asContext.OpenAsync(
        req => req.Content(secondResponseContent)
    );

    // First top-level div carries the train header, the fourth one the results.
    var (trainInfoDiv, (_, (_, (resultsDiv, _)))) = secondDocument
        .QuerySelectorAll("body > div");
    if (trainInfoDiv == null) {
        return null;
    }
    if (resultsDiv == null) {
        throw new TrainNotThisDayException();
    }

    trainInfoDiv = trainInfoDiv.QuerySelectorAll(":scope > div > div").First();
    (result.Rank, (result.Number, (result.Date, _))) = (TrainInfoRegex.Match(
        trainInfoDiv.QuerySelector(":scope > h2")!.Text().WithCollapsedSpaces()
    ).Groups as IEnumerable<Group>).Select(g => g.Value).Skip(1);
    var (scrapedDateD, (scrapedDateM, (scrapedDateY, _))) = result.Date
        .Split('.')
        .Select(int.Parse);
    var date = new DateTime(scrapedDateY, scrapedDateM, scrapedDateD);

    result.Operator = (OperatorRegex.Match(
        trainInfoDiv.QuerySelector(":scope > p")!.Text().WithCollapsedSpaces()
    ).Groups as IEnumerable<Group>).Skip(1).First().Value;

    foreach (var groupDiv in resultsDiv.QuerySelectorAll(":scope > div")) {
        result.AddTrainGroup(group => {
            var statusDiv = groupDiv.QuerySelectorAll(":scope > div").First();
            var routeText = statusDiv.QuerySelector(":scope > h4")!.Text().WithCollapsedSpaces();
            group.ConfigureRoute(route => {
                (route.From, (route.To, _)) = (RouteRegex.Match(routeText).Groups as IEnumerable<Group>).Skip(1)
                    .Select(g => g.Value);
            });

            // The status line is optional: when the element is missing or its text is not in
            // the recognised form, the group is simply left without a status.
            var statusLineElement = statusDiv.QuerySelector(":scope > div");
            if (statusLineElement != null) {
                var statusLineMatch = SlRegex.Match(statusLineElement.Text().WithCollapsedSpaces());
                if (statusLineMatch.Success) {
                    var slmDelay = statusLineMatch.Groups[1].Value;
                    var slmLate = statusLineMatch.Groups[2].Value;
                    var slmArrival = statusLineMatch.Groups[3].Value;
                    var slmStation = statusLineMatch.Groups[4].Value;
                    group.MakeStatus(status => {
                        // No minutes captured means no delay; "mai devreme" is stored as negative.
                        status.Delay = string.IsNullOrEmpty(slmDelay) ? 0 :
                            slmLate == "întârziere" ? int.Parse(slmDelay, invariant) : -int.Parse(slmDelay, invariant);
                        status.Station = slmStation;
                        status.State = SlStateMap[slmArrival[0]];
                    });
                }
            }

            // One sequencer per group, seeded with the scraped date and fed each
            // scheduled hour/minute in page order.
            Utils.DateTimeSequencer dtSeq = new(date.Year, date.Month, date.Day);
            var stations = statusDiv.QuerySelectorAll(":scope > ul > li");
            foreach (var station in stations) {
                group.AddStopDescription(stopDescription => {
                    var (left, (middle, (right, _))) = station
                        .QuerySelectorAll(":scope > div > div");
                    var (stopDetails, (stopNotes, _)) = middle
                        .QuerySelectorAll(":scope > div > div > div");

                    // Detail cells by index: 0 = name (with link), 1 = km, 2 = stopping time, 3 = platform.
                    var detailDivs = stopDetails.QuerySelectorAll(":scope > div");

                    stopDescription.Name = detailDivs[0]
                        .Text()
                        .WithCollapsedSpaces();
                    stopDescription.LinkName = new Flurl.Url(detailDivs[0]
                        .QuerySelector(":scope a")!
                        .Attributes["href"]!
                        .Value).PathSegments.Last();

                    var scrapedKm = detailDivs[1]
                        .Text()
                        .WithCollapsedSpaces();
                    stopDescription.Km = int.Parse(
                        (KmRegex.Match(scrapedKm).Groups as IEnumerable<Group>).Skip(1).First().Value,
                        invariant
                    );

                    var scrapedStoppingTime = detailDivs[2]
                        .Text()
                        .WithCollapsedSpaces();
                    if (!string.IsNullOrEmpty(scrapedStoppingTime)) {
                        var (stValue, (stMinsec, _)) =
                            (StoppingTimeRegex.Match(scrapedStoppingTime).Groups as IEnumerable<Group>)
                            .Skip(1)
                            .Select(g => g.Value);
                        // Stored in seconds: minute values are multiplied by 60.
                        stopDescription.StoppingTime = int.Parse(stValue, invariant);
                        if (stMinsec == "min") {
                            stopDescription.StoppingTime *= 60;
                        }
                    }

                    var scrapedPlatform = detailDivs[3]
                        .Text()
                        .WithCollapsedSpaces();
                    if (!string.IsNullOrEmpty(scrapedPlatform)) {
                        stopDescription.Platform = PlatformRegex.Match(scrapedPlatform).Groups[1].Value;
                    }

                    // Fills one arrival/departure entry from its column. Throws
                    // OperationCanceledException when the column holds no time at all, which the
                    // callers below catch to skip that entry.
                    void ScrapeTime(IElement element, ref TrainStopArrDep arrDep) {
                        var parts = element.QuerySelectorAll(":scope > div > div > div");
                        if (parts.Length == 0) {
                            throw new OperationCanceledException();
                        }

                        var time = parts[0];
                        var scrapedTime = time.Text().WithCollapsedSpaces();
                        var (stHour, (stMin, _)) = scrapedTime.Split(':').Select(int.Parse);
                        arrDep.ScheduleTime = BucharestTz.AtLeniently(dtSeq.Next(stHour, stMin).ToLocalDateTime())
                            .ToDateTimeOffset();

                        // Only the scheduled time is present: no live status to record.
                        if (parts.Length < 2) {
                            return;
                        }

                        var statusElement = parts[1];
                        var statusMatch = StationArrdepStatusRegex.Match(
                            statusElement.Text().WithCollapsedSpaces(replaceWith: " ")
                        );
                        var onTime = statusMatch.Groups[1].Value;
                        var delay = statusMatch.Groups[2].Value;
                        var approx = statusMatch.Groups[3].Value;
                        arrDep.MakeStatus(status => {
                            if (!statusMatch.Success) {
                                // Status text that is neither "on time" nor a signed delay
                                // is recorded as a cancellation.
                                status.Cancelled = true;
                            }
                            else {
                                // The delay carries an explicit +/- sign; parse it culture-independently.
                                status.Delay = string.IsNullOrEmpty(onTime) ? int.Parse(delay, invariant) : 0;
                            }
                            // A trailing "*" clears the Real flag.
                            status.Real = string.IsNullOrEmpty(approx);
                        });
                    }

                    try {
                        stopDescription.MakeArrival(arrival => { ScrapeTime(left, ref arrival); });
                    }
                    catch (OperationCanceledException) { }
                    try {
                        stopDescription.MakeDeparture(departure => { ScrapeTime(right, ref departure); });
                    }
                    catch (OperationCanceledException) { }

                    // Notes: each note is matched against the known note formats in order;
                    // unrecognised notes are skipped.
                    foreach (var noteDiv in stopNotes.QuerySelectorAll(":scope > div > div")) {
                        var noteText = noteDiv.Text().WithCollapsedSpaces();
                        if (TrainNumberChangeNoteRegex.Match(noteText) is { Success: true } numberChangeMatch) {
                            stopDescription.AddTrainNumberChangeNote(
                                numberChangeMatch.Groups[1].Value,
                                numberChangeMatch.Groups[2].Value
                            );
                        }
                        else if (DepartsAsNoteRegex.Match(noteText) is { Success: true } departsAsMatch) {
                            var groups = departsAsMatch.Groups;
                            // Groups 3/4/5 are day/month/year. Resolved leniently, like the
                            // schedule times above, so local-time gaps or overlaps cannot throw.
                            var departureDate = BucharestTz.AtLeniently(new LocalDateTime(
                                int.Parse(groups[5].Value, invariant),
                                int.Parse(groups[4].Value, invariant),
                                int.Parse(groups[3].Value, invariant),
                                0,
                                0
                            ));
                            stopDescription.AddDepartsAsNote(groups[1].Value, groups[2].Value, departureDate.ToDateTimeOffset());
                        }
                        else if (DetachingWagonsNoteRegex.Match(noteText) is { Success: true } detachingMatch) {
                            stopDescription.AddDetachingWagonsNote(detachingMatch.Groups[1].Value);
                        }
                        else if (ReceivingWagonsNoteRegex.Match(noteText) is { Success: true } receivingMatch) {
                            stopDescription.AddReceivingWagonsNote(receivingMatch.Groups[1].Value);
                        }
                    }
                });
            }
        });
    }
    return result;
}
}
} // namespace