using System; using System.Collections.Generic; using System.Linq; using System.Net; using System.Net.Http; using System.Net.Http.Headers; using System.Text.RegularExpressions; using System.Threading.Tasks; using AngleSharp; using AngleSharp.Dom; using AngleSharp.Html.Dom; using Flurl; using InfoferScraper.Models.Train; using NodaTime; using NodaTime.Extensions; using scraper.Exceptions; namespace InfoferScraper.Scrapers { public class TrainScraper { private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/"; private static readonly Regex TrainInfoRegex = new(@"^([A-Z-]+)\s([0-9]+)\sîn\s([0-9.]+)$"); private static readonly Regex OperatorRegex = new(@"^Operat\sde\s(.+)$"); private static readonly Regex RouteRegex = new(@$"^Parcurs\stren\s([{Utils.RoLetters} ]+)[-–]([{Utils.RoLetters}\s]+)$"); private static readonly Regex SlRegex = new( @"^(?:Fără|([0-9]+)\smin)\s(întârziere|mai\sdevreme)\sla\s(trecerea\sfără\soprire\sprin|sosirea\sîn|plecarea\sdin)\s(.+)\.$"); private static readonly Dictionary SlStateMap = new() { { 't', StatusKind.Passing }, { 's', StatusKind.Arrival }, { 'p', StatusKind.Departure }, }; private static readonly Regex KmRegex = new(@"^km\s([0-9]+)$"); private static readonly Regex StoppingTimeRegex = new(@"^([0-9]+)\s(min|sec)\soprire$"); private static readonly Regex PlatformRegex = new(@"^linia\s(.+)$"); private static readonly Regex StationArrdepStatusRegex = new(@"^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$"); private static readonly Regex TrainNumberChangeNoteRegex = new(@"^Trenul își schimbă numărul în\s([A-Z-]+)\s([0-9]+)$"); private static readonly Regex DepartsAsNoteRegex = new(@"^Trenul pleacă cu numărul\s([A-Z-]+)\s([0-9]+)\sîn\s([0-9]{2}).([0-9]{2}).([0-9]{4})$"); private static readonly Regex ReceivingWagonsNoteRegex = new(@"^Trenul primește vagoane de la\s(.+)\.$"); private static readonly Regex DetachingWagonsNoteRegex = new(@"^Trenul detașează vagoane pentru stația\s(.+)\.$"); private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"]; private readonly CookieContainer cookieContainer = new(); private readonly HttpClient httpClient; public TrainScraper(HttpClientHandler? httpClientHandler = null) { if (httpClientHandler == null) { httpClientHandler = new HttpClientHandler { CookieContainer = cookieContainer, UseCookies = true, }; } else { httpClientHandler.CookieContainer = cookieContainer; httpClientHandler.UseCookies = true; } httpClient = new HttpClient(httpClientHandler) { BaseAddress = new Uri(BaseUrl), DefaultRequestVersion = new Version(2, 0), DefaultRequestHeaders = { UserAgent = { new ProductInfoHeaderValue("new-infofer-scraper", "0.0.1"), new ProductInfoHeaderValue("(developed by DC Dev as a hobby, get in touch at webmasterdcdev.ro for any issues)"), }, }, }; } public async Task Scrape(string trainNumber, DateTimeOffset? dateOverride = null) { var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz); dateOverride = dateOverrideInstant?.ToDateTimeOffset(); TrainScrapeResult result = new(); var asConfig = Configuration.Default; var asContext = BrowsingContext.New(asConfig); var firstUrl = "Tren" .AppendPathSegment(trainNumber); if (dateOverride != null) { firstUrl = firstUrl.SetQueryParam("Date", $"{dateOverride:d.MM.yyyy}"); } var firstResponse = await httpClient.GetStringAsync(firstUrl); var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse)); var firstForm = firstDocument.GetElementById("form-search")!; var firstResult = firstForm .QuerySelectorAll("input") .Where(elem => elem.Name != null) .ToDictionary(elem => elem.Name!, elem => elem.Value); var secondUrl = "".AppendPathSegments("Trains", "TrainsResult"); var secondResponse = await httpClient.PostAsync( secondUrl, #pragma warning disable CS8620 new FormUrlEncodedContent(firstResult) #pragma warning restore CS8620 ); var secondResponseContent = await secondResponse.Content.ReadAsStringAsync(); var secondDocument = await asContext.OpenAsync( req => req.Content(secondResponseContent) ); var (trainInfoDiv, (_, (_, (resultsDiv, _)))) = secondDocument .QuerySelectorAll("body > div"); if (trainInfoDiv == null) { return null; } if (resultsDiv == null) { throw new TrainNotThisDayException(); } trainInfoDiv = trainInfoDiv.QuerySelectorAll(":scope > div > div").First(); (result.Rank, (result.Number, (result.Date, _))) = (TrainInfoRegex.Match( trainInfoDiv.QuerySelector(":scope > h2")!.Text().WithCollapsedSpaces() ).Groups as IEnumerable).Select(group => group.Value).Skip(1); var (scrapedDateD, (scrapedDateM, (scrapedDateY, _))) = result.Date .Split('.') .Select(int.Parse); var date = new DateTime(scrapedDateY, scrapedDateM, scrapedDateD); result.Operator = (OperatorRegex.Match( trainInfoDiv.QuerySelector(":scope > p")!.Text().WithCollapsedSpaces() ).Groups as IEnumerable).Skip(1).First().Value; foreach (var groupDiv in resultsDiv.QuerySelectorAll(":scope > div")) { result.AddTrainGroup(group => { var statusDiv = groupDiv.QuerySelectorAll(":scope > div").First(); var routeText = statusDiv.QuerySelector(":scope > h4")!.Text().WithCollapsedSpaces(); group.ConfigureRoute(route => { (route.From, (route.To, _)) = (RouteRegex.Match(routeText).Groups as IEnumerable).Skip(1) .Select(group => group.Value); }); try { var statusLineMatch = SlRegex.Match(statusDiv.QuerySelector(":scope > div")!.Text().WithCollapsedSpaces()); var (slmDelay, (slmLate, (slmArrival, (slmStation, _)))) = (statusLineMatch.Groups as IEnumerable).Skip(1).Select(group => group.Value); group.MakeStatus(status => { status.Delay = string.IsNullOrEmpty(slmDelay) ? 0 : slmLate == "întârziere" ? int.Parse(slmDelay) : -int.Parse(slmDelay); status.Station = slmStation; status.State = SlStateMap[slmArrival[0]]; }); } catch { // ignored } Utils.DateTimeSequencer dtSeq = new(date.Year, date.Month, date.Day); var stations = statusDiv.QuerySelectorAll(":scope > ul > li"); foreach (var station in stations) { group.AddStopDescription(stopDescription => { var (left, (middle, (right, _))) = station .QuerySelectorAll(":scope > div > div"); var (stopDetails, (stopNotes, _)) = middle .QuerySelectorAll(":scope > div > div > div"); stopDescription.Name = stopDetails .QuerySelectorAll(":scope > div")[0] .Text() .WithCollapsedSpaces(); stopDescription.LinkName = new Flurl.Url(stopDetails .QuerySelectorAll(":scope > div")[0] .QuerySelector(":scope a") .Attributes["href"] .Value).PathSegments.Last(); var scrapedKm = stopDetails .QuerySelectorAll(":scope > div")[1] .Text() .WithCollapsedSpaces(); stopDescription.Km = int.Parse( (KmRegex.Match(scrapedKm).Groups as IEnumerable).Skip(1).First().Value ); var scrapedStoppingTime = stopDetails .QuerySelectorAll(":scope > div")[2] .Text() .WithCollapsedSpaces(); if (!string.IsNullOrEmpty(scrapedStoppingTime)) { var (stValue, (stMinsec, _)) = (StoppingTimeRegex.Match(scrapedStoppingTime).Groups as IEnumerable) .Skip(1) .Select(group => group.Value); stopDescription.StoppingTime = int.Parse(stValue); if (stMinsec == "min") stopDescription.StoppingTime *= 60; } var scrapedPlatform = stopDetails .QuerySelectorAll(":scope > div")[3] .Text() .WithCollapsedSpaces(); if (!string.IsNullOrEmpty(scrapedPlatform)) stopDescription.Platform = PlatformRegex.Match(scrapedPlatform).Groups[1].Value; void ScrapeTime(IElement element, ref TrainStopArrDep arrDep) { var parts = element.QuerySelectorAll(":scope > div > div > div"); if (parts.Length == 0) throw new OperationCanceledException(); var time = parts[0]; var scrapedTime = time.Text().WithCollapsedSpaces(); var (stHour, (stMin, _)) = scrapedTime.Split(':').Select(int.Parse); arrDep.ScheduleTime = BucharestTz.AtLeniently(dtSeq.Next(stHour, stMin).ToLocalDateTime()) .ToDateTimeOffset(); if (parts.Length < 2) return; var statusElement = parts[1]; var (onTime, (delay, (approx, _))) = (StationArrdepStatusRegex.Match( statusElement.Text().WithCollapsedSpaces(replaceWith: " ") ).Groups as IEnumerable).Skip(1).Select(group => group.Value); arrDep.MakeStatus(status => { if (string.IsNullOrEmpty(onTime) && delay == null) { status.Cancelled = true; } else { status.Delay = string.IsNullOrEmpty(onTime) ? int.Parse(delay) : 0; } status.Real = string.IsNullOrEmpty(approx); }); } try { stopDescription.MakeArrival(arrival => { ScrapeTime(left, ref arrival); }); } catch (OperationCanceledException) { } try { stopDescription.MakeDeparture(departure => { ScrapeTime(right, ref departure); }); } catch (OperationCanceledException) { } foreach (var noteDiv in stopNotes.QuerySelectorAll(":scope > div > div")) { var noteText = noteDiv.Text().WithCollapsedSpaces(); Match trainNumberChangeMatch, departsAsMatch, detachingWagons, receivingWagons; if ((trainNumberChangeMatch = TrainNumberChangeNoteRegex.Match(noteText)).Success) { stopDescription.AddTrainNumberChangeNote(trainNumberChangeMatch.Groups[1].Value, trainNumberChangeMatch.Groups[2].Value); } else if ((departsAsMatch = DepartsAsNoteRegex.Match(noteText)).Success) { var groups = departsAsMatch.Groups; var departureDate = BucharestTz.AtStrictly(new(int.Parse(groups[5].Value), int.Parse(groups[4].Value), int.Parse(groups[3].Value), 0, 0)); stopDescription.AddDepartsAsNote(groups[1].Value, groups[2].Value, departureDate.ToDateTimeOffset()); } else if ((detachingWagons = DetachingWagonsNoteRegex.Match(noteText)).Success) { stopDescription.AddDetachingWagonsNote(detachingWagons.Groups[1].Value); } else if ((receivingWagons = ReceivingWagonsNoteRegex.Match(noteText)).Success) { stopDescription.AddReceivingWagonsNote(receivingWagons.Groups[1].Value); } } }); } }); } return result; } } } // namespace