Add initial itinerary scraping
This commit is contained in:
parent
de5d85cff4
commit
1d9db5b491
4 changed files with 302 additions and 1 deletions
|
@ -6,7 +6,7 @@
|
||||||
|
|
||||||
<PropertyGroup>
|
<PropertyGroup>
|
||||||
<OutputType>Exe</OutputType>
|
<OutputType>Exe</OutputType>
|
||||||
<TargetFramework>net6.0</TargetFramework>
|
<TargetFrameworks>net6.0;net7.0</TargetFrameworks>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
using System;
|
using System;
|
||||||
|
using System.Linq;
|
||||||
using System.Text.Json;
|
using System.Text.Json;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
using InfoferScraper;
|
using InfoferScraper;
|
||||||
|
@ -7,6 +8,7 @@ using InfoferScraper.Scrapers;
|
||||||
while (true) {
|
while (true) {
|
||||||
Console.WriteLine("1. Scrape Train");
|
Console.WriteLine("1. Scrape Train");
|
||||||
Console.WriteLine("2. Scrape Station");
|
Console.WriteLine("2. Scrape Station");
|
||||||
|
Console.WriteLine("3. Scrape Itineraries");
|
||||||
Console.WriteLine("0. Exit");
|
Console.WriteLine("0. Exit");
|
||||||
|
|
||||||
var input = Console.ReadLine()?.Trim();
|
var input = Console.ReadLine()?.Trim();
|
||||||
|
@ -17,6 +19,9 @@ while (true) {
|
||||||
case "2":
|
case "2":
|
||||||
await PrintStation();
|
await PrintStation();
|
||||||
break;
|
break;
|
||||||
|
case "3":
|
||||||
|
await ScrapeItineraries();
|
||||||
|
break;
|
||||||
case null:
|
case null:
|
||||||
case "0":
|
case "0":
|
||||||
goto INPUT_LOOP_BREAK;
|
goto INPUT_LOOP_BREAK;
|
||||||
|
@ -61,3 +66,30 @@ async Task PrintStation() {
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
// Prompts for a departure and an arrival station, scrapes the itineraries
// between them via RouteScraper.Scrape and prints every train of every
// itinerary (departure line, train details, arrival line) to the console.
// Returns without printing anything if either prompt hits end-of-input.
async Task ScrapeItineraries() {
    Console.Write("From station: ");
    var from = Console.ReadLine();
    Console.Write("To station: ");
    var to = Console.ReadLine();

    // Console.ReadLine returns null at end-of-input; nothing to look up then.
    if (from == null || to == null) {
        return;
    }

    var data = await RouteScraper.Scrape(from, to);

    // Scrape is declared to return a nullable list; dereferencing it blindly
    // would throw a NullReferenceException.
    if (data == null) {
        Console.WriteLine("No itineraries found.");
        Console.WriteLine();
        return;
    }

    Console.WriteLine($"{data.Count} itineraries:");
    Console.WriteLine();

    // Prints one "HH:mm station" line for a departure or an arrival.
    void PrintArrDepLine(DateTimeOffset date, string station) {
        Console.WriteLine($"{date:HH:mm} {station}");
    }

    foreach (var itinerary in data) {
        foreach (var train in itinerary.Trains) {
            PrintArrDepLine(train.DepartureDate, train.From);
            Console.WriteLine($" {train.TrainRank,-4} {train.TrainNumber,-5} ({train.Operator}), {train.Km,3} km via {string.Join(", ", train.IntermediateStops)}");
            PrintArrDepLine(train.ArrivalDate, train.To);
        }

        // Blank line between itineraries.
        Console.WriteLine();
    }
}
|
||||||
|
|
62
scraper/src/Models/Itinerary.cs
Normal file
62
scraper/src/Models/Itinerary.cs
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
|
||||||
|
namespace scraper.Models.Itinerary;
|
||||||
|
|
||||||
|
#region Interfaces
|
||||||
|
|
||||||
|
/// <summary>
/// Read-only view of a single itinerary: a list of trains.
/// </summary>
public interface IItinerary {
    /// <summary>The trains that make up this itinerary.</summary>
    IReadOnlyList<IItineraryTrain> Trains { get; }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Read-only view of one train inside an itinerary.
/// </summary>
public interface IItineraryTrain {
    /// <summary>Name of the station where this train is boarded.</summary>
    string From { get; }

    /// <summary>Name of the station where this train is left.</summary>
    string To { get; }

    /// <summary>Names of the stops listed between <see cref="From"/> and <see cref="To"/>.</summary>
    IReadOnlyList<string> IntermediateStops { get; }

    /// <summary>Departure date and time from <see cref="From"/>.</summary>
    DateTimeOffset DepartureDate { get; }

    /// <summary>Arrival date and time at <see cref="To"/>.</summary>
    DateTimeOffset ArrivalDate { get; }

    /// <summary>Distance covered by this train, in kilometres.</summary>
    int Km { get; }

    /// <summary>Name of the company operating the train.</summary>
    string Operator { get; }

    /// <summary>Rank (category) of the train.</summary>
    string TrainRank { get; }

    /// <summary>Number of the train, kept as text.</summary>
    string TrainNumber { get; }
}
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
#region Implementations
|
||||||
|
|
||||||
|
/// <summary>
/// Assembly-internal implementation of <see cref="IItinerary"/> whose train
/// list can be appended to while the itinerary is being built.
/// </summary>
internal record Itinerary : IItinerary {
    // Backing list; exposed to consumers only through the read-only Trains view.
    private readonly List<IItineraryTrain> _trains = new();

    /// <inheritdoc />
    public IReadOnlyList<IItineraryTrain> Trains => _trains;

    /// <summary>Appends an already-built train to the itinerary.</summary>
    /// <param name="train">The train to append.</param>
    internal void AddTrain(IItineraryTrain train) => _trains.Add(train);

    /// <summary>
    /// Creates a new <see cref="ItineraryTrain"/>, lets the caller fill it in
    /// and then appends it to the itinerary.
    /// </summary>
    /// <param name="configurator">Callback that populates the new train.</param>
    internal void AddTrain(Action<ItineraryTrain> configurator) {
        var created = new ItineraryTrain();
        configurator(created);
        AddTrain(created);
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Assembly-internal implementation of <see cref="IItineraryTrain"/>. The
/// properties are settable from inside the assembly only; every text
/// property starts out empty, the dates start at their default value and
/// the distance starts at zero.
/// </summary>
internal record ItineraryTrain : IItineraryTrain {
    // Backing list; exposed to consumers only through IntermediateStops.
    private readonly List<string> _intermediateStops = new();

    /// <inheritdoc />
    public string From { get; internal set; } = string.Empty;

    /// <inheritdoc />
    public string To { get; internal set; } = string.Empty;

    /// <inheritdoc />
    public IReadOnlyList<string> IntermediateStops => _intermediateStops;

    /// <inheritdoc />
    public DateTimeOffset DepartureDate { get; internal set; } = default;

    /// <inheritdoc />
    public DateTimeOffset ArrivalDate { get; internal set; } = default;

    /// <inheritdoc />
    public int Km { get; internal set; }

    /// <inheritdoc />
    public string Operator { get; internal set; } = string.Empty;

    /// <inheritdoc />
    public string TrainRank { get; internal set; } = string.Empty;

    /// <inheritdoc />
    public string TrainNumber { get; internal set; } = string.Empty;

    /// <summary>Appends a stop name to <see cref="IntermediateStops"/>.</summary>
    /// <param name="stop">Name of the stop to append.</param>
    internal void AddIntermediateStop(string stop) => _intermediateStops.Add(stop);
}
|
||||||
|
|
||||||
|
#endregion
|
207
scraper/src/Scrapers/Route.cs
Normal file
207
scraper/src/Scrapers/Route.cs
Normal file
|
@ -0,0 +1,207 @@
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Net;
|
||||||
|
using System.Net.Http;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using AngleSharp;
|
||||||
|
using AngleSharp.Dom;
|
||||||
|
using AngleSharp.Html.Dom;
|
||||||
|
using Flurl;
|
||||||
|
using InfoferScraper.Models.Train;
|
||||||
|
using NodaTime;
|
||||||
|
using NodaTime.Extensions;
|
||||||
|
using scraper.Models.Itinerary;
|
||||||
|
|
||||||
|
namespace InfoferScraper.Scrapers;
|
||||||
|
|
||||||
|
/// <summary>
/// Scrapes itineraries between two stations from the Infofer website
/// (<c>mersultrenurilor.infofer.ro</c>).
/// </summary>
public static class RouteScraper {
    private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
    private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];

    private static readonly CookieContainer CookieContainer = new();

    // One shared client for all requests; cookies set by the first request
    // are kept in CookieContainer and sent with the second one.
    private static readonly HttpClient HttpClient = new(new HttpClientHandler {
        CookieContainer = CookieContainer,
        UseCookies = true,
    }) {
        BaseAddress = new Uri(BaseUrl),
        DefaultRequestVersion = new Version(2, 0),
    };

    // "<km> km cu <rank> <number>"
    private static readonly Regex KmTrainRankNoRegex = new(@"^([0-9]+)\skm\scu\s([A-Z-]+)\s([0-9]+)$");
    // "Operat de <operator>"
    private static readonly Regex OperatorRegex = new(@$"^Operat\sde\s([{Utils.RoLetters}\s]+)$");
    // "Ple|Sos <day> <month abbreviation>[.] <hour>:<minute>"
    private static readonly Regex DepArrRegex = new(@"^(Ple|Sos)\s([0-9]+)\s([a-z]+)\.?\s([0-9]+):([0-9]+)$");

    // Month abbreviations as matched by DepArrRegex, mapped to month numbers.
    private static readonly Dictionary<string, int> Months = new() {
        ["ian"] = 1,
        ["feb"] = 2,
        ["mar"] = 3,
        ["apr"] = 4,
        ["mai"] = 5,
        ["iun"] = 6,
        ["iul"] = 7,
        ["aug"] = 8,
        ["sep"] = 9,
        ["oct"] = 10,
        ["noi"] = 11,
        ["dec"] = 12,
    };

    /// <summary>
    /// Looks up the itineraries from <paramref name="from"/> to <paramref name="to"/>.
    /// </summary>
    /// <remarks>
    /// Two requests are made: a GET of the search page, whose
    /// <c>form-search</c> inputs are collected, followed by a POST of those
    /// inputs to <c>Itineraries/GetItineraries</c>. The HTML returned by the
    /// POST is then parsed.
    /// </remarks>
    /// <param name="from">Departure station, used as a URL path segment.</param>
    /// <param name="to">Arrival station, used as a URL path segment.</param>
    /// <param name="dateOverride">
    /// Optional departure date. It is converted to the Europe/Bucharest zone
    /// and sent as the <c>DepartureDate</c> query parameter; when omitted the
    /// parameter is not sent.
    /// </param>
    /// <returns>
    /// The parsed itineraries, or <c>null</c> when the response contains no
    /// <c>body &gt; div</c> element.
    /// </returns>
    public static async Task<List<IItinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
        var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
        dateOverride = dateOverrideInstant?.ToDateTimeOffset();

        var asConfig = Configuration.Default;
        var asContext = BrowsingContext.New(asConfig);

        var firstUrl = "Rute-trenuri"
            .AppendPathSegment(from)
            .AppendPathSegment(to);
        if (dateOverride != null) {
            // Invariant culture: the current culture could otherwise select a
            // non-Gregorian calendar and produce a different year.
            firstUrl = firstUrl.SetQueryParam(
                "DepartureDate",
                dateOverride.Value.ToString("d.MM.yyyy", System.Globalization.CultureInfo.InvariantCulture)
            );
        }
        firstUrl = firstUrl.SetQueryParam("OrderingTypeId", "0");
        firstUrl = firstUrl.SetQueryParam("TimeSelectionId", "0");
        firstUrl = firstUrl.SetQueryParam("MinutesInDay", "0");
        firstUrl = firstUrl.SetQueryParam("ConnectionsTypeId", "1");
        firstUrl = firstUrl.SetQueryParam("BetweenTrainsMinimumMinutes", "5");
        firstUrl = firstUrl.SetQueryParam("ChangeStationName", "");

        var firstResponse = await HttpClient.GetStringAsync(firstUrl);
        var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
        var firstForm = firstDocument.GetElementById("form-search")!;

        // Every named input of the search form is posted back unchanged.
        var firstResult = firstForm
            .QuerySelectorAll<IHtmlInputElement>("input")
            .Where(elem => elem.Name != null)
            .ToDictionary(elem => elem.Name!, elem => elem.Value);

        var secondUrl = "".AppendPathSegments("Itineraries", "GetItineraries");
        // Dispose the response once its content has been read.
        using var secondResponse = await HttpClient.PostAsync(
            secondUrl,
#pragma warning disable CS8620
            new FormUrlEncodedContent(firstResult)
#pragma warning restore CS8620
        );
        var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
        var secondDocument = await asContext.OpenAsync(
            req => req.Content(secondResponseContent)
        );

        var (itineraryInfoDiv, _) = secondDocument
            .QuerySelectorAll("body > div");

        if (itineraryInfoDiv == null) {
            return null;
        }

        // The page shows Bucharest wall-clock times, so "now" is taken in the
        // same zone rather than in the machine's local zone. Computed once so
        // that every train of this scrape is compared to the same moment.
        var now = SystemClock.Instance.GetCurrentInstant().InZone(BucharestTz).LocalDateTime;

        var itinerariesLi = secondDocument
            .QuerySelectorAll("body > ul > li");
        var itineraries = new List<IItinerary>();
        foreach (var itineraryLi in itinerariesLi) {
            itineraries.Add(ParseItinerary(itineraryLi, now));
        }

        return itineraries;
    }

    /// <summary>Parses one itinerary card (<c>li</c>) of the results list.</summary>
    /// <param name="itineraryLi">The list item holding the itinerary card.</param>
    /// <param name="now">Current Bucharest local time, used to infer the year of the dates.</param>
    private static Itinerary ParseItinerary(IElement itineraryLi, LocalDateTime now) {
        var itinerary = new Itinerary();

        var cardDivs = itineraryLi.QuerySelectorAll(":scope > div > div > div > div");
        var detailsDivs = cardDivs[3]
            .QuerySelectorAll(":scope > div > div")[1]
            .QuerySelectorAll(":scope > div");
        var trainItineraryAndDetailsLis = detailsDivs[0]
            .QuerySelectorAll(":scope > ul > li");

        var stations = new List<string>();
        var details = new List<ItineraryTrain>();
        // The list alternates: station, train detail, station, train detail, ..., station.
        foreach (var (idx, li) in trainItineraryAndDetailsLis.Select((li, idx) => (idx, li))) {
            if (idx % 2 == 0) {
                // Station
                stations.Add(
                    li
                        .QuerySelectorAll(":scope > div > div > div > div")[1]
                        .Text()
                        .WithCollapsedSpaces()
                );
            }
            else {
                // Detail
                details.Add(ParseTrain(li, now));
            }
        }

        // The n-th train runs from the n-th station to the (n+1)-th one.
        foreach (var ((iFrom, iTo), detail) in stations.Zip(stations.Skip(1)).Zip(details)) {
            detail.From = iFrom;
            detail.To = iTo;
            itinerary.AddTrain(detail);
        }

        return itinerary;
    }

    /// <summary>
    /// Parses one train detail item. <c>From</c> and <c>To</c> are left empty;
    /// the caller fills them in from the neighbouring station items.
    /// </summary>
    /// <param name="li">The list item holding the train details.</param>
    /// <param name="now">Current Bucharest local time, used to infer the year of the dates.</param>
    private static ItineraryTrain ParseTrain(IElement li, LocalDateTime now) {
        var detailColumns = li.QuerySelectorAll(":scope > div > div");
        var leftSideDivs = detailColumns[0].QuerySelectorAll(":scope > div");

        var departureDateText = leftSideDivs[0]
            .QuerySelectorAll(":scope > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var departureDate = ParseDepArr(departureDateText, now);

        var arrivalDateText = leftSideDivs[3]
            .QuerySelectorAll(":scope > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var arrivalDate = ParseDepArr(arrivalDateText, now);

        var rightSideDivs = detailColumns[1].QuerySelectorAll(":scope > div > div");
        var trainInfoDivs = rightSideDivs[0].QuerySelectorAll(":scope > div > div");

        var kmRankNumberText = trainInfoDivs[0].Text().WithCollapsedSpaces();
        var kmRankNumberMatch = KmTrainRankNoRegex.Match(kmRankNumberText);

        var operatorText = trainInfoDivs[1].Text().WithCollapsedSpaces();
        var operatorMatch = OperatorRegex.Match(operatorText);

        var train = new ItineraryTrain {
            ArrivalDate = BucharestTz.AtLeniently(arrivalDate).ToDateTimeOffset(),
            DepartureDate = BucharestTz.AtLeniently(departureDate).ToDateTimeOffset(),
            Km = int.Parse(kmRankNumberMatch.Groups[1].Value),
            TrainRank = kmRankNumberMatch.Groups[2].Value,
            TrainNumber = kmRankNumberMatch.Groups[3].Value,
            Operator = operatorMatch.Groups[1].Value,
        };

        // Only every second child div (odd index) is taken as a stop name.
        foreach (var div in leftSideDivs[2]
                     .QuerySelectorAll(":scope > div")
                     .Where((_, i) => i % 2 != 0)) {
            train.AddIntermediateStop(div.Text().WithCollapsedSpaces());
        }

        return train;
    }

    /// <summary>
    /// Parses a departure/arrival text matched by <c>DepArrRegex</c> into a
    /// local date and time.
    /// </summary>
    /// <remarks>
    /// The text carries no year. The year of <paramref name="now"/> is
    /// assumed, and the result is moved one year forward when it would
    /// otherwise lie more than a day before <paramref name="now"/>.
    /// </remarks>
    /// <param name="text">The whitespace-collapsed text to parse.</param>
    /// <param name="now">Reference "current" local time.</param>
    /// <exception cref="KeyNotFoundException">
    /// <paramref name="text"/> does not match the expected pattern or uses an
    /// unknown month abbreviation.
    /// </exception>
    private static LocalDateTime ParseDepArr(string text, LocalDateTime now) {
        var match = DepArrRegex.Match(text);
        var month = Months[match.Groups[3].Value];
        var day = int.Parse(match.Groups[2].Value);
        var hour = int.Parse(match.Groups[4].Value);
        var minute = int.Parse(match.Groups[5].Value);

        var year = now.Year;
        // 29 Feb cannot be constructed in a non-leap year; such a date can
        // only belong to the following year.
        if (day > DateTime.DaysInMonth(year, month)) {
            year++;
        }

        var parsed = new LocalDateTime(year, month, day, hour, minute, 0);
        if (parsed < now.PlusDays(-1)) {
            parsed = parsed.PlusYears(1);
        }

        return parsed;
    }
}
|
Loading…
Add table
Reference in a new issue