Add initial itinerary scraping
This commit is contained in:
parent
de5d85cff4
commit
1d9db5b491
4 changed files with 302 additions and 1 deletions
|
@ -6,7 +6,7 @@
|
|||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFrameworks>net6.0;net7.0</TargetFrameworks>
|
||||
</PropertyGroup>
|
||||
|
||||
</Project>
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
using System;
|
||||
using System.Linq;
|
||||
using System.Text.Json;
|
||||
using System.Threading.Tasks;
|
||||
using InfoferScraper;
|
||||
|
@ -7,6 +8,7 @@ using InfoferScraper.Scrapers;
|
|||
while (true) {
|
||||
Console.WriteLine("1. Scrape Train");
|
||||
Console.WriteLine("2. Scrape Station");
|
||||
Console.WriteLine("3. Scrape Itineraries");
|
||||
Console.WriteLine("0. Exit");
|
||||
|
||||
var input = Console.ReadLine()?.Trim();
|
||||
|
@ -17,6 +19,9 @@ while (true) {
|
|||
case "2":
|
||||
await PrintStation();
|
||||
break;
|
||||
case "3":
|
||||
await ScrapeItineraries();
|
||||
break;
|
||||
case null:
|
||||
case "0":
|
||||
goto INPUT_LOOP_BREAK;
|
||||
|
@ -61,3 +66,30 @@ async Task PrintStation() {
|
|||
)
|
||||
);
|
||||
}
|
||||
// Prompts for a departure and an arrival station, scrapes the itineraries
// between them and prints every train of every itinerary to the console.
async Task ScrapeItineraries() {
    Console.Write("From station: ");
    var from = Console.ReadLine();
    Console.Write("To station: ");
    var to = Console.ReadLine();

    // Console.ReadLine returns null once the input stream is closed.
    if (from == null || to == null) return;

    var data = await RouteScraper.Scrape(from, to);

    // RouteScraper.Scrape is declared to return a nullable list; without this
    // guard the Count access below would throw a NullReferenceException.
    if (data == null) {
        Console.WriteLine("No itineraries found.");
        Console.WriteLine();
        return;
    }

    Console.WriteLine($"{data.Count} itineraries:");
    Console.WriteLine();

    // Prints one "HH:mm station" line; used for both ends of a train leg.
    void PrintArrDepLine(DateTimeOffset date, string station) {
        Console.WriteLine($"{date:HH:mm} {station}");
    }

    foreach (var itinerary in data) {
        foreach (var train in itinerary.Trains) {
            PrintArrDepLine(train.DepartureDate, train.From);
            Console.WriteLine($" {train.TrainRank,-4} {train.TrainNumber,-5} ({train.Operator}), {train.Km,3} km via {string.Join(", ", train.IntermediateStops)}");
            PrintArrDepLine(train.ArrivalDate, train.To);
        }

        // Blank line between consecutive itineraries.
        Console.WriteLine();
    }
}
|
||||
|
|
62
scraper/src/Models/Itinerary.cs
Normal file
62
scraper/src/Models/Itinerary.cs
Normal file
|
@ -0,0 +1,62 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace scraper.Models.Itinerary;
|
||||
|
||||
#region Interfaces
|
||||
|
||||
/// <summary>
/// One travel option, expressed as an ordered list of train legs.
/// </summary>
public interface IItinerary {
    /// <summary>The train legs that make up this itinerary.</summary>
    IReadOnlyList<IItineraryTrain> Trains { get; }
}
|
||||
|
||||
/// <summary>
/// A single train leg inside an <c>IItinerary</c>.
/// </summary>
public interface IItineraryTrain {
    /// <summary>Station where this leg starts.</summary>
    string From { get; }

    /// <summary>Station where this leg ends.</summary>
    string To { get; }

    /// <summary>Stops listed between <see cref="From"/> and <see cref="To"/>.</summary>
    IReadOnlyList<string> IntermediateStops { get; }

    /// <summary>Moment the train leaves <see cref="From"/>.</summary>
    DateTimeOffset DepartureDate { get; }

    /// <summary>Moment the train reaches <see cref="To"/>.</summary>
    DateTimeOffset ArrivalDate { get; }

    /// <summary>Length of this leg in kilometres.</summary>
    int Km { get; }

    /// <summary>Name of the company operating the train.</summary>
    string Operator { get; }

    /// <summary>Rank (category code) of the train.</summary>
    string TrainRank { get; }

    /// <summary>Number identifying the train.</summary>
    string TrainNumber { get; }
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Implementations
|
||||
|
||||
/// <summary>
/// Default <see cref="IItinerary"/> implementation; trains are appended
/// through the internal <c>AddTrain</c> overloads.
/// </summary>
internal record Itinerary : IItinerary {
    // Backing storage; callers only ever see it through the read-only Trains view.
    private readonly List<IItineraryTrain> _trains = new();

    /// <inheritdoc/>
    public IReadOnlyList<IItineraryTrain> Trains {
        get { return _trains; }
    }

    /// <summary>Appends an already built train leg.</summary>
    internal void AddTrain(IItineraryTrain train) => _trains.Add(train);

    /// <summary>
    /// Creates an empty <see cref="ItineraryTrain"/>, hands it to
    /// <paramref name="configurator"/> to be filled in, then appends it.
    /// </summary>
    internal void AddTrain(Action<ItineraryTrain> configurator) {
        var created = new ItineraryTrain();
        configurator(created);
        _trains.Add(created);
    }
}
|
||||
|
||||
/// <summary>
/// Default <see cref="IItineraryTrain"/> implementation. Every property is
/// settable from inside the assembly and starts out empty/zero.
/// </summary>
internal record ItineraryTrain : IItineraryTrain {
    // Backing storage for IntermediateStops.
    private readonly List<string> _intermediateStops = new();

    /// <inheritdoc/>
    public string From { get; internal set; } = string.Empty;

    /// <inheritdoc/>
    public string To { get; internal set; } = string.Empty;

    /// <inheritdoc/>
    public IReadOnlyList<string> IntermediateStops {
        get { return _intermediateStops; }
    }

    /// <inheritdoc/>
    public DateTimeOffset DepartureDate { get; internal set; } = default;

    /// <inheritdoc/>
    public DateTimeOffset ArrivalDate { get; internal set; } = default;

    /// <inheritdoc/>
    public int Km { get; internal set; }

    /// <inheritdoc/>
    public string Operator { get; internal set; } = string.Empty;

    /// <inheritdoc/>
    public string TrainRank { get; internal set; } = string.Empty;

    /// <inheritdoc/>
    public string TrainNumber { get; internal set; } = string.Empty;

    /// <summary>Appends <paramref name="stop"/> to <see cref="IntermediateStops"/>.</summary>
    internal void AddIntermediateStop(string stop) => _intermediateStops.Add(stop);
}
|
||||
|
||||
#endregion
|
207
scraper/src/Scrapers/Route.cs
Normal file
207
scraper/src/Scrapers/Route.cs
Normal file
|
@ -0,0 +1,207 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using System.Net.Http;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading.Tasks;
|
||||
using AngleSharp;
|
||||
using AngleSharp.Dom;
|
||||
using AngleSharp.Html.Dom;
|
||||
using Flurl;
|
||||
using InfoferScraper.Models.Train;
|
||||
using NodaTime;
|
||||
using NodaTime.Extensions;
|
||||
using scraper.Models.Itinerary;
|
||||
|
||||
namespace InfoferScraper.Scrapers;
|
||||
|
||||
/// <summary>
/// Scrapes train itineraries between two stations from the
/// mersultrenurilor.infofer.ro website.
/// </summary>
public static class RouteScraper {
    private const string BaseUrl = "https://mersultrenurilor.infofer.ro/ro-RO/";
    private static readonly DateTimeZone BucharestTz = DateTimeZoneProviders.Tzdb["Europe/Bucharest"];

    // Everything exchanged with the site is formatted/parsed culture-independently,
    // so the result does not depend on the machine's regional settings.
    private static readonly IFormatProvider Invariant = System.Globalization.CultureInfo.InvariantCulture;

    private static readonly CookieContainer CookieContainer = new();

    // One shared client; the cookie container carries cookies from the search
    // page request over to the itineraries request.
    private static readonly HttpClient HttpClient = new(new HttpClientHandler {
        CookieContainer = CookieContainer,
        UseCookies = true,
    }) {
        BaseAddress = new Uri(BaseUrl),
        DefaultRequestVersion = new Version(2, 0),
    };

    // "<km> km cu <RANK> <number>"
    private static readonly Regex KmTrainRankNoRegex = new(@"^([0-9]+)\skm\scu\s([A-Z-]+)\s([0-9]+)$");
    // "Operat de <operator>"
    private static readonly Regex OperatorRegex = new(@$"^Operat\sde\s([{Utils.RoLetters}\s]+)$");
    // "Ple|Sos <day> <month abbreviation>[.] <hour>:<minute>"
    private static readonly Regex DepArrRegex = new(@"^(Ple|Sos)\s([0-9]+)\s([a-z]+)\.?\s([0-9]+):([0-9]+)$");

    // Month abbreviations accepted in departure/arrival labels, mapped to month numbers.
    private static readonly Dictionary<string, int> Months = new() {
        ["ian"] = 1,
        ["feb"] = 2,
        ["mar"] = 3,
        ["apr"] = 4,
        ["mai"] = 5,
        ["iun"] = 6,
        ["iul"] = 7,
        ["aug"] = 8,
        ["sep"] = 9,
        ["oct"] = 10,
        ["noi"] = 11,
        ["dec"] = 12,
    };

    /// <summary>
    /// Fetches the itineraries from <paramref name="from"/> to <paramref name="to"/>.
    /// </summary>
    /// <param name="from">Departure station, used as a URL path segment.</param>
    /// <param name="to">Arrival station, used as a URL path segment.</param>
    /// <param name="dateOverride">
    /// Optional day to search for. It is converted to the Europe/Bucharest zone
    /// before being sent; when omitted no <c>DepartureDate</c> parameter is sent.
    /// </param>
    /// <returns>
    /// The scraped itineraries, or <c>null</c> when the response contains no
    /// top-level <c>div</c> element.
    /// </returns>
    /// <exception cref="FormatException">
    /// A departure/arrival label or a train summary on the page does not have
    /// the expected shape.
    /// </exception>
    public static async Task<List<IItinerary>?> Scrape(string from, string to, DateTimeOffset? dateOverride = null) {
        var dateOverrideInstant = dateOverride?.ToInstant().InZone(BucharestTz);
        dateOverride = dateOverrideInstant?.ToDateTimeOffset();

        // The page prints dates without a year, so a reference moment is needed to
        // pick one. Parsed times are interpreted in Bucharest time further down, so
        // the reference is taken in that zone too: the requested day when there is
        // one, the current moment otherwise.
        var referenceTime = dateOverrideInstant?.LocalDateTime
            ?? SystemClock.Instance.GetCurrentInstant().InZone(BucharestTz).LocalDateTime;

        var asConfig = Configuration.Default;
        var asContext = BrowsingContext.New(asConfig);

        var firstUrl = "Rute-trenuri"
            .AppendPathSegment(from)
            .AppendPathSegment(to);
        if (dateOverride != null) {
            firstUrl = firstUrl.SetQueryParam(
                "DepartureDate",
                dateOverride.Value.ToString("d.MM.yyyy", Invariant)
            );
        }
        firstUrl = firstUrl.SetQueryParam("OrderingTypeId", "0");
        firstUrl = firstUrl.SetQueryParam("TimeSelectionId", "0");
        firstUrl = firstUrl.SetQueryParam("MinutesInDay", "0");
        firstUrl = firstUrl.SetQueryParam("ConnectionsTypeId", "1");
        firstUrl = firstUrl.SetQueryParam("BetweenTrainsMinimumMinutes", "5");
        firstUrl = firstUrl.SetQueryParam("ChangeStationName", "");

        // Step 1: load the search page and collect the named inputs of its form.
        var firstResponse = await HttpClient.GetStringAsync(firstUrl);
        var firstDocument = await asContext.OpenAsync(req => req.Content(firstResponse));
        var firstForm = firstDocument.GetElementById("form-search")!;

        var firstResult = firstForm
            .QuerySelectorAll<IHtmlInputElement>("input")
            .Where(elem => elem.Name != null)
            .ToDictionary(elem => elem.Name!, elem => elem.Value);

        // Step 2: post those inputs back to obtain the itineraries markup.
        var secondUrl = "".AppendPathSegments("Itineraries", "GetItineraries");
        using var secondResponse = await HttpClient.PostAsync(
            secondUrl,
#pragma warning disable CS8620
            new FormUrlEncodedContent(firstResult)
#pragma warning restore CS8620
        );
        var secondResponseContent = await secondResponse.Content.ReadAsStringAsync();
        var secondDocument = await asContext.OpenAsync(
            req => req.Content(secondResponseContent)
        );

        var (itineraryInfoDiv, _) = secondDocument
            .QuerySelectorAll("body > div");

        if (itineraryInfoDiv == null) {
            return null;
        }

        var itineraries = new List<IItinerary>();
        foreach (var itineraryLi in secondDocument.QuerySelectorAll("body > ul > li")) {
            itineraries.Add(ParseItinerary(itineraryLi, referenceTime));
        }

        return itineraries;
    }

    // Builds one itinerary out of its <li> card.
    private static Itinerary ParseItinerary(IElement itineraryLi, LocalDateTime referenceTime) {
        var itinerary = new Itinerary();

        var cardDivs = itineraryLi.QuerySelectorAll(":scope > div > div > div > div");
        var detailsDivs = cardDivs[3]
            .QuerySelectorAll(":scope > div > div")[1]
            .QuerySelectorAll(":scope > div");
        var trainItineraryAndDetailsLis = detailsDivs[0]
            .QuerySelectorAll(":scope > ul > li");

        // The list alternates: station, train detail, station, train detail, ..., station.
        var stations = new List<string>();
        var details = new List<ItineraryTrain>();
        foreach (var (idx, li) in trainItineraryAndDetailsLis.Select((li, idx) => (idx, li))) {
            if (idx % 2 == 0) {
                stations.Add(
                    li
                        .QuerySelectorAll(":scope > div > div > div > div")[1]
                        .Text()
                        .WithCollapsedSpaces()
                );
            }
            else {
                details.Add(ParseTrain(li, referenceTime));
            }
        }

        // Train i runs from station i to station i + 1.
        foreach (var ((iFrom, iTo), detail) in stations.Zip(stations.Skip(1)).Zip(details)) {
            detail.From = iFrom;
            detail.To = iTo;
            itinerary.AddTrain(detail);
        }

        return itinerary;
    }

    // Builds one train leg (everything except From/To) out of its detail <li>.
    private static ItineraryTrain ParseTrain(IElement li, LocalDateTime referenceTime) {
        var detailColumns = li.QuerySelectorAll(":scope > div > div");
        var leftSideDivs = detailColumns[0].QuerySelectorAll(":scope > div");

        var departureDateText = leftSideDivs[0]
            .QuerySelectorAll(":scope > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var departureDate = ParseDepArrDate(departureDateText, referenceTime);

        var arrivalDateText = leftSideDivs[3]
            .QuerySelectorAll(":scope > div")[1]
            .Text()
            .WithCollapsedSpaces();
        var arrivalDate = ParseDepArrDate(arrivalDateText, referenceTime);

        var rightSideDivs = detailColumns[1].QuerySelectorAll(":scope > div > div");
        var kmRankNumberText = rightSideDivs[0]
            .QuerySelectorAll(":scope > div > div")[0]
            .Text()
            .WithCollapsedSpaces();
        var kmRankNumberMatch = KmTrainRankNoRegex.Match(kmRankNumberText);
        if (!kmRankNumberMatch.Success) {
            throw new FormatException($"Unrecognised train summary: \"{kmRankNumberText}\"");
        }

        var operatorText = rightSideDivs[0]
            .QuerySelectorAll(":scope > div > div")[1]
            .Text()
            .WithCollapsedSpaces();
        // Best effort: when the text does not match, the operator stays empty.
        var operatorMatch = OperatorRegex.Match(operatorText);

        var train = new ItineraryTrain {
            ArrivalDate = BucharestTz.AtLeniently(arrivalDate).ToDateTimeOffset(),
            DepartureDate = BucharestTz.AtLeniently(departureDate).ToDateTimeOffset(),
            Km = int.Parse(kmRankNumberMatch.Groups[1].Value, Invariant),
            TrainRank = kmRankNumberMatch.Groups[2].Value,
            TrainNumber = kmRankNumberMatch.Groups[3].Value,
            Operator = operatorMatch.Groups[1].Value,
        };

        // Only every second child <div> (odd index) is taken as an intermediate stop.
        foreach (var div in leftSideDivs[2]
                     .QuerySelectorAll(":scope > div")
                     .Where((_, i) => i % 2 != 0)) {
            train.AddIntermediateStop(div.Text().WithCollapsedSpaces());
        }

        return train;
    }

    // Parses a departure/arrival label accepted by DepArrRegex (for example
    // "Ple 5 ian. 14:30") into a local date-time. The label has no year: the year
    // of referenceTime is used, and the result is moved one year forward when it
    // would otherwise lie more than a day before referenceTime.
    private static LocalDateTime ParseDepArrDate(string text, LocalDateTime referenceTime) {
        var match = DepArrRegex.Match(text);
        if (!match.Success || !Months.TryGetValue(match.Groups[3].Value, out var month)) {
            throw new FormatException($"Unrecognised departure/arrival time: \"{text}\"");
        }

        var date = new LocalDateTime(
            referenceTime.Year,
            month,
            int.Parse(match.Groups[2].Value, Invariant),
            int.Parse(match.Groups[4].Value, Invariant),
            int.Parse(match.Groups[5].Value, Invariant),
            0
        );

        return date < referenceTime.PlusDays(-1) ? date.PlusYears(1) : date;
    }
}
|
Loading…
Add table
Reference in a new issue