Added station arr/dep scraper
Added scraper for arrivals and departures at station
This commit is contained in:
parent 5e65675a08
commit 0a7e2b2568
15 changed files with 899 additions and 274 deletions
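For orientation, a minimal sketch of how the station scraper added by this commit might be invoked once the package is installed; the module path and result keys come from the files below, while the station name and printed fields are only illustrative:

# Hypothetical usage of the new station scraper (keys per scrape_station_schema_v2.json below).
from scraper.scrape_station import scrape as scrape_station

data = scrape_station('Brasov')  # example station name, not taken from the commit
print(data['stationName'], data['date'])
for item in data['departures']:
    train = item['train']
    # 'time' is an ISO 8601 timestamp; departures carry 'destination', arrivals carry 'origin'
    print(item['time'], train['rank'], train['number'], train['destination'])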
scraper/schemas.py (Normal file, 20 lines)

@@ -0,0 +1,20 @@
from contextlib import ExitStack as _ExitStack

_es = _ExitStack()

def _load_file(name: str):
    import json
    from os.path import join, dirname
    dir = dirname(__file__)

    return json.load(_es.enter_context(open(join(dir, name))))

TRAIN_INFO_SCHEMA = {
    'v1': _load_file('scrape_train_schema.json'),
    'v2': _load_file('scrape_train_schema_v2.json'),
}
STATION_SCHEMA = {
    'v2': _load_file('scrape_station_schema_v2.json'),
}

_es.close()
scraper/scrape_station.py (Normal file, 87 lines)

@@ -0,0 +1,87 @@
import re

from datetime import datetime, timedelta

import pytz
import requests
from bs4 import BeautifulSoup

from .utils import *

# region regex definitions

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

STATION_INFO_REGEX = re.compile(rf'^([{RO_LETTERS} ]+) în ([0-9.]+)$')

STOPPING_TIME_REGEX = re.compile(r'^(necunoscută \(stație terminus\))|(?:([0-9]+) min \((?:începând cu|până la) ([0-9]{1,2}:[0-9]{2})\))$')

# endregion

def scrape(station_name: str):
    station_name = ro_letters_to_en(station_name)
    # Start scrapping session
    s = requests.Session()

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Statie/{station}',
        station=station_name.replace(' ', '-'),
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Stations/StationsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    station_info_div, _, departures_div, arrivals_div, *_ = soup('div', recursive=False)

    scraped['stationName'], scraped['date'] = STATION_INFO_REGEX.match(collapse_space(station_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)
    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')

    def parse_arrdep_list(elem, end_station_field_name):
        def parse_item(elem):
            result = {}

            try:
                data_div, status_div = elem('div', recursive=False)
            except ValueError:
                data_div, *_ = elem('div', recursive=False)
                status_div = None
            data_main_div, data_details_div = data_div('div', recursive=False)
            time_div, dest_div, train_div, *_ = data_main_div('div', recursive=False)
            operator_div, route_div, stopping_time_div = data_details_div.div('div', recursive=False)

            result['time'] = collapse_space(time_div.div.div('div', recursive=False)[1].text)
            st_hr, st_min = (int(comp) for comp in result['time'].split(':'))
            result['time'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()

            unknown_st, st, st_opposite_time = STOPPING_TIME_REGEX.match(
                collapse_space(stopping_time_div.div('div', recursive=False)[1].text)
            ).groups()
            if unknown_st:
                result['stoppingTime'] = None
            elif st:
                result['stoppingTime'] = int(st)

            result['train'] = {}
            result['train']['rank'] = collapse_space(train_div.div.div('div', recursive=False)[1].span.text)
            result['train']['number'] = collapse_space(train_div.div.div('div', recursive=False)[1].a.text)
            result['train'][end_station_field_name] = collapse_space(dest_div.div.div('div', recursive=False)[1].text)
            result['train']['operator'] = collapse_space(operator_div.div('div', recursive=False)[1].text)
            result['train']['route'] = collapse_space(route_div.div('div', recursive=False)[1].text).split(' - ')

            return result

        return [parse_item(elem) for elem in elem.div.ul('li', recursive=False)]

    scraped['departures'] = parse_arrdep_list(departures_div, 'destination')
    scraped['arrivals'] = parse_arrdep_list(arrivals_div, 'origin')

    return scraped
scraper/scrape_station_schema_v2.json (Normal file, 137 lines)

@@ -0,0 +1,137 @@
{
  "$schema": "http://json-schema.org/schema",
  "title": "Train Info InfoFer Scrap Station Schema",
  "description": "Results of scrapping InfoFer website for station arrival/departure info",
  "definitions": {
    "arrDepItem": {
      "type": "object",
      "properties": {
        "time": {
          "description": "Time of arrival/departure",
          "type": "string",
          "format": "date-time"
        },
        "train": {
          "type": "object",
          "properties": {
            "rank": {
              "type": "string",
              "examples": [
                "R",
                "R-E",
                "IR",
                "IRN"
              ]
            },
            "number": {
              "type": "string",
              "examples": [
                "74",
                "15934"
              ]
            },
            "operator": {
              "type": "string",
              "examples": [
                "CFR Călători",
                "Softrans",
                "Regio Călători"
              ]
            },
            "route": {
              "description": "All the stations the train stops at",
              "type": "array",
              "items": {
                "type": "string"
              }
            }
          },
          "required": [
            "rank",
            "number",
            "operator"
          ]
        },
        "stoppingTime": {
          "type": [
            "integer",
            "null"
          ],
          "minimum": 1
        }
      },
      "required": [
        "time",
        "train",
        "stoppingTime"
      ]
    }
  },
  "type": "object",
  "properties": {
    "arrivals": {
      "type": "array",
      "items": {
        "allOf": [
          {
            "$ref": "#/definitions/arrDepItem"
          },
          {
            "type": "object",
            "properties": {
              "train": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string"
                  }
                },
                "required": ["origin"]
              }
            },
            "required": ["train"]
          }
        ]
      }
    },
    "departures": {
      "type": "array",
      "items": {
        "allOf": [
          {
            "$ref": "#/definitions/arrDepItem"
          },
          {
            "type": "object",
            "properties": {
              "train": {
                "type": "object",
                "properties": {
                  "destination": {
                    "type": "string"
                  }
                },
                "required": ["destination"]
              }
            },
            "required": ["train"]
          }
        ]
      }
    },
    "stationName": {
      "type": "string"
    },
    "date": {
      "description": "Date for which the data is provided (likely today)",
      "type": "string",
      "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
    }
  },
  "required": [
    "arrivals",
    "departures",
    "stationName",
    "date"
  ]
}
scraper/scrape_train.py (Normal file, 143 lines)

@@ -0,0 +1,143 @@
import re

from datetime import datetime, timedelta

import pytz
import requests
from bs4 import BeautifulSoup

from .utils import *

# region regex definitions

TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$')

OPERATOR_REGEX = re.compile(r'^Operat de (.+)$')

SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
SL_STATE_MAP = {
    't': 'passing',
    's': 'arrival',
    'p': 'departure',
}

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')

KM_REGEX = re.compile(r'^km ([0-9]+)$')

PLATFORM_REGEX = re.compile(r'^linia (.+)$')

STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')

STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')

# endregion

def scrape(train_no: int, use_yesterday=False, date_override=None):
    # Start scrapping session
    s = requests.Session()

    date = datetime.today()
    if use_yesterday:
        date -= timedelta(days=1)
    if date_override:
        date = date_override

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
        train_no=train_no,
        query=[
            ('Date', date.strftime('%d.%m.%Y')),
        ],
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    train_info_div, _, _, results_div, *_ = soup('div', recursive=False)

    train_info_div = train_info_div.div('div', recursive=False)[0]

    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)

    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]

    results_div = results_div.div
    status_div = results_div('div', recursive=False)[0]
    route_text = collapse_space(status_div.h4.text)
    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
    scraped['route'] = {
        'from': route_from,
        'to': route_to,
    }
    try:
        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
        scraped['status'] = {
            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
            'station': slm_station,
            'state': SL_STATE_MAP[slm_arrival[0]],
        }
    except Exception:
        scraped['status'] = None

    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')
    for station in stations:
        station_scraped = {}

        left, middle, right = station.div('div', recursive=False)
        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
        if not station_scraped['stoppingTime']:
            station_scraped['stoppingTime'] = None
        else:
            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
        if not station_scraped['platform']:
            station_scraped['platform'] = None
        else:
            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]

        def scrape_time(elem, setter):
            parts = elem.div.div('div', recursive=False)
            if parts:
                result = {}

                time, *_ = parts
                result['scheduleTime'] = collapse_space(time.text)
                st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
                result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
                if len(parts) >= 2:
                    _, status, *_ = parts
                    result['status'] = {}
                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
                    result['status']['delay'] = 0 if on_time else int(delay)
                    result['status']['real'] = not approx
                else:
                    result['status'] = None

                setter(result)
            else:
                setter(None)

        scrape_time(left, lambda value: station_scraped.update(arrival=value))
        scrape_time(right, lambda value: station_scraped.update(departure=value))

        scraped['stations'].append(station_scraped)

    return scraped
scraper/scrape_train_schema.json (Normal file, 134 lines)

@@ -0,0 +1,134 @@
{
  "$schema": "http://json-schema.org/schema",
  "title": "Train Info InfoFer Scrap Train Schema",
  "description": "Results of scrapping InfoFer website for train info",
  "definitions": {
    "delayType": {
      "description": "Delay of the train (negative for being early)",
      "type": "integer"
    },
    "stationArrDepTime": {
      "description": "Time of arrival at/departure from station",
      "type": ["object", "null"],
      "properties": {
        "scheduleTime": {
          "description": "The time the train is scheduled to arrive/depart",
          "type": "string",
          "pattern": "^[0-9]{1,2}:[0-9]{2}$"
        },
        "status": {
          "type": ["object", "null"],
          "properties": {
            "delay": {
              "$ref": "#/definitions/delayType"
            },
            "real": {
              "description": "Determines whether delay was actually reported or is an approximation",
              "type": "boolean"
            }
          },
          "required": ["delay", "real"]
        }
      },
      "required": ["scheduleTime"]
    }
  },
  "type": "object",
  "properties": {
    "rank": {
      "description": "The rank of the train",
      "type": "string",
      "examples": [
        "R",
        "R-E",
        "IR",
        "IRN"
      ]
    },
    "number": {
      "description": "The number of the train",
      "type": "string",
      "examples": [
        "74",
        "15934"
      ]
    },
    "date": {
      "description": "Date of departure from the first station (dd.mm.yyyy)",
      "type": "string",
      "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
    },
    "operator": {
      "description": "Operator of the train",
      "type": "string",
      "examples": [
        "CFR Călători",
        "Softrans",
        "Regio Călători"
      ]
    },
    "route": {
      "description": "Route of the train",
      "type": "object",
      "properties": {
        "from": {
          "type": "string"
        },
        "to": {
          "type": "string"
        }
      },
      "required": ["from", "to"]
    },
    "status": {
      "description": "Current status of the train",
      "type": ["object", "null"],
      "properties": {
        "delay": {
          "$ref": "#/definitions/delayType"
        },
        "station": {
          "type": "string"
        },
        "state": {
          "type": "string",
          "enum": ["passing", "arrival", "departure"]
        }
      },
      "required": ["delay", "station", "state"]
    },
    "stations": {
      "description": "List of stations the train stops at",
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {
            "type": "string"
          },
          "km": {
            "description": "The distance the train travelled until reaching this station",
            "type": "integer"
          },
          "stoppingTime": {
            "description": "The number of minutes the train is scheduled to stop in this station",
            "type": ["integer", "null"],
            "minimum": 1
          },
          "platform": {
            "description": "The platform the train stopped at",
            "type": ["string", "null"]
          },
          "arrival": {
            "$ref": "#/definitions/stationArrDepTime"
          },
          "departure": {
            "$ref": "#/definitions/stationArrDepTime"
          }
        },
        "required": ["name", "km"]
      }
    }
  },
  "required": ["route", "stations", "rank", "number", "date", "operator"]
}
@@ -1,6 +1,6 @@
{
  "$schema": "http://json-schema.org/schema",
  "title": "Train Info InfoFer Scrap Result Schema",
  "title": "Train Info InfoFer Scrap Train Schema",
  "description": "Results of scrapping InfoFer website for train info",
  "definitions": {
    "delayType": {

@@ -13,7 +13,8 @@
      "properties": {
        "scheduleTime": {
          "description": "The time the train is scheduled to arrive/depart",
          "type": "string"
          "type": "string",
          "format": "date-time"
        },
        "status": {
          "type": ["object", "null"],

@@ -37,14 +38,6 @@
    "rank": {
      "description": "The rank of the train",
      "type": "string",
      "examples": [
        "74",
        "15934"
      ]
    },
    "number": {
      "description": "The number of the train",
      "type": "string",
      "examples": [
        "R",
        "R-E",

@@ -52,9 +45,18 @@
        "IRN"
      ]
    },
    "number": {
      "description": "The number of the train",
      "type": "string",
      "examples": [
        "74",
        "15934"
      ]
    },
    "date": {
      "description": "Date of departure from the first station",
      "type": "string"
      "description": "Date of departure from the first station (dd.mm.yyyy)",
      "type": "string",
      "pattern": "^[0-9]{1,2}\\.[0-9]{2}\\.[0-9]{4}$"
    },
    "operator": {
      "description": "Operator of the train",
@@ -1,177 +1,12 @@
#! /usr/bin/env python3

from datetime import datetime, timedelta
import re

import pytz
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote, urlencode

TRAIN_INFO_REGEX = re.compile(r'^([A-Z-]+) ([0-9]+) în ([0-9.]+)$')

OPERATOR_REGEX = re.compile(r'^Operat de (.+)$')

SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
SL_STATE_MAP = {
    't': 'passing',
    's': 'arrival',
    'p': 'departure',
}

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')

KM_REGEX = re.compile(r'^km ([0-9]+)$')

PLATFORM_REGEX = re.compile(r'^linia (.+)$')

STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')

STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')

class DateTimeSequencer:
    def __init__(self, year: int, month: int, day: int) -> None:
        self.current = datetime(year, month, day, 0, 0, 0)
        self.current -= timedelta(seconds=1)

    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
        if (self.current > potential_new_date):
            potential_new_date += timedelta(days=1)
        self.current = potential_new_date
        return self.current

def collapse_space(string: str) -> str:
    return re.sub(
        rf'[{BeautifulSoup.ASCII_SPACES}]+',
        ' ',
        string,
        flags=re.MULTILINE
    ).strip()

def build_url(base: str, /, query: dict, **kwargs):
    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
    if query:
        result += '?'
        result += urlencode(query)
    return result

def scrape(train_no: int, use_yesterday=False, date_override=None):
    # Start scrapping session
    s = requests.Session()

    date = datetime.today()
    if use_yesterday:
        date -= timedelta(days=1)
    if date_override:
        date = date_override

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
        train_no=train_no,
        query=[
            ('Date', date.strftime('%d.%m.%Y')),
        ],
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    train_info_div, _, _, results_div, *_ = soup('div', recursive=False)

    train_info_div = train_info_div.div('div', recursive=False)[0]

    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)

    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]

    results_div = results_div.div
    status_div = results_div('div', recursive=False)[0]
    route_text = collapse_space(status_div.h4.text)
    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
    scraped['route'] = {
        'from': route_from,
        'to': route_to,
    }
    try:
        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
        scraped['status'] = {
            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
            'station': slm_station,
            'state': SL_STATE_MAP[slm_arrival[0]],
        }
    except Exception:
        scraped['status'] = None

    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')
    for station in stations:
        station_scraped = {}

        left, middle, right = station.div('div', recursive=False)
        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
        if not station_scraped['stoppingTime']:
            station_scraped['stoppingTime'] = None
        else:
            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
        if not station_scraped['platform']:
            station_scraped['platform'] = None
        else:
            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]

        def scrape_time(elem, setter):
            parts = elem.div.div('div', recursive=False)
            if parts:
                result = {}

                time, *_ = parts
                result['scheduleTime'] = collapse_space(time.text)
                st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
                result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
                if len(parts) >= 2:
                    _, status, *_ = parts
                    result['status'] = {}
                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
                    result['status']['delay'] = 0 if on_time else int(delay)
                    result['status']['real'] = not approx
                else:
                    result['status'] = None

                setter(result)
            else:
                setter(None)

        scrape_time(left, lambda value: station_scraped.update(arrival=value))
        scrape_time(right, lambda value: station_scraped.update(departure=value))

        scraped['stations'].append(station_scraped)

    return scraped

from .scrape_train import scrape as scrape_train
from .scrape_station import scrape as scrape_station

def main():
    train_no = 1538
    print(f'Testing package with train number {train_no}')
    from pprint import pprint
    # pprint(scrape('473'))
    pprint(scrape(train_no))
    pprint(scrape_train(train_no))

if __name__ == '__main__':
    main()
scraper/utils.py (Normal file, 79 lines)

@@ -0,0 +1,79 @@
import re

from datetime import datetime, timedelta
from urllib.parse import urlencode, quote

# From: https://en.wikipedia.org/wiki/Whitespace_character#Unicode
ASCII_WHITESPACE = [
    '\u0009', # HT; Character Tabulation
    '\u000a', # LF
    '\u000b', # VT; Line Tabulation
    '\u000c', # FF; Form Feed
    '\u000d', # CR
    '\u0020', # Space
]

WHITESPACE = ASCII_WHITESPACE + [
    '\u0085', # NEL; Next Line
    '\u00a0', # No-break Space;
    '\u1680', # Ogham Space Mark
    '\u2000', # En Quad
    '\u2001', # Em Quad
    '\u2002', # En Space
    '\u2003', # Em Space
    '\u2004', # Three-per-em Space
    '\u2005', # Four-per-em Space
    '\u2006', # Six-per-em Space
    '\u2007', # Figure Space
    '\u2008', # Punctuation Space
    '\u2009', # Thin Space
    '\u200A', # Hair Space
    '\u2028', # Line Separator
    '\u2029', # Paragraph Separator
    '\u202f', # Narrow No-break Space
    '\u205d', # Meduam Mathematical Space
    '\u3000', # Ideographic Space
]

WHITESPACE_REGEX = re.compile(rf'[{"".join(WHITESPACE)}]+', flags=re.MULTILINE)

class DateTimeSequencer:
    def __init__(self, year: int, month: int, day: int) -> None:
        self.current = datetime(year, month, day, 0, 0, 0)
        self.current -= timedelta(seconds=1)

    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
        if (self.current > potential_new_date):
            potential_new_date += timedelta(days=1)
        self.current = potential_new_date
        return self.current

def collapse_space(string: str) -> str:
    return WHITESPACE_REGEX.sub(
        ' ',
        string,
    ).strip()

def build_url(base: str, /, query: dict = {}, **kwargs):
    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
    if query:
        result += '?'
        result += urlencode(query)
    return result

RO_TO_EN = {
    'ă': 'a',
    'Ă': 'A',
    'â': 'a',
    'Â': 'A',
    'î': 'i',
    'Î': 'I',
    'ș': 's',
    'Ș': 'S',
    'ț': 't',
    'Ț': 'T',
}

def ro_letters_to_en(string: str) -> str:
    return ''.join((RO_TO_EN.get(letter, letter) for letter in string))
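For orientation, a small illustrative sketch of how the helpers in the new utils.py behave; the values below are made-up examples, not output captured from the scraper:

# Illustrative only; assumes scraper/utils.py from this commit is on the import path.
from scraper.utils import DateTimeSequencer, collapse_space, ro_letters_to_en

seq = DateTimeSequencer(2021, 8, 1)
seq(23, 50)   # 2021-08-01 23:50:00
seq(0, 10)    # rolls over to 2021-08-02 00:10:00, since times are assumed non-decreasing

collapse_space('  Gara   de\u00a0Nord  ')   # 'Gara de Nord'
ro_letters_to_en('Brașov')                  # 'Brasov'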
@@ -7,6 +7,7 @@ name = "pypi"
flask = "*"
gevent = "*"
scraper = { editable = true, path = '../scraper' }
jsonschema = "*"

[dev-packages]

server/Pipfile.lock (generated, 114 lines)

@@ -1,7 +1,7 @@
{
    "_meta": {
        "hash": {
            "sha256": "9d422680ab15ce184b043276f5d0d2cac228ff60dfc66ec193b6314bdc0f6ce2"
            "sha256": "3c7f09679bdd68674754a714ee39503cf1a3ae265400eea074fec83559246dff"
        },
        "pipfile-spec": 6,
        "requires": {

@@ -16,6 +16,14 @@
        ]
    },
    "default": {
        "attrs": {
            "hashes": [
                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
            "version": "==21.2.0"
        },
        "beautifulsoup4": {
            "hashes": [
                "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",

@@ -31,57 +39,6 @@
            ],
            "version": "==2021.5.30"
        },
        "cffi": {
            "hashes": [
                "sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d",
                "sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771",
                "sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872",
                "sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c",
                "sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc",
                "sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762",
                "sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202",
                "sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5",
                "sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548",
                "sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a",
                "sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f",
                "sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20",
                "sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218",
                "sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c",
                "sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e",
                "sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56",
                "sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224",
                "sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a",
                "sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2",
                "sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a",
                "sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819",
                "sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346",
                "sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b",
                "sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e",
                "sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534",
                "sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb",
                "sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0",
                "sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156",
                "sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd",
                "sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87",
                "sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc",
                "sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195",
                "sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33",
                "sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f",
                "sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d",
                "sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd",
                "sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728",
                "sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7",
                "sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca",
                "sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99",
                "sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf",
                "sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e",
                "sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c",
                "sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5",
                "sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"
            ],
            "markers": "platform_python_implementation == 'CPython' and sys_platform == 'win32'",
            "version": "==1.14.6"
        },
        "charset-normalizer": {
            "hashes": [
                "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",

@@ -98,14 +55,6 @@
            "markers": "python_version >= '3.6'",
            "version": "==8.0.1"
        },
        "colorama": {
            "hashes": [
                "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b",
                "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"
            ],
            "markers": "platform_system == 'Windows'",
            "version": "==0.4.4"
        },
        "flask": {
            "hashes": [
                "sha256:1c4c257b1892aec1398784c63791cbaa43062f1f7aeb555c4da961b20ee68f55",

@@ -230,6 +179,14 @@
            "markers": "python_version >= '3.6'",
            "version": "==3.0.1"
        },
        "jsonschema": {
            "hashes": [
                "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163",
                "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"
            ],
            "index": "pypi",
            "version": "==3.2.0"
        },
        "markupsafe": {
            "hashes": [
                "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",

@@ -290,13 +247,32 @@
            "markers": "python_version >= '3.6'",
            "version": "==2.0.1"
        },
        "pycparser": {
        "pyrsistent": {
            "hashes": [
                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
                "sha256:097b96f129dd36a8c9e33594e7ebb151b1515eb52cceb08474c10a5479e799f2",
                "sha256:2aaf19dc8ce517a8653746d98e962ef480ff34b6bc563fc067be6401ffb457c7",
                "sha256:404e1f1d254d314d55adb8d87f4f465c8693d6f902f67eb6ef5b4526dc58e6ea",
                "sha256:48578680353f41dca1ca3dc48629fb77dfc745128b56fc01096b2530c13fd426",
                "sha256:4916c10896721e472ee12c95cdc2891ce5890898d2f9907b1b4ae0f53588b710",
                "sha256:527be2bfa8dc80f6f8ddd65242ba476a6c4fb4e3aedbf281dfbac1b1ed4165b1",
                "sha256:58a70d93fb79dc585b21f9d72487b929a6fe58da0754fa4cb9f279bb92369396",
                "sha256:5e4395bbf841693eaebaa5bb5c8f5cdbb1d139e07c975c682ec4e4f8126e03d2",
                "sha256:6b5eed00e597b5b5773b4ca30bd48a5774ef1e96f2a45d105db5b4ebb4bca680",
                "sha256:73ff61b1411e3fb0ba144b8f08d6749749775fe89688093e1efef9839d2dcc35",
                "sha256:772e94c2c6864f2cd2ffbe58bb3bdefbe2a32afa0acb1a77e472aac831f83427",
                "sha256:773c781216f8c2900b42a7b638d5b517bb134ae1acbebe4d1e8f1f41ea60eb4b",
                "sha256:a0c772d791c38bbc77be659af29bb14c38ced151433592e326361610250c605b",
                "sha256:b29b869cf58412ca5738d23691e96d8aff535e17390128a1a52717c9a109da4f",
                "sha256:c1a9ff320fa699337e05edcaae79ef8c2880b52720bc031b219e5b5008ebbdef",
                "sha256:cd3caef37a415fd0dae6148a1b6957a8c5f275a62cca02e18474608cb263640c",
                "sha256:d5ec194c9c573aafaceebf05fc400656722793dac57f254cd4741f3c27ae57b4",
                "sha256:da6e5e818d18459fa46fac0a4a4e543507fe1110e808101277c5a2b5bab0cd2d",
                "sha256:e79d94ca58fcafef6395f6352383fa1a76922268fa02caa2272fff501c2fdc78",
                "sha256:f3ef98d7b76da5eb19c37fda834d50262ff9167c65658d1d8f974d2e4d90676b",
                "sha256:f4c8cabb46ff8e5d61f56a037974228e978f26bfefce4f61a4b1ac0ba7a2ab72"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==2.20"
            "markers": "python_version >= '3.6'",
            "version": "==0.18.0"
        },
        "pytz": {
            "hashes": [

@@ -317,6 +293,14 @@
            "editable": true,
            "path": "../scraper"
        },
        "six": {
            "hashes": [
                "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
                "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==1.16.0"
        },
        "soupsieve": {
            "hashes": [
                "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",
@@ -1,6 +1,9 @@
# Globals
stations = []
trains = []
db_data = {
    'version': 2,
}

# Examples
example_station = {

@@ -20,38 +23,100 @@ example_train = {
import json
import os
from os import path, stat
from contextlib import contextmanager

from .utils import take_while

DB_DIR = os.environ.get('DB_DIR', '') or './db'
if not path.exists(DB_DIR):
    os.mkdir(DB_DIR)

DB_FILE = path.join(DB_DIR, 'db.json')

STATIONS_FILE = path.join(DB_DIR, 'stations.json')

TRAINS_FILE = path.join(DB_DIR, 'trains.json')

def migration():
    global db_data
    global trains
    global stations
    if not path.exists(DB_FILE):
        print('[Migration] Migrating DB version 1 -> 2')
        if path.exists(STATIONS_FILE):
            with open(STATIONS_FILE) as f:
                stations = json.load(f)
            for i in range(len(stations)):
                stations[i]['stoppedAtBy'] = [str(num) for num in stations[i]['stoppedAtBy']]
            with open(STATIONS_FILE, 'w') as f:
                json.dump(stations, f)
        if path.exists(TRAINS_FILE):
            with open(TRAINS_FILE) as f:
                trains = json.load(f)
            for i in range(len(trains)):
                trains[i]['number'] = trains[i]['numberString']
                del trains[i]['numberString']
            with open(TRAINS_FILE, 'w') as f:
                json.dump(trains, f)
        db_data = {
            'version': 2,
        }
        with open(DB_FILE, 'w') as f:
            json.dump(db_data, f)
        migration()
    else:
        with open(DB_FILE) as f:
            db_data = json.load(f)
        if db_data['version'] == 2:
            print('[Migration] DB Version: 2, noop')

migration()

if path.exists(DB_FILE):
    with open(DB_FILE) as f:
        db_data = json.load(f)
else:
    with open(DB_FILE, 'w') as f:
        json.dump(db_data, f)

if path.exists(STATIONS_FILE):
    with open(STATIONS_FILE) as f:
        stations = json.load(f)

TRAINS_FILE = path.join(DB_DIR, 'trains.json')

if path.exists(TRAINS_FILE):
    with open(TRAINS_FILE) as f:
        trains = json.load(f)

_should_commit_on_every_change = True

@contextmanager
def db_transaction():
    global _should_commit_on_every_change
    _should_commit_on_every_change = False
    yield
    with open(DB_FILE, 'w') as f:
        json.dump(db_data, f)
    with open(STATIONS_FILE, 'w') as f:
        stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
        json.dump(stations, f)
    with open(TRAINS_FILE, 'w') as f:
        json.dump(trains, f)
    _should_commit_on_every_change = True

def found_train(rank: str, number: str, company: str) -> int:
    number_int = int(''.join(take_while(lambda s: str(s).isnumeric(), number)))
    number = ''.join(take_while(lambda s: str(s).isnumeric(), number))
    try:
        next(filter(lambda tr: tr['number'] == number_int, trains))
        next(filter(lambda tr: tr['number'] == number, trains))
    except StopIteration:
        trains.append({
            'number': number_int,
            'numberString': number,
            'number': number,
            'company': company,
            'rank': rank,
        })
        if _should_commit_on_every_change:
            with open(TRAINS_FILE, 'w') as f:
                json.dump(trains, f)
    return number_int
    return number

def found_station(name: str):
    try:

@@ -61,25 +126,46 @@ def found_station(name: str):
        'name': name,
        'stoppedAtBy': [],
    })
    if _should_commit_on_every_change:
        stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
        with open(STATIONS_FILE, 'w') as f:
            json.dump(stations, f)

def found_train_at_station(station_name: str, train_number: int):
def found_train_at_station(station_name: str, train_number: str):
    train_number = ''.join(take_while(lambda s: str(s).isnumeric(), train_number))
    found_station(station_name)
    for i in range(len(stations)):
        if stations[i]['name'] == station_name:
            if train_number not in stations[i]['stoppedAtBy']:
                stations[i]['stoppedAtBy'].append(train_number)
            break
            if _should_commit_on_every_change:
                stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
                with open(STATIONS_FILE, 'w') as f:
                    json.dump(stations, f)
            break

def on_train_data(train_data: dict):
    with db_transaction():
        train_no = found_train(train_data['rank'], train_data['number'], train_data['operator'])
        for station in train_data['stations']:
            found_train_at_station(station['name'], train_no)

def on_train_lookup_failure(train_no: int):
def on_train_lookup_failure(train_no: str):
    pass

def on_station(station_data: dict):
    station_name = station_data['stationName']

    def process_train(train_data: dict):
        train_number = train_data['train']['number']
        train_number = found_train(train_data['train']['rank'], train_number, train_data['train']['operator'])
        found_train_at_station(station_name, train_number)
        if 'route' in train_data['train'] and train_data['train']['route']:
            for station in train_data['train']['route']:
                found_train_at_station(station, train_number)

    with db_transaction():
        for train in station_data['arrivals']:
            process_train(train)
        for train in station_data['departures']:
            process_train(train)
server/server/flask_utils.py (Normal file, 29 lines)

@@ -0,0 +1,29 @@
from flask import request as _f_request

from .utils import filter_result as _filter_result

def filtered_data(fn):
    def filterer(*args, **kwargs):
        filters = _f_request.args.get('filters', None)
        if filters:
            filters_raw = [f.split(':', 1) for f in filters.split(',')]
            filters = {'.': []}
            for key, value in filters_raw:
                def add_to(obj, key, value):
                    if '.' in key:
                        prop, key = key.split('.', 1)
                        if prop not in filters:
                            obj[prop] = {'.': []}
                        add_to(obj[prop], key, value)
                    else:
                        obj['.'].append({key: value})
                add_to(filters, key, value)
        properties = _f_request.args.get('properties', None)
        if properties:
            properties = properties.split(',')

        data = fn(*args, **kwargs)

        return _filter_result(data, properties, filters)

    return filterer
@@ -1,9 +1,13 @@
print(f'Server {__name__=}')

import datetime
from flask import Flask, json, request, jsonify

from flask import Flask, jsonify, url_for
from jsonschema import validate

from .cache import CachedData
from .scraper.schemas import TRAIN_INFO_SCHEMA
from .utils import get_hostname

app = Flask(__name__)

@@ -14,14 +18,18 @@ app.register_blueprint(v2.bp)
def root():
    return 'Test'

@app.route('/train/.schema.json')
def get_train_info_schema():
    return jsonify(TRAIN_INFO_SCHEMA['v1'])

train_data_cache = {}

@app.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    def get_data():
        from .scraper.scraper import scrape
        from .scraper.scraper import scrape_train
        use_yesterday = False
        result = scrape(train_no, use_yesterday=use_yesterday)
        result = scrape_train(train_no, use_yesterday=use_yesterday)

        from . import db
        db.on_train_data(result)

@@ -40,6 +48,8 @@ def get_train_info(train_no: int):
    if train_no not in train_data_cache:
        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[train_no]()
    data['$schema'] = get_hostname() + url_for('.get_train_info_schema')
    validate(data, schema=TRAIN_INFO_SCHEMA['v1'])
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp
@@ -16,3 +16,26 @@ def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool:
        considered_yes = ['y', 'yes', 't', 'true', '1']
    return input in considered_yes

def get_hostname():
    import os
    import platform
    return os.getenv('HOSTNAME', os.getenv('COMPUTERNAME', platform.node()))

def filter_result(data, properties=None, filters=None):
    is_array = not hasattr(data, 'get')
    result = data if is_array else [data]

    if filters:
        # Todo: implement filters
        pass
        # def f(lst, filters):
        #     def condition(item):

        #     return list(filter(condition, lst))
        # result = f(result, filters)

    if properties:
        for i in range(len(result)):
            result[i] = {p:result[i].get(p, None) for p in properties}

    return result if is_array else result[0]
@@ -1,32 +1,87 @@
import json
from flask import Blueprint, jsonify, request
from flask.helpers import url_for
from jsonschema import validate

from .. import db
from ..cache import CachedData
from ..utils import check_yes_no
from ..utils import check_yes_no, get_hostname
from ..flask_utils import filtered_data
from ..scraper.utils import ro_letters_to_en
from ..scraper.schemas import STATION_SCHEMA, TRAIN_INFO_SCHEMA

bp = Blueprint('v2', __name__, url_prefix='/v2')

@bp.get('/trains')
def get_known_trains():
    return jsonify(db.trains)
    @filtered_data
    def get_data():
        return db.trains

    result = get_data()

    return jsonify(result)

@bp.get('/stations')
def get_known_stations():
    return jsonify(db.stations)
    @filtered_data
    def get_data():
        return db.stations

    result = get_data()

    return jsonify(result)

train_data_cache = {}

@bp.route('/train/.schema.json')
def get_train_info_schema():
    return jsonify(TRAIN_INFO_SCHEMA['v2'])

@bp.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False)
    @filtered_data
    def get_data():
        from ..scraper.scraper import scrape
        result = scrape(train_no, use_yesterday=use_yesterday)
        from ..scraper.scraper import scrape_train
        result = scrape_train(train_no, use_yesterday=use_yesterday)
        db.on_train_data(result)
        return result
    if train_no not in train_data_cache:
    if (train_no, use_yesterday) not in train_data_cache:
        train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[(train_no, use_yesterday)]()
    data['$schema'] = get_hostname() + url_for('.get_train_info_schema')
    validate(data, schema=TRAIN_INFO_SCHEMA['v2'])
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp

station_cache = {}

@bp.route('/station/.schema.json')
def get_station_schema():
    return jsonify(STATION_SCHEMA['v2'])

@bp.route('/station/<station_name>')
def get_station(station_name: str):
    station_name = ro_letters_to_en(station_name.lower().replace(' ', '-'))

    def get_data():
        from ..scraper.scraper import scrape_station
        result = scrape_station(station_name)
        db.on_station(result)
        return result
    if station_name not in train_data_cache:
        station_cache[station_name] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = station_cache[station_name]()
    data['$schema'] = get_hostname() + url_for('.get_station_schema')
    validate(data, schema=STATION_SCHEMA['v2'])

    @filtered_data
    def filter(data):
        return data

    resp = jsonify(filter(data))
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp
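Finally, a small sketch of how the new /v2/station endpoint wired up above might be exercised from a client. The host and port are placeholders, and the 'properties' query parameter relies on the filtered_data helper shown earlier:

# Illustrative client call; assumes the Flask server is running locally on port 5000.
import requests

resp = requests.get(
    'http://localhost:5000/v2/station/brasov',
    params={'properties': 'stationName,departures'},
)
resp.raise_for_status()
print(resp.headers['X-Last-Fetched'])  # set by the endpoint from its cache entry
print(resp.json()['stationName'])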