Initial commit
Commit 89cefc3fb3
7 changed files with 396 additions and 0 deletions

.gitignore (vendored, new file, +5)
@@ -0,0 +1,5 @@
# CPython compiler output
*.pyc

# VS Code
.vscode

scrapper/Pipfile (new file, +13)
@@ -0,0 +1,13 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
beautifulsoup4 = "*"
requests = "*"

[dev-packages]

[requires]
python_version = "3.9"
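
The Pipfile above is ordinary TOML, so it can also be inspected programmatically. A minimal sketch, assuming Python 3.11+ for the standard-library tomllib (the project itself pins 3.9, where a third-party TOML parser would be needed instead):

    import tomllib  # standard library since Python 3.11

    with open('scrapper/Pipfile', 'rb') as f:
        pipfile = tomllib.load(f)

    print(pipfile['packages'])                    # {'beautifulsoup4': '*', 'requests': '*'}
    print(pipfile['requires']['python_version'])  # 3.9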

scrapper/Pipfile.lock (generated, new file, +77)
@@ -0,0 +1,77 @@
{
    "_meta": {
        "hash": {
            "sha256": "e7c5f7eab5a8f9202caaaa3bdca8e911579596b8dd25319c2f50e84794eb9fa8"
        },
        "pipfile-spec": 6,
        "requires": {
            "python_version": "3.9"
        },
        "sources": [
            {
                "name": "pypi",
                "url": "https://pypi.org/simple",
                "verify_ssl": true
            }
        ]
    },
    "default": {
        "beautifulsoup4": {
            "hashes": [
                "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
                "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
                "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
            ],
            "index": "pypi",
            "version": "==4.9.3"
        },
        "certifi": {
            "hashes": [
                "sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
                "sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
            ],
            "version": "==2021.5.30"
        },
        "charset-normalizer": {
            "hashes": [
                "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
                "sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
            ],
            "markers": "python_version >= '3'",
            "version": "==2.0.4"
        },
        "idna": {
            "hashes": [
                "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
                "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
            ],
            "markers": "python_version >= '3'",
            "version": "==3.2"
        },
        "requests": {
            "hashes": [
                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
                "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
            ],
            "index": "pypi",
            "version": "==2.26.0"
        },
        "soupsieve": {
            "hashes": [
                "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",
                "sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b"
            ],
            "markers": "python_version >= '3'",
            "version": "==2.2.1"
        },
        "urllib3": {
            "hashes": [
                "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
                "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
            "version": "==1.26.6"
        }
    },
    "develop": {}
}
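
Each lock entry pins an exact version together with the sha256 digests of the release artifacts on PyPI; installers recompute the digest of every downloaded file and refuse to install on a mismatch. A minimal sketch of that check (the wheel path below is hypothetical):

    import hashlib

    def file_sha256(path: str) -> str:
        # Stream the file so large wheels do not have to fit in memory.
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                digest.update(chunk)
        return digest.hexdigest()

    # 'requests-2.26.0-py2.py3-none-any.whl' is a hypothetical local download;
    # its digest should equal one of the sha256 values listed under "requests" above.
    print(file_sha256('requests-2.26.0-py2.py3-none-any.whl'))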

scrapper/__init__.py (new file, +1)
@@ -0,0 +1 @@
__all__ = ['scraper']

scrapper/main.py (new file, +44)
@@ -0,0 +1,44 @@
from scraper import scrape

_NO_DEFAULT = object()

def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool:
    input = str(input).strip().lower()
    if not input:
        if default is _NO_DEFAULT:
            raise ValueError('Empty input with no default')
        return default
    if not considered_yes:
        considered_yes = ['y', 'yes', 't', 'true', '1']
    return input in considered_yes

def main():
    train_no = int(input('Train number: '))
    use_yesterday = input('Train departed yesterday? [y/N] ')
    data = scrape(train_no, use_yesterday=check_yes_no(use_yesterday, default=False))
    print(f'Train {train_no}\t{data["route"]["from"]}\t{data["route"]["to"]}')
    print()
    if 'status' in data and data['status']:
        delay = data['status']['delay']
        if delay == 0:
            delay = 'on time'
        else:
            delay = f'{delay} min'
        state = data['status']['state']
        station = data['status']['station']
        print(f'Status: {delay}\t{state}\t{station}')
        print()
    for station in data['stations']:
        if 'arrival' in station and station['arrival']:
            print(station['arrival']['scheduleTime'], end='\t')
        else:
            print(end='\t')
        print(station['name'], end='\t')
        if 'departure' in station and station['departure']:
            print(station['departure']['scheduleTime'], end='\t')
        else:
            print(end='\t')
        print()

if __name__ == '__main__':
    main()
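
check_yes_no above treats any non-empty answer outside considered_yes as "no" and only falls back to default on empty input. A quick illustrative session (not part of the commit; assumes it is run from the scrapper directory):

    from main import check_yes_no

    check_yes_no('Yes')                        # True ('yes' is in the default list)
    check_yes_no('n')                          # False
    check_yes_no('', default=False)            # False (empty input, default used)
    check_yes_no('da', considered_yes=['da'])  # True (custom affirmative list)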

scrapper/scraper.py (new file, +157)
@@ -0,0 +1,157 @@
#! /usr/bin/env python3

from datetime import datetime, timedelta
import re

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote, urlencode

SL_REGEX = re.compile(r'^(?:Fără|([0-9]+) min) (întârziere|mai devreme) la (trecerea fără oprire prin|sosirea în|plecarea din) (.+)\.$')
SL_STATE_MAP = {
    't': 'passing',
    's': 'arrival',
    'p': 'departure',
}

RO_LETTERS = r'A-Za-zăâîșțĂÂÎȚȘ'

ROUTE_REGEX = re.compile(rf'^Parcurs tren ([{RO_LETTERS} ]+)[-–]([{RO_LETTERS} ]+)$')

KM_REGEX = re.compile(r'^km ([0-9]+)$')

PLATFORM_REGEX = re.compile(r'^linia (.+)$')

STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')

STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')

def collapse_space(string: str) -> str:
    return re.sub(
        rf'[{BeautifulSoup.ASCII_SPACES}]+',
        ' ',
        string,
        flags=re.MULTILINE
    ).strip()

def build_url(base: str, /, query: dict, **kwargs):
    result = base.format(**{ k: quote(str(v)) for k, v in kwargs.items() })
    if query:
        result += '?'
        result += urlencode(query)
    return result

def scrape(train_no: int, use_yesterday=False, date_override=None):
    # Start scraping session
    s = requests.Session()

    date = datetime.today()
    if use_yesterday:
        date -= timedelta(days=1)
    if date_override:
        date = date_override

    r = s.get(build_url(
        'https://mersultrenurilor.infofer.ro/ro-RO/Tren/{train_no}',
        train_no=train_no,
        query=[
            ('Date', date.strftime('%d.%m.%Y')),
        ],
    ))

    soup = BeautifulSoup(r.text, features='html.parser')
    sform = soup.find(id='form-search')
    # required_fields = [
    #     'Date',
    #     'TrainRunningNumber',
    #     'SelectedBranchCode',
    #     'ReCaptcha',
    #     'ConfirmationKey',
    #     'IsSearchWanted',
    #     'IsReCaptchaFailed',
    #     '__RequestVerificationToken',
    # ]
    # result_data = { field: sform.find('input', attrs={'name': field})['value'] for field in required_fields }
    result_data = { elem['name']: elem['value'] for elem in sform('input') }

    r = s.post('https://mersultrenurilor.infofer.ro/ro-RO/Trains/TrainsResult', data=result_data)
    soup = BeautifulSoup(r.text, features='html.parser')

    scraped = {}

    results_div = soup('div', recursive=False)[3].div
    status_div = results_div('div', recursive=False)[0]
    route_text = collapse_space(status_div.h4.text)
    route_from, route_to = ROUTE_REGEX.match(route_text).groups()
    scraped['route'] = {
        'from': route_from,
        'to': route_to,
    }
    try:
        status_line_match = SL_REGEX.match(collapse_space(status_div.div.text))
        slm_delay, slm_late, slm_arrival, slm_station = status_line_match.groups()
        scraped['status'] = {
            'delay': (int(slm_delay) if slm_late == 'întârziere' else -int(slm_delay)) if slm_delay else 0,
            'station': slm_station,
            'state': SL_STATE_MAP[slm_arrival[0]],
        }
    except Exception:
        scraped['status'] = None

    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
    for station in stations:
        station_scraped = {}

        left, middle, right = station.div('div', recursive=False)
        station_scraped['name'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[0].text)
        station_scraped['km'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[1].text)
        station_scraped['km'] = int(KM_REGEX.match(station_scraped['km']).groups()[0])
        station_scraped['stoppingTime'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[2].text)
        if not station_scraped['stoppingTime']:
            station_scraped['stoppingTime'] = None
        else:
            station_scraped['stoppingTime'] = int(STOPPING_TIME_REGEX.match(station_scraped['stoppingTime']).groups()[0])
        station_scraped['platform'] = collapse_space(middle.div.div('div', recursive=False)[0]('div', recursive=False)[3].text)
        if not station_scraped['platform']:
            station_scraped['platform'] = None
        else:
            station_scraped['platform'] = PLATFORM_REGEX.match(station_scraped['platform']).groups()[0]

        def scrape_time(elem, setter):
            parts = elem.div.div('div', recursive=False)
            if parts:
                result = {}

                time, *_ = parts
                result['scheduleTime'] = collapse_space(time.text)
                if len(parts) >= 2:
                    _, status, *_ = parts
                    result['status'] = {}
                    on_time, delay, approx = STATION_DEPARR_STATUS_REGEX.match(collapse_space(status.text)).groups()
                    result['status']['delay'] = 0 if on_time else int(delay)
                    result['status']['real'] = not approx
                else:
                    result['status'] = None

                setter(result)
            else:
                setter(None)

        scrape_time(left, lambda value: station_scraped.update(arrival=value))
        scrape_time(right, lambda value: station_scraped.update(departure=value))

        scraped['stations'].append(station_scraped)

    return scraped


def main():
    train_no = 1538
    print(f'Testing package with train number {train_no}')
    from pprint import pprint
    # pprint(scrape('473'))
    pprint(scrape(train_no))

if __name__ == '__main__':
    main()
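
build_url fills the placeholders in the base URL with percent-encoded values and appends any query parameters, which is why scrape can pass the train number and the date separately. An illustrative call (example.com is a placeholder host, not the site scraped above):

    from scraper import build_url

    build_url('https://example.com/Tren/{train_no}', train_no='IR 1538',
              query=[('Date', '01.09.2021')])
    # -> 'https://example.com/Tren/IR%201538?Date=01.09.2021'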

scrapper/trainInfoScrapResultSchema.json (new file, +99)
@@ -0,0 +1,99 @@
{
    "$schema": "http://json-schema.org/schema",
    "title": "Train Info InfoFer Scrap Result Schema",
    "description": "Results of scraping the InfoFer website for train info",
    "definitions": {
        "delayType": {
            "description": "Delay of the train (negative for being early)",
            "type": "number"
        },
        "stationArrDepTime": {
            "description": "Time of arrival at/departure from station",
            "type": ["object", "null"],
            "properties": {
                "scheduleTime": {
                    "description": "The time the train is scheduled to arrive/depart",
                    "type": "string"
                },
                "status": {
                    "type": ["object", "null"],
                    "properties": {
                        "delay": {
                            "$ref": "#/definitions/delayType"
                        },
                        "real": {
                            "description": "Determines whether delay was actually reported or is an approximation",
                            "type": "boolean"
                        }
                    },
                    "required": ["delay", "real"]
                }
            },
            "required": ["scheduleTime"]
        }
    },
    "type": "object",
    "properties": {
        "route": {
            "description": "Route of the train",
            "type": "object",
            "properties": {
                "from": {
                    "type": "string"
                },
                "to": {
                    "type": "string"
                }
            },
            "required": ["from", "to"]
        },
        "status": {
            "description": "Current status of the train",
            "type": ["object", "null"],
            "properties": {
                "delay": {
                    "$ref": "#/definitions/delayType"
                },
                "station": {
                    "type": "string"
                },
                "state": {
                    "type": "string",
                    "enum": ["passing", "arrival", "departure"]
                }
            }
        },
        "stations": {
            "description": "List of stations the train stops at",
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string"
                    },
                    "km": {
                        "description": "The distance the train travelled until reaching this station",
                        "type": "number"
                    },
                    "stoppingTime": {
                        "description": "The number of minutes the train is scheduled to stop in this station",
                        "type": ["number", "null"]
                    },
                    "platform": {
                        "description": "The platform the train stopped at",
                        "type": ["string", "null"]
                    },
                    "arrival": {
                        "$ref": "#/definitions/stationArrDepTime"
                    },
                    "departure": {
                        "$ref": "#/definitions/stationArrDepTime"
                    }
                },
                "required": ["name", "km"]
            }
        }
    },
    "required": ["route", "stations"]
}
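
The schema can be used to check that scrape output keeps the promised shape. A minimal sketch, assuming the third-party jsonschema package (not listed in this commit's Pipfile) and network access for the scrape call:

    import json
    from jsonschema import validate  # pip install jsonschema

    from scraper import scrape

    with open('trainInfoScrapResultSchema.json') as f:
        schema = json.load(f)

    # Raises jsonschema.ValidationError if the scraped structure drifts from the schema.
    validate(instance=scrape(1538), schema=schema)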