Added v2 API

Kenneth Bruen 2021-08-23 20:12:10 +03:00
parent d507a227f1
commit de78a094a1
Signed by: kbruen
GPG key ID: CB77B9FE7F902176
16 changed files with 230 additions and 39 deletions

@@ -6,16 +6,16 @@ WORKDIR /var/app/scraper
COPY scraper/Pipfil* ./
COPY scraper/setup.py ./
WORKDIR /var/app/server
RUN ln -s /var/app/scraper scraper
COPY server/Pipfil* ./
RUN pipenv install
RUN pipenv graph
WORKDIR /var/app/scraper
COPY scraper .
WORKDIR /var/app/server
COPY server .
-RUN rm scraper
-RUN ln -s /var/app/scraper scraper
+RUN rm server/scraper
+RUN ln -s /var/app/scraper ./server/scraper
ENV PORT 5000
EXPOSE ${PORT}

@ -6,6 +6,7 @@ name = "pypi"
[packages] [packages]
beautifulsoup4 = "*" beautifulsoup4 = "*"
requests = "*" requests = "*"
pytz = "*"
[dev-packages] [dev-packages]

scraper/Pipfile.lock (generated, 10 lines changed)

@@ -1,7 +1,7 @@
{
    "_meta": {
        "hash": {
-            "sha256": "e7c5f7eab5a8f9202caaaa3bdca8e911579596b8dd25319c2f50e84794eb9fa8"
+            "sha256": "d7e3ebca9807b4f0c9dcac014554e9d1c9cb3a0c30b5c71b0b7cd4ccdc4934e1"
        },
        "pipfile-spec": 6,
        "requires": {
@@ -48,6 +48,14 @@
            "markers": "python_version >= '3'",
            "version": "==3.2"
        },
        "pytz": {
            "hashes": [
                "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
                "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
            ],
            "index": "pypi",
            "version": "==2021.1"
        },
        "requests": {
            "hashes": [
                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",

@@ -3,6 +3,7 @@
from datetime import datetime, timedelta
import re

import pytz
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote, urlencode
@@ -30,6 +31,18 @@ STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')
STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')

class DateTimeSequencer:
    def __init__(self, year: int, month: int, day: int) -> None:
        self.current = datetime(year, month, day, 0, 0, 0)
        self.current -= timedelta(seconds=1)

    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
        if (self.current > potential_new_date):
            potential_new_date += timedelta(days=1)
        self.current = potential_new_date
        return self.current

def collapse_space(string: str) -> str:
    return re.sub(
        rf'[{BeautifulSoup.ASCII_SPACES}]+',
@@ -77,6 +90,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
    train_info_div = train_info_div.div('div', recursive=False)[0]

    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
    date = datetime(date_y, date_m, date_d)

    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]
@@ -101,6 +116,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []

    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
    tz = pytz.timezone('Europe/Bucharest')

    for station in stations:
        station_scraped = {}
@@ -126,6 +143,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
            time, *_ = parts
            result['scheduleTime'] = collapse_space(time.text)
            st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
            result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()

            if len(parts) >= 2:
                _, status, *_ = parts
                result['status'] = {}
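The new DateTimeSequencer turns the bare hh:mm values scraped from the page into full datetimes that only ever move forward, rolling over to the next calendar day whenever a time goes backwards; tz.localize then attaches the Europe/Bucharest offset before isoformat(). A minimal usage sketch (not part of the commit, dates invented):

seq = DateTimeSequencer(2021, 8, 23)
tz = pytz.timezone('Europe/Bucharest')
print(tz.localize(seq(23, 50)).isoformat())  # 2021-08-23T23:50:00+03:00
print(tz.localize(seq(0, 10)).isoformat())   # 2021-08-24T00:10:00+03:00, rolled over past midnight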

@@ -4,5 +4,5 @@ setup(
    name='InfoFer_Scraper',
    version='0.1',
    author='Dan Cojocaru',
-    install_requires=['beautifulsoup4', 'requests']
+    install_requires=['beautifulsoup4', 'requests', 'pytz']
)

server/Pipfile.lock (generated, 7 lines changed)

@@ -298,6 +298,13 @@
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==2.20"
        },
        "pytz": {
            "hashes": [
                "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
                "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
            ],
            "version": "==2021.1"
        },
        "requests": {
            "hashes": [
                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",

@@ -1,5 +1,5 @@
from gevent.pywsgi import WSGIServer
-from server import app
+from server.server import app

def main():
    port = 5000

@@ -1,33 +0,0 @@
from flask import Flask, json, request, jsonify

from cache import CachedData

app = Flask(__name__)

@app.route('/')
def root():
    return 'Test'

train_data_cache = {}

@app.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    def get_data():
        print(f'Cache miss for {train_no}')
        from scraper.scraper import scrape
        use_yesterday = False
        return scrape(train_no, use_yesterday=use_yesterday)
    if train_no not in train_data_cache:
        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[train_no]()
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp

@app.route('/trains')
def get_trains():
    return jsonify(list(train_data_cache.keys()))

if __name__ == '__main__':
    print('Starting debug server on port 5001')
    app.run(port=5000)

server/server/db.py (new file, 85 lines)

@@ -0,0 +1,85 @@
# Globals
stations = []
trains = []

# Examples
example_station = {
    'name': 'Gară',
    'stoppedAtBy': [123, 456]
}
example_train = {
    'rank': 'IR',
    'numberString': '74',
    'number': 74,
    'company': 'CFR Călători'
}

# Init
import json
import os
from os import path, stat

from .utils import take_while

DB_DIR = os.environ.get('DB_DIR', '') or './db'
if not path.exists(DB_DIR):
    os.mkdir(DB_DIR)

STATIONS_FILE = path.join(DB_DIR, 'stations.json')
if path.exists(STATIONS_FILE):
    with open(STATIONS_FILE) as f:
        stations = json.load(f)

TRAINS_FILE = path.join(DB_DIR, 'trains.json')
if path.exists(TRAINS_FILE):
    with open(TRAINS_FILE) as f:
        trains = json.load(f)

def found_train(rank: str, number: str, company: str) -> int:
    number_int = int(''.join(take_while(lambda s: str(s).isnumeric(), number)))
    try:
        next(filter(lambda tr: tr['number'] == number_int, trains))
    except StopIteration:
        trains.append({
            'number': number_int,
            'numberString': number,
            'company': company,
            'rank': rank,
        })
        with open(TRAINS_FILE, 'w') as f:
            json.dump(trains, f)
    return number_int

def found_station(name: str):
    try:
        next(filter(lambda s: s['name'] == name, stations))
    except StopIteration:
        stations.append({
            'name': name,
            'stoppedAtBy': [],
        })
        stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
        with open(STATIONS_FILE, 'w') as f:
            json.dump(stations, f)

def found_train_at_station(station_name: str, train_number: int):
    found_station(station_name)
    for i in range(len(stations)):
        if stations[i]['name'] == station_name:
            if train_number not in stations[i]['stoppedAtBy']:
                stations[i]['stoppedAtBy'].append(train_number)
                stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
                with open(STATIONS_FILE, 'w') as f:
                    json.dump(stations, f)
            break

def on_train_data(train_data: dict):
    train_no = found_train(train_data['rank'], train_data['number'], train_data['operator'])
    for station in train_data['stations']:
        found_train_at_station(station['name'], train_no)

def on_train_lookup_failure(train_no: int):
    pass
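db.py keeps two JSON files under DB_DIR (./db by default) and exposes found_* helpers that the scrape path drives through on_train_data. A rough usage sketch (not part of the commit; it assumes the code runs from the server/ directory so the server package resolves, and all values are invented):

from server import db

db.on_train_data({
    'rank': 'IR',
    'number': '74',
    'operator': 'CFR Călători',
    'stations': [{'name': 'București Nord'}, {'name': 'Brașov'}],
})
print(db.trains)    # [{'number': 74, 'numberString': '74', 'company': 'CFR Călători', 'rank': 'IR'}]
print(db.stations)  # both stations now list 74 in their 'stoppedAtBy'
# ./db/trains.json and ./db/stations.json now hold the same data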

server/server/server.py (new file, 53 lines)

@@ -0,0 +1,53 @@
print(f'Server {__name__=}')

import datetime

from flask import Flask, json, request, jsonify

from .cache import CachedData

app = Flask(__name__)

from .v2 import v2
app.register_blueprint(v2.bp)

@app.route('/')
def root():
    return 'Test'

train_data_cache = {}

@app.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    def get_data():
        from .scraper.scraper import scrape
        use_yesterday = False
        result = scrape(train_no, use_yesterday=use_yesterday)

        from . import db
        db.on_train_data(result)

        # Convert to v1
        # datetime ISO string to hh:mm
        for i in range(len(result['stations'])):
            if result['stations'][i]['arrival']:
                date = datetime.datetime.fromisoformat(result['stations'][i]['arrival']['scheduleTime'])
                result['stations'][i]['arrival']['scheduleTime'] = f'{date.hour}:{date.minute:02}'
            if result['stations'][i]['departure']:
                date = datetime.datetime.fromisoformat(result['stations'][i]['departure']['scheduleTime'])
                result['stations'][i]['departure']['scheduleTime'] = f'{date.hour}:{date.minute:02}'

        return result
    if train_no not in train_data_cache:
        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[train_no]()
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp

@app.route('/trains')
def get_trains():
    return jsonify(list(train_data_cache.keys()))

if __name__ == '__main__':
    print('Starting debug server on port 5001')
    app.run(port=5000)
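The v1 route keeps returning bare hh:mm strings, so get_data collapses the scraper's new ISO timestamps on the fly. A small sketch of that conversion (not part of the commit, timestamp invented):

import datetime
iso = '2021-08-23T23:50:00+03:00'
date = datetime.datetime.fromisoformat(iso)
print(f'{date.hour}:{date.minute:02}')  # 23:50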

server/server/utils.py (new file, 18 lines)

@@ -0,0 +1,18 @@
def take_while(predicate, input):
    for element in input:
        if not predicate(element):
            break
        yield element

_NO_DEFAULT = object()

def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool:
    input = str(input).strip().lower()
    if not input:
        if default == _NO_DEFAULT:
            raise Exception('Empty input with no default')
        return default
    if not considered_yes:
        considered_yes = ['y', 'yes', 't', 'true', '1']
    return input in considered_yes
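check_yes_no is how the v2 blueprint below reads boolean query parameters. A quick sketch of its behaviour (not part of the commit):

check_yes_no('yes', default=False)  # True
check_yes_no('1', default=False)    # True
check_yes_no('', default=False)     # False, empty input falls back to the default
check_yes_no('0', default=False)    # False, '0' is not in the considered_yes list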

@@ -0,0 +1 @@
__all__ = ['v2']

server/server/v2/v2.py (new file, 32 lines)

@@ -0,0 +1,32 @@
from flask import Blueprint, jsonify, request

from .. import db
from ..cache import CachedData
from ..utils import check_yes_no

bp = Blueprint('v2', __name__, url_prefix='/v2')

@bp.get('/trains')
def get_known_trains():
    return jsonify(db.trains)

@bp.get('/stations')
def get_known_stations():
    return jsonify(db.stations)

train_data_cache = {}

@bp.route('/train/<int:train_no>')
def get_train_info(train_no: int):
    use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False)
    def get_data():
        from ..scraper.scraper import scrape
        result = scrape(train_no, use_yesterday=use_yesterday)
        db.on_train_data(result)
        return result
    if (train_no, use_yesterday) not in train_data_cache:
        train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[(train_no, use_yesterday)]()
    resp = jsonify(data)
    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
    return resp
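With the blueprint registered under /v2, the new API can be exercised like this (a sketch, not part of the commit; it assumes the server is reachable on localhost:5000 as configured in the Dockerfile):

import requests

resp = requests.get('http://localhost:5000/v2/train/74', params={'use_yesterday': 'n'})
print(resp.headers['X-Last-Fetched'])   # ISO timestamp of the cached scrape
print(resp.json()['stations'][0])       # v2 keeps scheduleTime as a full ISO string

print(requests.get('http://localhost:5000/v2/trains').json())    # trains recorded in db.py
print(requests.get('http://localhost:5000/v2/stations').json())  # stations recorded in db.py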