Added v2 API
parent d507a227f1
commit de78a094a1
16 changed files with 230 additions and 39 deletions

@@ -6,16 +6,16 @@ WORKDIR /var/app/scraper
COPY scraper/Pipfil* ./
COPY scraper/setup.py ./
WORKDIR /var/app/server
RUN ln -s /var/app/scraper scraper
COPY server/Pipfil* ./
RUN pipenv install
RUN pipenv graph

WORKDIR /var/app/scraper
COPY scraper .
WORKDIR /var/app/server
COPY server .
-RUN rm scraper
-RUN ln -s /var/app/scraper scraper
+RUN rm server/scraper
+RUN ln -s /var/app/scraper ./server/scraper

ENV PORT 5000
EXPOSE ${PORT}

@@ -6,6 +6,7 @@ name = "pypi"
[packages]
beautifulsoup4 = "*"
requests = "*"
+pytz = "*"

[dev-packages]


scraper/Pipfile.lock (generated, 10 changes)
@@ -1,7 +1,7 @@
{
    "_meta": {
        "hash": {
-            "sha256": "e7c5f7eab5a8f9202caaaa3bdca8e911579596b8dd25319c2f50e84794eb9fa8"
+            "sha256": "d7e3ebca9807b4f0c9dcac014554e9d1c9cb3a0c30b5c71b0b7cd4ccdc4934e1"
        },
        "pipfile-spec": 6,
        "requires": {

@@ -48,6 +48,14 @@
            "markers": "python_version >= '3'",
            "version": "==3.2"
        },
+        "pytz": {
+            "hashes": [
+                "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
+                "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
+            ],
+            "index": "pypi",
+            "version": "==2021.1"
+        },
        "requests": {
            "hashes": [
                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",

@@ -3,6 +3,7 @@
from datetime import datetime, timedelta
import re

+import pytz
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote, urlencode

@@ -30,6 +31,18 @@ STOPPING_TIME_REGEX = re.compile(r'^([0-9]+) min oprire$')

STATION_DEPARR_STATUS_REGEX = re.compile(r'^(?:(la timp)|(?:((?:\+|-)[0-9]+) min \((?:(?:întârziere)|(?:mai devreme))\)))(\*?)$')

+class DateTimeSequencer:
+    def __init__(self, year: int, month: int, day: int) -> None:
+        self.current = datetime(year, month, day, 0, 0, 0)
+        self.current -= timedelta(seconds=1)
+
+    def __call__(self, hour: int, minute: int = 0, second: int = 0) -> datetime:
+        potential_new_date = datetime(self.current.year, self.current.month, self.current.day, hour, minute, second)
+        if (self.current > potential_new_date):
+            potential_new_date += timedelta(days=1)
+        self.current = potential_new_date
+        return self.current
+
def collapse_space(string: str) -> str:
    return re.sub(
        rf'[{BeautifulSoup.ASCII_SPACES}]+',
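
Note: DateTimeSequencer turns a stream of hh:mm schedule times into full datetimes, advancing the date whenever a time is lower than the previous one, so timetables that cross midnight stay monotonic. A minimal sketch of the intended behaviour (the import path assumes the class lives in the scraper module edited above; the times are made up):

    from scraper.scraper import DateTimeSequencer

    seq = DateTimeSequencer(2021, 8, 1)
    print(seq(23, 50))  # 2021-08-01 23:50:00 (the one-second rewind keeps the first call on the seed day)
    print(seq(0, 10))   # 2021-08-02 00:10:00 (an earlier clock time rolls over to the next day)
    print(seq(0, 10))   # 2021-08-02 00:10:00 (an equal time does not advance the date)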

@@ -77,6 +90,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):
    train_info_div = train_info_div.div('div', recursive=False)[0]

    scraped['rank'], scraped['number'], scraped['date'] = TRAIN_INFO_REGEX.match(collapse_space(train_info_div.h2.text)).groups()
+    date_d, date_m, date_y = (int(comp) for comp in scraped['date'].split('.'))
+    date = datetime(date_y, date_m, date_d)

    scraped['operator'] = OPERATOR_REGEX.match(collapse_space(train_info_div.p.text)).groups()[0]

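
Note: the scraped date is rendered as dd.mm.yyyy, so the generator unpacks day, month, year in that order before rebuilding a datetime. For illustration (the sample string is invented):

    from datetime import datetime

    scraped_date = '01.08.2021'
    date_d, date_m, date_y = (int(comp) for comp in scraped_date.split('.'))
    print(datetime(date_y, date_m, date_d))  # 2021-08-01 00:00:00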

@@ -101,6 +116,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):

    stations = status_div.ul('li', recursive=False)
    scraped['stations'] = []
+    dt_seq = DateTimeSequencer(date.year, date.month, date.day)
+    tz = pytz.timezone('Europe/Bucharest')
    for station in stations:
        station_scraped = {}


@@ -126,6 +143,8 @@ def scrape(train_no: int, use_yesterday=False, date_override=None):

        time, *_ = parts
        result['scheduleTime'] = collapse_space(time.text)
+        st_hr, st_min = (int(comp) for comp in result['scheduleTime'].split(':'))
+        result['scheduleTime'] = tz.localize(dt_seq(st_hr, st_min)).isoformat()
        if len(parts) >= 2:
            _, status, *_ = parts
            result['status'] = {}
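
Note: localizing through pytz before calling isoformat() keeps the UTC offset attached to every schedule time, which the v1 shim in the new server.py later strips back down to hh:mm. One value, end to end (the sample time is invented):

    from datetime import datetime

    import pytz

    tz = pytz.timezone('Europe/Bucharest')
    print(tz.localize(datetime(2021, 8, 1, 8, 5)).isoformat())
    # 2021-08-01T08:05:00+03:00 (Bucharest is UTC+3 in summer)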

@@ -4,5 +4,5 @@ setup(
    name='InfoFer_Scraper',
    version='0.1',
    author='Dan Cojocaru',
-    install_requires=['beautifulsoup4', 'requests']
+    install_requires=['beautifulsoup4', 'requests', 'pytz']
)

server/Pipfile.lock (generated, 7 changes)
@@ -298,6 +298,13 @@
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==2.20"
        },
+        "pytz": {
+            "hashes": [
+                "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
+                "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
+            ],
+            "version": "==2021.1"
+        },
        "requests": {
            "hashes": [
                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",

@@ -1,5 +1,5 @@
from gevent.pywsgi import WSGIServer
-from server import app
+from server.server import app

def main():
    port = 5000
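
Note: the entry point now imports the Flask app from the nested server.server module. The rest of main() is outside this hunk; a typical gevent wiring, as a hedged sketch only, would be:

    from gevent.pywsgi import WSGIServer
    from server.server import app

    def main():
        port = 5000
        # listen on all interfaces and serve the Flask app
        WSGIServer(('', port), app).serve_forever()

    if __name__ == '__main__':
        main()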

@@ -1,33 +0,0 @@
-from flask import Flask, json, request, jsonify
-
-from cache import CachedData
-
-app = Flask(__name__)
-
-@app.route('/')
-def root():
-    return 'Test'
-
-train_data_cache = {}
-
-@app.route('/train/<int:train_no>')
-def get_train_info(train_no: int):
-    def get_data():
-        print(f'Cache miss for {train_no}')
-        from scraper.scraper import scrape
-        use_yesterday = False
-        return scrape(train_no, use_yesterday=use_yesterday)
-    if train_no not in train_data_cache:
-        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
-    data, fetch_time = train_data_cache[train_no]()
-    resp = jsonify(data)
-    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
-    return resp
-
-@app.route('/trains')
-def get_trains():
-    return jsonify(list(train_data_cache.keys()))
-
-if __name__ == '__main__':
-    print('Starting debug server on port 5001')
-    app.run(port=5000)

server/server/db.py (new file, 85 lines)
@ -0,0 +1,85 @@
|
|||
# Globals
|
||||
stations = []
|
||||
trains = []
|
||||
|
||||
# Examples
|
||||
example_station = {
|
||||
'name': 'Gară',
|
||||
'stoppedAtBy': [123, 456]
|
||||
}
|
||||
|
||||
example_train = {
|
||||
'rank': 'IR',
|
||||
'numberString': '74',
|
||||
'number': 74,
|
||||
'company': 'CFR Călători'
|
||||
}
|
||||
|
||||
# Init
|
||||
|
||||
import json
|
||||
import os
|
||||
from os import path, stat
|
||||
from .utils import take_while
|
||||
|
||||
DB_DIR = os.environ.get('DB_DIR', '') or './db'
|
||||
if not path.exists(DB_DIR):
|
||||
os.mkdir(DB_DIR)
|
||||
|
||||
STATIONS_FILE = path.join(DB_DIR, 'stations.json')
|
||||
|
||||
if path.exists(STATIONS_FILE):
|
||||
with open(STATIONS_FILE) as f:
|
||||
stations = json.load(f)
|
||||
|
||||
TRAINS_FILE = path.join(DB_DIR, 'trains.json')
|
||||
|
||||
if path.exists(TRAINS_FILE):
|
||||
with open(TRAINS_FILE) as f:
|
||||
trains = json.load(f)
|
||||
|
||||
def found_train(rank: str, number: str, company: str) -> int:
|
||||
number_int = int(''.join(take_while(lambda s: str(s).isnumeric(), number)))
|
||||
try:
|
||||
next(filter(lambda tr: tr['number'] == number_int, trains))
|
||||
except StopIteration:
|
||||
trains.append({
|
||||
'number': number_int,
|
||||
'numberString': number,
|
||||
'company': company,
|
||||
'rank': rank,
|
||||
})
|
||||
with open(TRAINS_FILE, 'w') as f:
|
||||
json.dump(trains, f)
|
||||
return number_int
|
||||
|
||||
def found_station(name: str):
|
||||
try:
|
||||
next(filter(lambda s: s['name'] == name, stations))
|
||||
except StopIteration:
|
||||
stations.append({
|
||||
'name': name,
|
||||
'stoppedAtBy': [],
|
||||
})
|
||||
stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
|
||||
with open(STATIONS_FILE, 'w') as f:
|
||||
json.dump(stations, f)
|
||||
|
||||
def found_train_at_station(station_name: str, train_number: int):
|
||||
found_station(station_name)
|
||||
for i in range(len(stations)):
|
||||
if stations[i]['name'] == station_name:
|
||||
if train_number not in stations[i]['stoppedAtBy']:
|
||||
stations[i]['stoppedAtBy'].append(train_number)
|
||||
stations.sort(key=lambda s: len(s['stoppedAtBy']), reverse=True)
|
||||
with open(STATIONS_FILE, 'w') as f:
|
||||
json.dump(stations, f)
|
||||
break
|
||||
|
||||
def on_train_data(train_data: dict):
|
||||
train_no = found_train(train_data['rank'], train_data['number'], train_data['operator'])
|
||||
for station in train_data['stations']:
|
||||
found_train_at_station(station['name'], train_no)
|
||||
|
||||
def on_train_lookup_failure(train_no: int):
|
||||
pass
|
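
Note: found_train derives its numeric key from the leading digits of the number string, via take_while from utils.py, so '74' and suffixed variants collapse to the same int. A small standalone illustration of that parsing (the helper is restated so the sketch runs on its own; the sample strings are invented):

    def take_while(predicate, input):
        for element in input:
            if not predicate(element):
                break
            yield element

    def leading_int(number: str) -> int:
        # the same expression found_train uses for its key
        return int(''.join(take_while(lambda s: str(s).isnumeric(), number)))

    print(leading_int('74'))     # 74
    print(leading_int('401-2'))  # 401 (stops at the first non-digit; no leading digit would raise ValueError)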

server/server/server.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+print(f'Server {__name__=}')
+
+import datetime
+from flask import Flask, json, request, jsonify
+
+from .cache import CachedData
+
+app = Flask(__name__)
+
+from .v2 import v2
+app.register_blueprint(v2.bp)
+
+@app.route('/')
+def root():
+    return 'Test'
+
+train_data_cache = {}
+
+@app.route('/train/<int:train_no>')
+def get_train_info(train_no: int):
+    def get_data():
+        from .scraper.scraper import scrape
+        use_yesterday = False
+        result = scrape(train_no, use_yesterday=use_yesterday)
+
+        from . import db
+        db.on_train_data(result)
+
+        # Convert to v1
+        # datetime ISO string to hh:mm
+        for i in range(len(result['stations'])):
+            if result['stations'][i]['arrival']:
+                date = datetime.datetime.fromisoformat(result['stations'][i]['arrival']['scheduleTime'])
+                result['stations'][i]['arrival']['scheduleTime'] = f'{date.hour}:{date.minute:02}'
+            if result['stations'][i]['departure']:
+                date = datetime.datetime.fromisoformat(result['stations'][i]['departure']['scheduleTime'])
+                result['stations'][i]['departure']['scheduleTime'] = f'{date.hour}:{date.minute:02}'
+
+        return result
+    if train_no not in train_data_cache:
+        train_data_cache[train_no] = CachedData(get_data, validity=1000 * 30)
+    data, fetch_time = train_data_cache[train_no]()
+    resp = jsonify(data)
+    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
+    return resp
+
+@app.route('/trains')
+def get_trains():
+    return jsonify(list(train_data_cache.keys()))
+
+if __name__ == '__main__':
+    print('Starting debug server on port 5001')
+    app.run(port=5000)
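
Note: the v1 compatibility pass reduces the ISO timestamps produced by the scraper back to plain hh:mm strings. What the conversion does to one value (the sample timestamp is invented):

    import datetime

    iso = '2021-08-01T08:05:00+03:00'  # shape of a scraped scheduleTime
    date = datetime.datetime.fromisoformat(iso)
    print(f'{date.hour}:{date.minute:02}')  # 8:05 (minutes are zero-padded, hours are not, matching the code above)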

server/server/utils.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+def take_while(predicate, input):
+    for element in input:
+        if not predicate(element):
+            break
+        yield element
+
+_NO_DEFAULT = object()
+
+def check_yes_no(input: str, default=_NO_DEFAULT, considered_yes=None) -> bool:
+    input = str(input).strip().lower()
+    if not input:
+        if default == _NO_DEFAULT:
+            raise Exception('Empty input with no default')
+        return default
+    if not considered_yes:
+        considered_yes = ['y', 'yes', 't', 'true', '1']
+    return input in considered_yes
+
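
Note: check_yes_no normalizes free-form input to a boolean and only falls back to default on empty input, comparing against the _NO_DEFAULT sentinel (which would read more idiomatically with `is`). A few sample calls (inputs invented; the import path assumes the repository root is on sys.path):

    from server.server.utils import check_yes_no  # hypothetical import path for this sketch

    print(check_yes_no('Y'))                          # True  (input is stripped and lower-cased)
    print(check_yes_no('no'))                         # False (anything outside considered_yes is a no)
    print(check_yes_no('', default=False))            # False (empty input returns the default)
    print(check_yes_no('da', considered_yes=['da']))  # True  (custom yes-list)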

server/server/v2/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+__all__ = ['v2']

server/server/v2/v2.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+from flask import Blueprint, jsonify, request
+
+from .. import db
+from ..cache import CachedData
+from ..utils import check_yes_no
+
+bp = Blueprint('v2', __name__, url_prefix='/v2')
+
+@bp.get('/trains')
+def get_known_trains():
+    return jsonify(db.trains)
+
+@bp.get('/stations')
+def get_known_stations():
+    return jsonify(db.stations)
+
+train_data_cache = {}
+
+@bp.route('/train/<int:train_no>')
+def get_train_info(train_no: int):
+    use_yesterday = check_yes_no(request.args.get('use_yesterday', ''), default=False)
+    def get_data():
+        from ..scraper.scraper import scrape
+        result = scrape(train_no, use_yesterday=use_yesterday)
+        db.on_train_data(result)
+        return result
+    if train_no not in train_data_cache:
+        train_data_cache[(train_no, use_yesterday)] = CachedData(get_data, validity=1000 * 30)
+    data, fetch_time = train_data_cache[(train_no, use_yesterday)]()
+    resp = jsonify(data)
+    resp.headers['X-Last-Fetched'] = fetch_time.isoformat()
+    return resp
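
Note: the membership test keys the cache by train_no while entries are stored and read under (train_no, use_yesterday), so the `in` check never matches and a fresh CachedData is built on every request. A hedged fix would use one key for all three operations (validity=1000 * 30 reads as 30 seconds in milliseconds):

    key = (train_no, use_yesterday)
    if key not in train_data_cache:
        train_data_cache[key] = CachedData(get_data, validity=1000 * 30)
    data, fetch_time = train_data_cache[key]()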