class MetaParser(html.parser.HTMLParser):
    """Collect whitelisted Open Graph ``<meta>`` tags from an HTML document.

    Feed markup through ``feed()``; afterwards ``meta_data`` maps every
    accepted ``property`` value (for example ``"og:title"``) to the string
    found in the same tag's ``content`` attribute.

    Attributes:
        meta_data: dict of accepted property name -> content string.
        accepted_attrs: list of property names that are collected.
    """

    # Open Graph properties worth keeping; any other <meta> tag is ignored.
    ACCEPTED_PROPERTIES = (
        "og:title",
        "og:description",
        "og:image",
        "og:video:height",
        "og:video:width",
        "og:image:height",
        "og:image:width",
    )

    def __init__(self):
        super().__init__()
        self.meta_data = {}
        # Kept as a per-instance list so existing code can still extend it.
        self.accepted_attrs = list(self.ACCEPTED_PROPERTIES)

    def handle_starttag(self, tag, attrs):
        """Record ``content`` of a ``<meta>`` tag whose ``property`` is accepted.

        Args:
            tag: lower-cased tag name supplied by ``HTMLParser``.
            attrs: list of ``(name, value)`` pairs; ``value`` is ``None`` for
                an attribute written without a value.
        """
        if tag != "meta":
            return

        prop = None
        content = None
        # One pass that works whichever of property/content comes first.
        for key, value in attrs:
            if key == "property":
                if prop is None and value in self.accepted_attrs:
                    prop = value
            elif key == "content":
                # A valueless ``content`` attribute arrives as None; only
                # real strings are usable as metadata values.
                if content is None and isinstance(value, str):
                    content = value

        if prop is not None and content is not None:
            self.meta_data[prop] = content
data_raw.get("thumbnail_url") - data_new["og:video:height"] = data_raw.get("height") - data_new["og:video:width"] = data_raw.get("width") - data_new["og:image:height"] = data_raw.get("thumbnail_height") - data_new["og:image:width"] = data_raw.get("thumbnail_width") - data_filtered = {} - for key in data_new: - value = data_new.get(key) - if isinstance(value, str): - data_filtered[key] = value - data = list(data_filtered.items()) + data_raw = json.loads(text) + if isinstance(data_raw, dict): + data_new = {} + data_valid = True + data_new["og:title"] = data_raw.get("title") + data_new["og:description"] = data_raw.get("author_name") + data_new["og:image"] = data_raw.get("thumbnail_url") + data_new["og:video:height"] = data_raw.get("height") + data_new["og:video:width"] = data_raw.get("width") + data_new["og:image:height"] = data_raw.get("thumbnail_height") + data_new["og:image:width"] = data_raw.get("thumbnail_width") + data_filtered = {} + for key in data_new: + value = data_new.get(key) + if isinstance(value, str): + data_filtered[key] = value + if len(data_filtered) == 0: + resp = await session.get(self.upstream) + text = await resp.text() + parser = MetaParser() + parser.feed(text) + data_filtered = parser.meta_data + data = list(data_filtered.items()) except Exception as e: logger.info(e) logger.info(data)