add html metadata parsing

This commit is contained in:
Roy Olav Purser 2021-05-24 20:59:35 +02:00
parent 202f1344a8
commit 99810e174a
Signed by: roypur
GPG Key ID: E14D26A036F21656

View File

@ -12,6 +12,7 @@ import tornado.web
import tornado.routing import tornado.routing
import aiohttp import aiohttp
import aiohttp_socks import aiohttp_socks
import html.parser
import stream_providers import stream_providers
logging.basicConfig(format='[%(filename)s:%(lineno)d] %(message)s', stream=sys.stdout, level=logging.INFO) logging.basicConfig(format='[%(filename)s:%(lineno)d] %(message)s', stream=sys.stdout, level=logging.INFO)
@ -111,6 +112,30 @@ for key in providers:
for proxy in current: for proxy in current:
proxies[key].append(ProxyElem(proxy)) proxies[key].append(ProxyElem(proxy))
class MetaParser(html.parser.HTMLParser):
    """Collect selected Open Graph ``<meta>`` values from an HTML document.

    After ``feed()`` has been called, ``meta_data`` maps each accepted
    ``property`` name (see ``accepted_attrs``) to the ``content`` string of
    the matching ``<meta>`` tag. When the same property occurs in several
    tags, the value from the last such tag is kept.
    """

    def __init__(self):
        # Filled in by handle_starttag(): property name -> content string.
        self.meta_data = {}
        # Only <meta> tags whose ``property`` is listed here are recorded.
        self.accepted_attrs = [
            "og:title",
            "og:description",
            "og:image",
            "og:video:height",
            "og:video:width",
            "og:image:height",
            "og:image:width",
        ]
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Record the content of a ``<meta>`` tag with an accepted property.

        Args:
            tag: Tag name as passed in by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag; ``value`` is
                ``None`` when the attribute was written without a value.
        """
        if tag != "meta":
            return

        name = None
        content = None
        # Single pass that works regardless of whether ``content`` is written
        # before or after ``property`` in the tag.
        for attr in attrs:
            if len(attr) != 2:
                continue
            attr_name, attr_value = attr
            if (
                name is None
                and attr_name == "property"
                and attr_value in self.accepted_attrs
            ):
                name = attr_value
            elif (
                content is None
                and attr_name == "content"
                # A valueless ``content`` attribute arrives as None; consumers
                # of meta_data expect string values, so skip it.
                and isinstance(attr_value, str)
            ):
                content = attr_value

        if name is not None and content is not None:
            self.meta_data[name] = content
class UpstreamHandler(): class UpstreamHandler():
def __init__(self): def __init__(self):
self.provider = None self.provider = None
@ -200,6 +225,12 @@ class UpstreamHandler():
value = data_new.get(key) value = data_new.get(key)
if isinstance(value, str): if isinstance(value, str):
data_filtered[key] = value data_filtered[key] = value
if len(data_filtered) == 0:
resp = await session.get(self.upstream)
text = await resp.text()
parser = MetaParser()
parser.feed(text)
data_filtered = parser.meta_data
data = list(data_filtered.items()) data = list(data_filtered.items())
except Exception as e: except Exception as e:
logger.info(e) logger.info(e)