add html metadata parsing
This commit is contained in:
		@@ -12,6 +12,7 @@ import tornado.web
 | 
			
		||||
import tornado.routing
 | 
			
		||||
import aiohttp
 | 
			
		||||
import aiohttp_socks
 | 
			
		||||
import html.parser
 | 
			
		||||
import stream_providers
 | 
			
		||||
 | 
			
		||||
logging.basicConfig(format='[%(filename)s:%(lineno)d] %(message)s', stream=sys.stdout, level=logging.INFO)
 | 
			
		||||
@@ -111,6 +112,30 @@ for key in providers:
 | 
			
		||||
        for proxy in current:
 | 
			
		||||
            proxies[key].append(ProxyElem(proxy))
 | 
			
		||||
 | 
			
		||||
class MetaParser(html.parser.HTMLParser):
    """Collect selected Open Graph ``<meta>`` tags from an HTML document.

    After ``feed()`` has been called, ``meta_data`` maps each accepted
    ``property`` value (e.g. ``"og:title"``) to the string found in the
    ``content`` attribute of the same tag. If the same property occurs in
    several tags, the last one fed wins.
    """

    def __init__(self):
        # Results gathered so far: property name -> content string.
        self.meta_data = {}
        # Only <meta property="..."> values listed here are recorded.
        self.accepted_attrs = [
            "og:title",
            "og:description",
            "og:image",
            "og:video:height",
            "og:video:width",
            "og:image:height",
            "og:image:width",
        ]
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Record the content of an accepted ``<meta property=...>`` tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag; ``value`` is
                ``None`` for an attribute written without a value.
        """
        if tag != "meta":
            return

        name = None
        # The attribute list is walked twice so that a ``content`` attribute
        # written *before* ``property`` is still found on the second pass.
        for attr in (attrs + attrs):
            if len(attr) != 2:
                continue
            key, value = attr
            if name is None:
                if key == "property" and value in self.accepted_attrs:
                    name = value
            elif key == "content" and isinstance(value, str):
                # Valueless ``content`` attributes arrive as None; skip them
                # so that meta_data only ever holds string values.
                self.meta_data[name] = value
                return
 | 
			
		||||
 | 
			
		||||
class UpstreamHandler():
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        self.provider = None
 | 
			
		||||
@@ -184,23 +209,29 @@ class UpstreamHandler():
 | 
			
		||||
            async with self.proxy.session() as session:
 | 
			
		||||
                resp = await session.get(embed_url)
 | 
			
		||||
                text = await resp.text()
 | 
			
		||||
            data_raw = json.loads(text)
 | 
			
		||||
            if isinstance(data_raw, dict):
 | 
			
		||||
                data_new = {}
 | 
			
		||||
                data_valid = True
 | 
			
		||||
                data_new["og:title"] = data_raw.get("title")
 | 
			
		||||
                data_new["og:description"] = data_raw.get("author_name")
 | 
			
		||||
                data_new["og:image"] = data_raw.get("thumbnail_url")
 | 
			
		||||
                data_new["og:video:height"] = data_raw.get("height")
 | 
			
		||||
                data_new["og:video:width"] = data_raw.get("width")
 | 
			
		||||
                data_new["og:image:height"] = data_raw.get("thumbnail_height")
 | 
			
		||||
                data_new["og:image:width"] = data_raw.get("thumbnail_width")
 | 
			
		||||
                data_filtered = {}
 | 
			
		||||
                for key in data_new:
 | 
			
		||||
                    value = data_new.get(key)
 | 
			
		||||
                    if isinstance(value, str):
 | 
			
		||||
                        data_filtered[key] = value
 | 
			
		||||
                data = list(data_filtered.items())
 | 
			
		||||
                data_raw = json.loads(text)
 | 
			
		||||
                if isinstance(data_raw, dict):
 | 
			
		||||
                    data_new = {}
 | 
			
		||||
                    data_valid = True
 | 
			
		||||
                    data_new["og:title"] = data_raw.get("title")
 | 
			
		||||
                    data_new["og:description"] = data_raw.get("author_name")
 | 
			
		||||
                    data_new["og:image"] = data_raw.get("thumbnail_url")
 | 
			
		||||
                    data_new["og:video:height"] = data_raw.get("height")
 | 
			
		||||
                    data_new["og:video:width"] = data_raw.get("width")
 | 
			
		||||
                    data_new["og:image:height"] = data_raw.get("thumbnail_height")
 | 
			
		||||
                    data_new["og:image:width"] = data_raw.get("thumbnail_width")
 | 
			
		||||
                    data_filtered = {}
 | 
			
		||||
                    for key in data_new:
 | 
			
		||||
                        value = data_new.get(key)
 | 
			
		||||
                        if isinstance(value, str):
 | 
			
		||||
                            data_filtered[key] = value
 | 
			
		||||
                    if len(data_filtered) == 0:
 | 
			
		||||
                        resp = await session.get(self.upstream)
 | 
			
		||||
                        text = await resp.text()
 | 
			
		||||
                        parser = MetaParser()
 | 
			
		||||
                        parser.feed(text)
 | 
			
		||||
                        data_filtered = parser.meta_data
 | 
			
		||||
                    data = list(data_filtered.items())
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.info(e)
 | 
			
		||||
        logger.info(data)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user