From 12bfa2352bb7af94c667c9d0fd435dd459491f16 Mon Sep 17 00:00:00 2001
From: Michiel Scholten
Date: Wed, 20 Jul 2016 22:06:17 +0200
Subject: [PATCH] More robust http codes, tag sanitation

---
Review notes (this section is not part of the commit message):
 * Revised in review. set_title_from_source catches `Exception` instead of
   using a bare `except:`, so Ctrl-C and SystemExit are no longer swallowed.
 * set_tags no longer prints debug output, skips empty tags (as produced by
   'a,,b', a trailing comma or an empty string) and accepts None.
 * get_tags returns [] instead of [''] when no tags are stored.
 * The blob ids on the `index` line below are those of the original patch;
   they do not describe this revision.
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of the patch. Hunk line counts were re-derived; the indentation of
   context lines (especially in addingbookmark) is assumed - if a hunk does
   not apply, retry with `git apply --ignore-space-change`.

 digimarks.py     | 34 +++++++++++++++++++++++++---------
 requirements.txt |  2 ++
 2 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/digimarks.py b/digimarks.py
index 03b1805..7c85dc3 100644
--- a/digimarks.py
+++ b/digimarks.py
@@ -5,6 +5,7 @@ import sys
 import requests
 import shutil
 import bs4
+from more_itertools import unique_everseen
 from urlparse import urlparse
 
 from utilkit import datetimeutil
@@ -92,29 +93,29 @@ class Bookmark(db.Model):
         """ Generate hash """
         self.url_hash = hashlib.md5(self.url).hexdigest()
 
-
     def set_title_from_source(self):
         """ Request the title by requesting the source url """
-        result = requests.get(self.url)
-        print result.status_code
-        if result.status_code == 200:
+        try:
+            result = requests.get(self.url)
+            self.http_status = result.status_code
+        except Exception:
+            # Best effort: any failure to fetch the url is recorded as a 404
+            # For example 'MissingSchema: Invalid URL 'abc': No schema supplied. Perhaps you meant http://abc?'
+            self.http_status = 404
+        if self.http_status == 200:
             html = bs4.BeautifulSoup(result.text, 'html.parser')
             try:
                 self.title = html.title.text.strip()
             except AttributeError:
                 self.title = ''
-        else:
-            self.http_status = result.status_code
         return self.title
 
-
     def set_status_code(self):
         """ Check the HTTP status of the url, as it might not exist for example """
         result = requests.head(self.url)
         self.http_status = result.status_code
         return self.http_status
 
-
     def set_favicon(self):
         """ Fetch favicon for the domain """
         # http://codingclues.eu/2009/retrieve-the-favicon-for-any-url-thanks-to-google/
@@ -128,6 +129,20 @@ class Bookmark(db.Model):
         del response
         self.favicon = domain + '.png'
 
+    def set_tags(self, tags):
+        """ Set tags from `tags`, strip and sort them """
+        tags_split = [x.strip() for x in (tags or '').split(',')]
+        # Skip empty entries (from 'a,,b' or a trailing comma) and duplicates
+        tags_split = list(unique_everseen(x for x in tags_split if x))
+        tags_split.sort()
+        self.tags = ','.join(tags_split)
+
+    def get_tags(self):
+        """ Return the tags as a list, empty when no tags are set """
+        if not self.tags:
+            return []
+        return self.tags.split(',')
+
 
     def to_dict(self):
         result = {
@@ -220,7 +235,8 @@ def addingbookmark(userkey):
             starred = False
         print starred
         if url:
-            bookmark = Bookmark(url=url, title=title, tags=tags, starred=starred, userkey=userkey)
+            bookmark = Bookmark(url=url, title=title, starred=starred, userkey=userkey)
+            bookmark.set_tags(tags)
             bookmark.set_hash()
             #bookmark.fetch_image()
             if not title:
diff --git a/requirements.txt b/requirements.txt
index 38e95cf..920b612 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 flask
 peewee
 flask-peewee
+bs4
+more_itertools
 requests
 utilkit