8 changes: 8 additions & 0 deletions .gitignore
@@ -102,3 +102,11 @@ venv.bak/

# mypy
.mypy_cache/

.idea/

*.db

*.fb2

*.html
21 changes: 21 additions & 0 deletions Json structure.md
@@ -0,0 +1,21 @@
{
    "Source:": string,
    "Feeds": [{
        "title": string,
        "date": string,
        "link": string,
        "description": string,
        "media": [{"url": string, "type": string}],
        "links": [{"url": string, "type": string}]
    }]
}
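
A hypothetical instance of this structure, with invented values for illustration (the unusual "Source:" key, colon included, matches what to_json() in rss_reader/news.py emits):

    {
        "Source:": "Example News",
        "Feeds": [{
            "title": "Sample headline",
            "date": "Wed, 20 Nov 2019 10:00:00 +0300",
            "link": "https://example.com/news/1",
            "description": "Short description of the news item",
            "media": [{"url": "https://example.com/img/1.png", "type": "img"}],
            "links": [{"url": "https://example.com/news/1", "type": "text/html"}]
        }]
    }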
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
feedparser~=5.2
requests~=2.22
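
Assuming a standard Python environment, the pinned dependencies install with:

    pip install -r requirements.txt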
Empty file added rss_reader/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions rss_reader/__main__.py
@@ -0,0 +1,4 @@
from .rss_reader import main

if __name__ == '__main__':
    main()
72 changes: 72 additions & 0 deletions rss_reader/cache.py
@@ -0,0 +1,72 @@
import sqlite3
import logging


file_path = 'cache.db'


class Cache:
    """This class holds cached news and methods for working with the cache"""
    cursor = None
    conn = None

    def __init__(self):
        """This method initializes the database cursor"""
        if self.cursor is None:
            Cache._init_cursor()
        else:
            logger = logging.getLogger('rss_reader')
            logger.error("This is a singleton class. Use get_cursor")

    @staticmethod
    def _init_cursor():
        Cache.conn = sqlite3.connect(file_path)
        Cache.cursor = Cache.conn.cursor()
        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS news(id INTEGER PRIMARY KEY,
Collaborator

In this case, non-trivial logic happens in __init__. I think it would be better to move this logic into a separate method. As it stands, the class sometimes has a strange usage pattern, for example:

    @staticmethod
    def get_cursor():
        """Static access method. """
        if Cache.cursor is None:
            Cache()
        return Cache.cursor

(A sketch of the suggested refactor appears after this file's diff.)
            title text, pub_date_key numeric, pub_date text, link text, description text, UNIQUE(link))''')
        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS links( id INTEGER PRIMARY KEY,
            link text, news numeric)''')
        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS media( id INTEGER PRIMARY KEY,
            link text, news numeric)''')

    @staticmethod
    def get_cursor():
        """Static access method."""
        if Cache.cursor is None:
            Cache()
        return Cache.cursor

    @staticmethod
    def commit():
        """This method commits pending changes to the database"""
        return Cache.conn.commit()

    @staticmethod
    def close():
        """This method closes the database connection"""
        return Cache.conn.close()

    @staticmethod
    def print_news(date):
        """This method prints cached news for the selected date to stdout"""
        Cache.get_cursor()
        Cache.cursor.execute('''SELECT * FROM news WHERE pub_date_key = ?''', (date,))
        news = Cache.cursor.fetchall()
        if len(news) == 0:
            return 1
        for elem in news:
            print('\nTitle: ', elem[1])
            print('Date: ', elem[3])
            print('Link: ', elem[4])
            print(f'Description: {elem[5]}\n')
            Cache.cursor.execute('''SELECT * FROM links WHERE news= ?''', (elem[0],))
            links = Cache.cursor.fetchall()
            i = 1
            for link in links:
                print(f'Link[{i}]: ', link[1])
                i = i + 1
            Cache.cursor.execute('''SELECT * FROM media WHERE news= ?''', (elem[0],))
            links = Cache.cursor.fetchall()
            for link in links:
                print(f'Link[{i}]: ', link[1])
                i = i + 1
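
A minimal sketch of the refactor suggested in the review comment above, assuming get_cursor() remains the single public entry point; _create_tables is a hypothetical name and only the first table is shown:

    import sqlite3

    class Cache:
        """Singleton-style holder for the sqlite3 connection and cursor."""
        conn = None
        cursor = None

        @classmethod
        def get_cursor(cls):
            """Lazily connect and create the tables on first access."""
            if cls.cursor is None:
                cls.conn = sqlite3.connect('cache.db')
                cls.cursor = cls.conn.cursor()
                cls._create_tables()
            return cls.cursor

        @classmethod
        def _create_tables(cls):
            # Same schema as in the PR, just moved out of __init__
            cls.cursor.execute('''CREATE TABLE IF NOT EXISTS news(id INTEGER PRIMARY KEY,
                title text, pub_date_key numeric, pub_date text, link text,
                description text, UNIQUE(link))''')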
186 changes: 186 additions & 0 deletions rss_reader/news.py
@@ -0,0 +1,186 @@
import html
import os
import re
import json
import logging
from .cache import Cache
import base64
import requests


class News:
    """This class contains news and methods for working with news"""

    http_header = 'http'
    err_media_type = 'No type'

    def __init__(self, feeds_dict, limit):

        logger = logging.getLogger('rss_reader')
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        file_handler = logging.FileHandler('rss_reader_logs.log')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        logger.setLevel(logging.INFO)
        self.news = dict()
        self.all_news = list()

        self.name_of_source = feeds_dict.feed['title']

        real_limit = len(feeds_dict.entries)
        if limit > 0:
            if limit < len(feeds_dict.entries):
                real_limit = limit

        cursor = Cache.get_cursor()

        for i in range(real_limit):
            list_to_cache = list()
            self.news['title'] = html.unescape(feeds_dict.entries[i].title)
            self.news['date'] = html.unescape(feeds_dict.entries[i].published)
            self.news['link'] = html.unescape(feeds_dict.entries[i].link)
            self.news['description'] = self.clean_from_tags(html.unescape(feeds_dict.entries[i].description))

            date_dict = feeds_dict.entries[i].published_parsed
            date_str = str(date_dict.tm_year) + str(date_dict.tm_mon) + str(date_dict.tm_mday)

            list_to_cache.append(self.news['title'])
            list_to_cache.append(date_str)
            list_to_cache.append(self.news['date'])
            list_to_cache.append(self.news['link'])
            list_to_cache.append(self.news['description'])

            self.news['media'] = self._parse_media(feeds_dict.entries[i])
            self.news['links'] = self._parse_links(feeds_dict.entries[i])

            self._cache_feed(list_to_cache, self.news['links'], self.news['media'], cursor)

            self.all_news.append(self.news.copy())
        Cache.close()

    @staticmethod
    def _parse_links(news_dict):
        """This method parses the links of a feed entry"""
        list_of_links = list()
        if news_dict.links:
            for elem in news_dict.links:
                list_of_links.append({'url': elem.setdefault('url', None), 'type': elem.setdefault('type', None)})
        return list_of_links

    def _parse_media(self, news_dict):
        """This method parses the media content of a feed entry"""
        if news_dict.setdefault('media_content', None):
            media = list()
            if news_dict.media_content:
                for elem in news_dict.media_content:
                    if elem['url'].rfind(self.http_header) > 0:
                        # Some sources put two links into one media string, and only the second one is the image
                        links = elem['url'].split(self.http_header)
                        media.append({'url': self.http_header + links[2], 'type': "img"})
                    else:
                        if elem.setdefault('url', None):
                            media.append({'url': elem.setdefault('url', None),
                                          'type': elem.setdefault('type', None)})
            return media
        else:
            return ''

    def _cache_feed(self, list_of_main_info, list_of_links, list_of_media, cursor):
        """This method writes a feed entry to the cache"""
        cursor.execute('''INSERT or IGNORE INTO news (title, pub_date_key, pub_date, link, description)
            VALUES(?,?,?,?,?)''', list_of_main_info)
        ids = cursor.lastrowid

        list_to_cache_of_links = list()
        for elem in list_of_links:
            list_to_cache_of_links.append(elem.setdefault('url', None))
            list_to_cache_of_links.append(ids)
            cursor.execute('''INSERT or IGNORE INTO links (link, news) VALUES(?,?)''', list_to_cache_of_links)
            list_to_cache_of_links.clear()

        list_to_cache_of_media = list()
        for elem in list_of_media:
            list_to_cache_of_media.append(elem.setdefault('url', None))
            list_to_cache_of_media.append(ids)
            cursor.execute('''INSERT or IGNORE INTO media (link, news) VALUES(?,?)''', list_to_cache_of_media)
            list_to_cache_of_media.clear()

        Cache.commit()

    @staticmethod
    def clean_from_tags(text_with_tags):
        """This method deletes HTML tags from a string"""
        return re.sub('<.*?>', '', text_with_tags)

    def print(self):
        """This method prints news to stdout in a readable format"""
        print(f'Source: {self.name_of_source}\n')
        for elem in self.all_news:
            print(f'Title: {elem["title"]}')
            print(f'Date: {elem["date"]}')
            print(f'Link: {elem["link"]}')
            print(f'Description: {elem["description"]}\n')

            j = 1
            print('Links: ')
            for link in elem['links']:
                print(f'[{j}] {link["url"]} ({link["type"]})')
                j = j + 1

            if elem.setdefault('media', None):
                print("Media: ")
                for media in elem['media']:
                    print(f'[{j}] {media["url"]} ({media["type"]})')
                    j = j + 1

    def to_json(self):
        """This method returns a UTF-8 encoded JSON string with the news"""
        return json.dumps({'Source:': self.name_of_source, 'Feeds': self.all_news}, ensure_ascii=False).encode('utf8')

    def create_fb2(self, filepath):
        """This method writes the news to an fb2 file at the given path"""
        # Append the extension only if it is missing; the original left filename
        # unassigned (a NameError) when filepath already ended with ".fb2"
        filename = filepath if filepath[-4:] == ".fb2" else filepath + ".fb2"
        with open(filename, 'w', encoding="utf-8") as fb2_file:
            fb2_file.write('<?xml version="1.0" ?>\n')
            fb2_file.write('<FictionBook xmlns:l="http://www.w3.org/1999/xlink"><description/><body>')
            fb2_file.write(f'<title><p>{self.name_of_source.replace("&", "&amp;")}</p></title><empty-line/>')
            for elem in self.all_news:
                fb2_file.write(f'<section><title><p>{elem["title"].replace("&", "&amp;")}</p></title>')
                fb2_file.write(f'<p>Date of posting: {elem["date"].replace("&", "&amp;")}</p>')
                fb2_file.write(f'<p>{elem["description"].replace("&", "&amp;")}</p><empty-line/>')
                fb2_file.write(f'<p>Source: {elem["link"]}</p></section>'.replace("&", "&amp;"))

                for media in elem['media']:
                    if media['type'] != self.err_media_type:
                        fb2_file.write(f'''<empty-line/><empty-line/>
                            <image l:href="#{media["url"]}"/><empty-line/><empty-line/>''')
            fb2_file.write('</body>')
            for elem in self.all_news:
                for media in elem['media']:
                    if media['type'] != self.err_media_type:
                        fb2_file.write(f'<binary content-type="image/png" id="{media["url"]}">')
                        content = base64.b64encode(requests.get(media["url"]).content)
                        fb2_file.write(content.decode('ascii'))
                        fb2_file.write('</binary>')
            fb2_file.write('</FictionBook>')

        print(f'All news can be found at {os.path.realpath(filename)}')

    def create_html(self, filepath):
        """This method writes the news to an HTML file at the given path"""
        # Same fix as in create_fb2: only append the extension when it is missing
        filename = filepath if filepath[-5:] == ".html" else filepath + ".html"
        with open(filename, 'w', encoding="utf-8") as html_file:
            html_file.write(f'<html>\n<head>{self.name_of_source}</head>\n<body>\n')
            for elem in self.all_news:
                html_file.write(f'<h3>{elem["title"]}</h3>')
                html_file.write(f'<p>Date of posting: {elem["date"]}</p>')
                html_file.write(f'<p>{elem["description"]}</p>')
                html_file.write(f'<p><a href="{elem["link"]}">Link to source</a></p>')

                for media in elem['media']:
                    if media['type'] != self.err_media_type:
                        html_file.write(f'<p><img src="{media["url"]}"></p>')
                html_file.write('<hr>')
            html_file.write('</body></html>')
        print(f'All news can be found at {os.path.realpath(filename)}')
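
A minimal usage sketch of the News class, mirroring how rss_reader.py below calls it (the feed URL is illustrative):

    import feedparser
    from rss_reader.news import News

    feeds = feedparser.parse('https://news.yahoo.com/rss/')  # any RSS feed URL
    news = News(feeds, 2)             # parse and cache at most two entries
    news.print()                      # human-readable output to stdout
    print(news.to_json().decode())    # the same news as a JSON string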
77 changes: 77 additions & 0 deletions rss_reader/rss_reader.py
@@ -0,0 +1,77 @@
import argparse
import feedparser
from .news import News
from .cache import Cache
import logging
import sys


version = '1.5'


def main():
    """Main function of the program"""

    logger = logging.getLogger('rss_reader')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler = logging.FileHandler('rss_reader_logs.log')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)

    parser = argparse.ArgumentParser(description='Python RSS-reader')
    parser.add_argument('URL', type=str, help='RSS URL')
    parser.add_argument('--version', help='Print version info', action='version', version=version)
    parser.add_argument('--json', help='Print result as JSON in stdout', action='store_true')
    parser.add_argument('-V', '--verbose', help='Outputs verbose status messages', action='store_true')
    parser.add_argument('-L', '--limit', help='Limit news topics if this parameter is provided', type=int, default=0)
    parser.add_argument('--date', help='Find news in cache if this parameter is provided', type=int, default=0)
    parser.add_argument('--to-html', help='Create an HTML file with news', type=str, default="")
    parser.add_argument('--to-fb2', help='Create an fb2 file with news', type=str, default="")
    args = parser.parse_args()

    if args.verbose:
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setFormatter(formatter)
        logger.addHandler(stdout_handler)

    if args.date:
        logger.info('Starting to read from cache')
        state = Cache.print_news(args.date)
        if state == 1:
            print(f'No news exist with a publication date of {args.date}.'
                  '\nMake sure your date is in the %Y%m%d format', file=sys.stderr)
        else:
            logger.info('News from cache were read')
    else:

        feeds = feedparser.parse(args.URL)

        if feeds.bozo:
            print('This is not well-formed XML', file=sys.stderr)
            exit()

        else:
            logger.info('The XML file with news was received and is correct')

        news = News(feeds, args.limit)
        logger.info('News was parsed')

        if args.to_html:
            news.create_html(args.to_html)

        elif args.to_fb2:
            news.create_fb2(args.to_fb2)

        elif args.json:
            print(news.to_json().decode())
            logger.info('News was displayed in stdout in JSON format')
        else:
            news.print()
            logger.info('News was displayed in stdout in a readable format')

    logger.info('Program is over')


if __name__ == '__main__':
    main()
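
A few invocation examples based on the arguments defined above, assuming the package is run from the project root (the URL is illustrative):

    python -m rss_reader https://news.yahoo.com/rss/ --limit 3
    python -m rss_reader https://news.yahoo.com/rss/ --json --verbose
    python -m rss_reader https://news.yahoo.com/rss/ --date 20191120
    python -m rss_reader https://news.yahoo.com/rss/ --to-html news_page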