8 changes: 8 additions & 0 deletions .gitignore
@@ -102,3 +102,11 @@ venv.bak/

# mypy
.mypy_cache/

.idea/

*.db

*.fb2

*.html
21 changes: 21 additions & 0 deletions Json structure.md
@@ -0,0 +1,21 @@
{
    "Source:": string,
    "Feeds": [{
        "title": string,
        "date": string,
        "link": string,
        "description": string,
        "media": [{"url": string, "type": string}],
        "links": [{"url": string, "type": string}]
    }]
}
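
A hypothetical instance of this structure, with invented values for illustration (the unusual "Source:" key, colon included, matches what to_json() in rss_reader/news.py emits):

    {
        "Source:": "Example News",
        "Feeds": [{
            "title": "Sample headline",
            "date": "Wed, 20 Nov 2019 10:00:00 +0300",
            "link": "https://example.com/news/1",
            "description": "Short description of the news item",
            "media": [{"url": "https://example.com/img/1.png", "type": "img"}],
            "links": [{"url": "https://example.com/news/1", "type": "text/html"}]
        }]
    }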
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
feedparser~=5.2
requests~=2.22
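
Assuming a standard Python environment, the pinned dependencies install with:

    pip install -r requirements.txt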
Empty file added rss_reader/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions rss_reader/__main__.py
@@ -0,0 +1,4 @@
from .rss_reader import main

if __name__ == '__main__':
    main()
72 changes: 72 additions & 0 deletions rss_reader/cache.py
@@ -0,0 +1,72 @@
import sqlite3
import logging


file_path = 'cache.db'


class Cache:
    """This class holds cached news and methods for working with the cache"""
    cursor = None
    conn = None

    def __init__(self):
        """This method initializes the database cursor"""
        if self.cursor is None:
            Cache._init_cursor()
        else:
            logger = logging.getLogger('rss_reader')
            logger.error("This is a singleton class. Use get_cursor")

    @staticmethod
    def _init_cursor():
        Cache.conn = sqlite3.connect(file_path)
        Cache.cursor = Cache.conn.cursor()
        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS news(id INTEGER PRIMARY KEY,
Collaborator

In this case, non-trivial logic happens in __init__. I think it would be better to move this logic into a separate method. As it stands, the class sometimes has a strange usage pattern, for example:

    @staticmethod
    def get_cursor():
        """Static access method. """
        if Cache.cursor is None:
            Cache()
        return Cache.cursor

(A sketch of the suggested refactor appears after this file's diff.)
            title text, pub_date_key numeric, pub_date text, link text, description text, UNIQUE(link))''')
        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS links( id INTEGER PRIMARY KEY,
            link text, news numeric)''')
        Cache.cursor.execute('''CREATE TABLE IF NOT EXISTS media( id INTEGER PRIMARY KEY,
            link text, news numeric)''')

    @staticmethod
    def get_cursor():
        """Static access method."""
        if Cache.cursor is None:
            Cache()
        return Cache.cursor

    @staticmethod
    def commit():
        """This method commits pending changes to the database"""
        return Cache.conn.commit()

    @staticmethod
    def close():
        """This method closes the database connection"""
        return Cache.conn.close()

    @staticmethod
    def print_news(date):
        """This method prints cached news for the selected date to stdout"""
        Cache.get_cursor()
        Cache.cursor.execute('''SELECT * FROM news WHERE pub_date_key = ?''', (date,))
        news = Cache.cursor.fetchall()
        if len(news) == 0:
            return 1
        for elem in news:
            print('\nTitle: ', elem[1])
            print('Date: ', elem[3])
            print('Link: ', elem[4])
            print(f'Description: {elem[5]}\n')
            Cache.cursor.execute('''SELECT * FROM links WHERE news= ?''', (elem[0],))
            links = Cache.cursor.fetchall()
            i = 1
            for link in links:
                print(f'Link[{i}]: ', link[1])
                i = i + 1
            Cache.cursor.execute('''SELECT * FROM media WHERE news= ?''', (elem[0],))
            links = Cache.cursor.fetchall()
            for link in links:
                print(f'Link[{i}]: ', link[1])
                i = i + 1
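
A minimal sketch of the refactor suggested in the review comment above, assuming get_cursor() remains the single public entry point; _create_tables is a hypothetical name and only the first table is shown:

    import sqlite3

    class Cache:
        """Singleton-style holder for the sqlite3 connection and cursor."""
        conn = None
        cursor = None

        @classmethod
        def get_cursor(cls):
            """Lazily connect and create the tables on first access."""
            if cls.cursor is None:
                cls.conn = sqlite3.connect('cache.db')
                cls.cursor = cls.conn.cursor()
                cls._create_tables()
            return cls.cursor

        @classmethod
        def _create_tables(cls):
            # Same schema as in the PR, just moved out of __init__
            cls.cursor.execute('''CREATE TABLE IF NOT EXISTS news(id INTEGER PRIMARY KEY,
                title text, pub_date_key numeric, pub_date text, link text,
                description text, UNIQUE(link))''')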
186 changes: 186 additions & 0 deletions rss_reader/news.py
@@ -0,0 +1,186 @@
import html
import os
import re
import json
import logging
from .cache import Cache
import base64
import requests


class News:
    """This class contains news and methods for working with news"""

    http_header = 'http'
    err_media_type = 'No type'

    def __init__(self, feeds_dict, limit):

        logger = logging.getLogger('rss_reader')
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        file_handler = logging.FileHandler('rss_reader_logs.log')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        logger.setLevel(logging.INFO)
        self.news = dict()
        self.all_news = list()

        self.name_of_source = feeds_dict.feed['title']

        real_limit = len(feeds_dict.entries)
        if limit > 0:
            if limit < len(feeds_dict.entries):
                real_limit = limit

        cursor = Cache.get_cursor()

        for i in range(real_limit):
            list_to_cache = list()
            self.news['title'] = html.unescape(feeds_dict.entries[i].title)
            self.news['date'] = html.unescape(feeds_dict.entries[i].published)
            self.news['link'] = html.unescape(feeds_dict.entries[i].link)
            self.news['description'] = self.clean_from_tags(html.unescape(feeds_dict.entries[i].description))

            date_dict = feeds_dict.entries[i].published_parsed
            date_str = str(date_dict.tm_year) + str(date_dict.tm_mon) + str(date_dict.tm_mday)

            list_to_cache.append(self.news['title'])
            list_to_cache.append(date_str)
            list_to_cache.append(self.news['date'])
            list_to_cache.append(self.news['link'])
            list_to_cache.append(self.news['description'])

            self.news['media'] = self._parse_media(feeds_dict.entries[i])
            self.news['links'] = self._parse_links(feeds_dict.entries[i])

            self._cache_feed(list_to_cache, self.news['links'], self.news['media'], cursor)

            self.all_news.append(self.news.copy())
        Cache.close()

    @staticmethod
    def _parse_links(news_dict):
        """This method parses the links of a feed entry"""
        list_of_links = list()
        if news_dict.links:
            for elem in news_dict.links:
                list_of_links.append({'url': elem.setdefault('url', None), 'type': elem.setdefault('type', None)})
        return list_of_links

    def _parse_media(self, news_dict):
        """This method parses the media content of a feed entry"""
        if news_dict.setdefault('media_content', None):
            media = list()
            if news_dict.media_content:
                for elem in news_dict.media_content:
                    if elem['url'].rfind(self.http_header) > 0:
                        # Some sources put two links into one media string, and only the second one is the image
                        links = elem['url'].split(self.http_header)
                        media.append({'url': self.http_header + links[2], 'type': "img"})
                    else:
                        if elem.setdefault('url', None):
                            media.append({'url': elem.setdefault('url', None),
                                          'type': elem.setdefault('type', None)})
            return media
        else:
            return ''

    def _cache_feed(self, list_of_main_info, list_of_links, list_of_media, cursor):
        """This method writes a feed entry to the cache"""
        cursor.execute('''INSERT or IGNORE INTO news (title, pub_date_key, pub_date, link, description)
            VALUES(?,?,?,?,?)''', list_of_main_info)
        ids = cursor.lastrowid

        list_to_cache_of_links = list()
        for elem in list_of_links:
            list_to_cache_of_links.append(elem.setdefault('url', None))
            list_to_cache_of_links.append(ids)
            cursor.execute('''INSERT or IGNORE INTO links (link, news) VALUES(?,?)''', list_to_cache_of_links)
            list_to_cache_of_links.clear()

        list_to_cache_of_media = list()
        for elem in list_of_media:
            list_to_cache_of_media.append(elem.setdefault('url', None))
            list_to_cache_of_media.append(ids)
            cursor.execute('''INSERT or IGNORE INTO media (link, news) VALUES(?,?)''', list_to_cache_of_media)
            list_to_cache_of_media.clear()

        Cache.commit()

    @staticmethod
    def clean_from_tags(text_with_tags):
        """This method deletes HTML tags from a string"""
        return re.sub('<.*?>', '', text_with_tags)

    def print(self):
        """This method prints news to stdout in a readable format"""
        print(f'Source: {self.name_of_source}\n')
        for elem in self.all_news:
            print(f'Title: {elem["title"]}')
            print(f'Date: {elem["date"]}')
            print(f'Link: {elem["link"]}')
            print(f'Description: {elem["description"]}\n')

            j = 1
            print('Links: ')
            for link in elem['links']:
                print(f'[{j}] {link["url"]} ({link["type"]})')
                j = j + 1

            if elem.setdefault('media', None):
                print("Media: ")
                for media in elem['media']:
                    print(f'[{j}] {media["url"]} ({media["type"]})')
                    j = j + 1

    def to_json(self):
        """This method returns a UTF-8 encoded JSON string with the news"""
        return json.dumps({'Source:': self.name_of_source, 'Feeds': self.all_news}, ensure_ascii=False).encode('utf8')

    def create_fb2(self, filepath):
        """This method writes the news to an fb2 file at the given path"""
        # Append the extension only if it is missing; the original left filename
        # unassigned (a NameError) when filepath already ended with ".fb2"
        filename = filepath if filepath[-4:] == ".fb2" else filepath + ".fb2"
        with open(filename, 'w', encoding="utf-8") as fb2_file:
            fb2_file.write('<?xml version="1.0" ?>\n')
            fb2_file.write('<FictionBook xmlns:l="http://www.w3.org/1999/xlink"><description/><body>')
            fb2_file.write(f'<title><p>{self.name_of_source.replace("&", "&amp;")}</p></title><empty-line/>')
            for elem in self.all_news:
                fb2_file.write(f'<section><title><p>{elem["title"].replace("&", "&amp;")}</p></title>')
                fb2_file.write(f'<p>Date of posting: {elem["date"].replace("&", "&amp;")}</p>')
                fb2_file.write(f'<p>{elem["description"].replace("&", "&amp;")}</p><empty-line/>')
                fb2_file.write(f'<p>Source: {elem["link"]}</p></section>'.replace("&", "&amp;"))

                for media in elem['media']:
                    if media['type'] != self.err_media_type:
                        fb2_file.write(f'''<empty-line/><empty-line/>
                            <image l:href="#{media["url"]}"/><empty-line/><empty-line/>''')
            fb2_file.write('</body>')
            for elem in self.all_news:
                for media in elem['media']:
                    if media['type'] != self.err_media_type:
                        fb2_file.write(f'<binary content-type="image/png" id="{media["url"]}">')
                        content = base64.b64encode(requests.get(media["url"]).content)
                        fb2_file.write(content.decode('ascii'))
                        fb2_file.write('</binary>')
            fb2_file.write('</FictionBook>')

        print(f'All news can be found at {os.path.realpath(filename)}')

    def create_html(self, filepath):
        """This method writes the news to an HTML file at the given path"""
        # Same fix as in create_fb2: only append the extension when it is missing
        filename = filepath if filepath[-5:] == ".html" else filepath + ".html"
        with open(filename, 'w', encoding="utf-8") as html_file:
            html_file.write(f'<html>\n<head>{self.name_of_source}</head>\n<body>\n')
            for elem in self.all_news:
                html_file.write(f'<h3>{elem["title"]}</h3>')
                html_file.write(f'<p>Date of posting: {elem["date"]}</p>')
                html_file.write(f'<p>{elem["description"]}</p>')
                html_file.write(f'<p><a href="{elem["link"]}">Link to source</a></p>')

                for media in elem['media']:
                    if media['type'] != self.err_media_type:
                        html_file.write(f'<p><img src="{media["url"]}"></p>')
                html_file.write('<hr>')
            html_file.write('</body></html>')
        print(f'All news can be found at {os.path.realpath(filename)}')
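
A minimal usage sketch of the News class, mirroring how rss_reader.py below calls it (the feed URL is illustrative):

    import feedparser
    from rss_reader.news import News

    feeds = feedparser.parse('https://news.yahoo.com/rss/')  # any RSS feed URL
    news = News(feeds, 2)             # parse and cache at most two entries
    news.print()                      # human-readable output to stdout
    print(news.to_json().decode())    # the same news as a JSON string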
77 changes: 77 additions & 0 deletions rss_reader/rss_reader.py
@@ -0,0 +1,77 @@
import argparse
import feedparser
from .news import News
from .cache import Cache
import logging
import sys


version = '1.5'


def main():
    """Main function of the program"""

    logger = logging.getLogger('rss_reader')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler = logging.FileHandler('rss_reader_logs.log')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)

    parser = argparse.ArgumentParser(description='Python RSS-reader')
    parser.add_argument('URL', type=str, help='RSS URL')
    parser.add_argument('--version', help='Print version info', action='version', version=version)
    parser.add_argument('--json', help='Print result as JSON in stdout', action='store_true')
    parser.add_argument('-V', '--verbose', help='Outputs verbose status messages', action='store_true')
    parser.add_argument('-L', '--limit', help='Limit news topics if this parameter is provided', type=int, default=0)
    parser.add_argument('--date', help='Find news in cache if this parameter is provided', type=int, default=0)
    parser.add_argument('--to-html', help='Create an HTML file with news', type=str, default="")
    parser.add_argument('--to-fb2', help='Create an fb2 file with news', type=str, default="")
    args = parser.parse_args()

    if args.verbose:
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setFormatter(formatter)
        logger.addHandler(stdout_handler)

    if args.date:
        logger.info('Starting to read from cache')
        state = Cache.print_news(args.date)
        if state == 1:
            print(f'No news exist with a publication date of {args.date}.'
                  '\nMake sure your date is in the %Y%m%d format', file=sys.stderr)
        else:
            logger.info('News from cache were read')
    else:

        feeds = feedparser.parse(args.URL)

        if feeds.bozo:
            print('This is not well-formed XML', file=sys.stderr)
            exit()

        else:
            logger.info('The XML file with news was received and is correct')

        news = News(feeds, args.limit)
        logger.info('News was parsed')

        if args.to_html:
            news.create_html(args.to_html)

        elif args.to_fb2:
            news.create_fb2(args.to_fb2)

        elif args.json:
            print(news.to_json().decode())
            logger.info('News was displayed in stdout in JSON format')
        else:
            news.print()
            logger.info('News was displayed in stdout in a readable format')

    logger.info('Program is over')


if __name__ == '__main__':
    main()
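
A few invocation examples based on the arguments defined above, assuming the package is run from the project root (the URL is illustrative):

    python -m rss_reader https://news.yahoo.com/rss/ --limit 3
    python -m rss_reader https://news.yahoo.com/rss/ --json --verbose
    python -m rss_reader https://news.yahoo.com/rss/ --date 20191120
    python -m rss_reader https://news.yahoo.com/rss/ --to-html news_page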