# coding: utf-8

import urllib
import urllib2
import cookielib
from threading import Thread
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
from BeautifulSoup import Comment
from BeautifulSoup import CData
from BeautifulSoup import NavigableString
from BeautifulSoup import Tag
from urlparse import urlparse
from urlparse import urlunparse
from urlparse import urlsplit
from urlparse import urlunsplit
from datetime import datetime
from datetime import date
from datetime import timedelta
import uuid
import os
import re
import locale
import glob
import StringIO
import gzip
import shutil
import sys
import traceback


# English month abbreviations as they appear in RFC 822 dates -> month number.
month_names = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
    "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12,
}
# Named time zones -> "+HHMM" / "-HHMM" offsets in the form parse_date() slices.
zone_names = {
    "UT": "+0000", "GMT": "+0000", "EST": "-0500", "EDT": "-0400",
    "CST": "-0600", "CDT": "-0500", "MST": "-0700", "MDT": "-0600",
    "PST": "-0800", "PDT": "-0700",
    "Z": "+0000", "A": "-0100", "M": "-1200", "N": "+0100", "Y": "+1200",
}
# Markup massage rule passed to BeautifulStoneSoup: rewrites a self-closing tag
# such as <x a="b"/> into the explicit pair <x a="b"></x>.
# The pattern is a raw string so that "\s" reaches the regex engine as written
# instead of relying on Python passing an unknown string escape through.
massage = [(re.compile(r'<([^<>\s]+)([^<>]*)/>'),
    lambda x: "<" + x.group(1) + x.group(2) + "></" + x.group(1) + ">")]


# Default HTML snippet for one feed item: a link placeholder marked with the
# "feedcircuit-item-link" class and a body placeholder marked with the
# "feedcircuit-item-body" class.
feed_item_template_default = """
<b><a class="feedcircuit-item-link"/></b>
<div class="feedcircuit-item-body">
</div>
<br>
"""
# Default skeleton for saved pages; Item falls back to it when no page_template
# is supplied. It declares UTF-8 and leaves <body> empty for the content.
page_template_default = """
<html>
    <head>
        <meta http-equiv="Content-Type" content="text/html;charset=utf-8" >
    </head>
    <body>
    </body>
</html>
"""
# Default feed update header. NOTE(review): %title and %timestamp look like
# placeholders substituted elsewhere in the file - confirm against the user.
feed_update_template_default = """%title, %timestamp <hr>"""
# NOTE(review): presumably the prefix of HTML comments that mark feed items;
# not referenced in this part of the file - confirm.
item_comment_prefix = "feed item: "
# Link captions searched for in a downloaded page. Item compares them with the
# stripped, lower-cased text of the page, so entries must be lower case.
print_version_keywords_default = [u"printable version", u"версия для печати"]
single_page_keywords_default = [u"single page", u"одной страницей"]
next_page_keywords_default = [u"next page"]

# html5lib is optional: when it (or its treebuilders module) cannot be
# imported, parsing falls back to plain BeautifulSoup.
try:
    import html5lib
    from html5lib import treebuilders
except ImportError:
    html5lib = None


def _html5lib_parser():
    """ Returns an html5lib parser that builds BeautifulSoup trees. """
    return html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))


def parse_html(data = None, fromEncoding = None):
    """ Parses a complete html document.

    data - markup (string or file-like object); when empty or omitted an
    empty BeautifulSoup document is returned.
    fromEncoding - encoding hint passed on to the underlying parser.
    """
    if not data:
        return BeautifulSoup()
    if html5lib is not None:
        return _html5lib_parser().parse(data, encoding = fromEncoding)
    return BeautifulSoup(data, fromEncoding = fromEncoding)


def parse_html_fragment(data, fromEncoding = None):
    """ Parses a piece of html that is not necessarily a whole document.

    With html5lib the fragment parser is used and fromEncoding is now passed
    to it (it used to be silently dropped in this branch only); without
    html5lib the fragment is parsed like a document.
    """
    if html5lib is not None:
        return _html5lib_parser().parseFragment(data, encoding = fromEncoding)
    return parse_html(data, fromEncoding = fromEncoding)


def parse_xml(data):
    """ Parses a feed with BeautifulStoneSoup.

    The module-level massage rule is applied first, so self-closing tags are
    expanded before parsing.
    """
    return BeautifulStoneSoup(data, markupMassage = massage)


def parse_date(str):
    """ RSS or Atom feed date parsing.

    RSS and Atom feeds contains locale independent dates so it is impossible to use strptime().

    Two layouts are recognised:
    - RFC 822 style, at least five space separated fields:
      "[Mon,] 17 May 2009 10:20:30 +0200" (zone may be a name from zone_names);
    - ISO 8601 / RFC 3339 style: "2009-05-17[T| ]10:20[:30[.123]][Z|+hh:mm]",
      where month, day and time are optional.

    Returns a tuple (naive datetime as written in the string, timedelta with
    the zone offset). Malformed input raises ValueError, KeyError or
    IndexError from the underlying conversions.
    """
    str = str.strip()
    parts = str.split(" ")
    # ms holds microseconds, which is what datetime() expects in that slot.
    (y, m, d, hh, mm, ss, ms) = (0, 1, 1, 0, 0, 0, 0)
    if len(parts) >= 5:
        # RFC 822: drop the optional "Mon," day-of-week field.
        if parts[0].find(",") >= 0:
            parts = parts[1:]
        zone = zone_names.get(parts[-1], parts[-1])
        time_parts = parts[3].split(":")
        if len(time_parts) == 3:
            ss = int(time_parts[2])
        (y, m, d, hh, mm) = (int(parts[2]), month_names[parts[1]], int(parts[0]),
            int(time_parts[0]), int(time_parts[1]))
    else:
        zone = "+0000"
        if str[-1:] == "Z":
            str = str[:-1]
        elif len(str) > 6 and str[-6] in "+-" and str[-3] == ":":
            # Only a trailing "+hh:mm" / "-hh:mm" is an offset. Without the
            # colon check a bare date such as "2009-05-17" was taken for an
            # offset of "-05-17"; without the length check short strings
            # raised IndexError.
            zone = str[-6:].replace(":", "")
            str = str[:-6]
        parts = str.split(" ")
        if len(parts) == 1:
            parts = str.strip().split("T")
        date_parts = parts[0].split("-")
        y = int(date_parts[0])
        if len(date_parts) > 1:
            m = int(date_parts[1])
        if len(date_parts) > 2:
            d = int(date_parts[2])
        if len(parts) > 1:
            time_parts = parts[1].split(":")
            hh = int(time_parts[0])
            mm = int(time_parts[1][:2])
            if len(time_parts) > 2:
                (seconds, dot, fraction) = time_parts[2].partition(".")
                ss = int(seconds[:2])
                if fraction:
                    # Scale any number of fractional digits to microseconds:
                    # ".5" -> 500000, ".123" -> 123000, extra digits are cut.
                    ms = int((fraction + "00000")[:6])
    (dh, dm) = (int(zone[1:3]), int(zone[3:]))
    if zone[0] == "-":
        # The sign applies to the whole offset, minutes included.
        (dh, dm) = (-dh, -dm)
    return (datetime(y, m, d, hh, mm, ss, ms), timedelta(minutes = dm, hours = dh))


def create_url_opener():
    """ Opener is for opening urls.

    Opener tries to use either microb (on N800) or firefox (on PC) cookies.
    Cookie support is best effort: when no cookies.txt is found, or the file
    cannot be read or is not in Mozilla format, a plain opener is returned.
    """
    # expanduser() honours $HOME when it is set and still yields a usable
    # directory when it is not, where os.environ["HOME"] raised KeyError.
    home = os.path.expanduser("~")
    filename = os.path.join(home, ".mozilla/microb/cookies.txt")
    if not os.path.exists(filename):
        files = glob.glob(os.path.join(home, ".mozilla/firefox/*default/cookies.txt"))
        if files:
            filename = files[0]
        else:
            filename = None
    if filename:
        cj = cookielib.MozillaCookieJar()
        try:
            cj.load(filename)
        except (IOError, cookielib.LoadError):
            # This runs at import time (see urlopener below); a broken cookie
            # file must not make the whole module unusable.
            return urllib2.build_opener()
        return urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    return urllib2.build_opener()


# Shared opener used by the download helpers of this module.
urlopener = create_url_opener()


def quote_url_path(url):
    """ Percent-quotes the path component of url.

    "/" and "%" are kept as they are, so path separators and already quoted
    sequences survive; scheme, host, query and fragment are not touched.
    """
    (scheme, netloc, path, query, fragment) = urlsplit(url)
    quoted_path = urllib.quote(path, "/%")
    return urlunsplit((scheme, netloc, quoted_path, query, fragment))


def urljoin_rfc1808(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    RFC 1808 rules are applied here: a reference with an empty path keeps the
    base path and takes params, query and fragment from the reference.
    NOTE(review): the original comment said this "superceeds 2369", which is
    presumably a typo for RFC 2396 - confirm.

    base - URL the reference is resolved against; when empty, url is returned.
    url - possibly relative reference; when empty, base is returned.
    allow_fragments - passed through to urlparse().
    """

    from urlparse import uses_relative
    from urlparse import uses_netloc

    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The reference inherits the base scheme when it has none of its own.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one without relative forms: url is already final.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # A reference that names a host is absolute apart from the scheme.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # Absolute path: only scheme and host come from the base.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Empty path: keep the base path, take the rest from the reference.
    if not path:
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path with it.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing "." becomes an empty segment (keeps the trailing slash);
    # every other "." segment is dropped.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Collapse "<name>/.." pairs one at a time, rescanning after each removal;
    # the first and the last segment are never examined by the scan.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Handle a ".." left in the last position.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))


# The module resolves references through this name; the commented-out import
# below is the standard library alternative that was replaced.
urljoin = urljoin_rfc1808
#from urlparse import urljoin


def get_or_create_tag(html, list, tag = None):
    """ Walks a chain of nested tag names, creating the missing ones.

    html - document the new tags are created for.
    list - tag names from the outermost to the innermost, e.g. ["head", "title"].
    tag - node to start from; the document itself when omitted.
    Returns the tag found or created for the last name.
    """
    names = list
    node = tag
    while True:
        if not node:
            node = html
        child = node.find(names[0])
        if not child:
            child = Tag(html, names[0])
            node.append(child)
        if len(names) <= 1:
            return child
        # Descend one level and continue with the remaining names.
        names = names[1:]
        node = child


def get_feedcircuit_root(html):
    """ Returns the node that content is appended to.

    That is the div with id "feedcircuit-root" when the document has one,
    otherwise the body tag (created when missing).
    """
    container = html.find("div", {"id": "feedcircuit-root"})
    if container:
        return container
    return get_or_create_tag(html, ["body"])


def tag_contains(tag, another):
    """ Tells whether another is tag itself or one of its descendants.

    The check follows the parent links upwards from another and compares
    nodes by identity. Returns False when either argument is empty.
    """
    if not tag:
        return False
    node = another
    while node:
        if node is tag:
            return True
        node = node.parent
    return False


def get_page_encoding(page):
    """ Not all html documents has http-equiv meta, so this retrieves real http headers.

    page - response object whose info() returns the http headers.
    Returns the charset parameter of the Content-Type header with quotes and
    surrounding blanks removed, or None when the header or the parameter is
    missing or empty. The parameter name is matched case-insensitively, so
    "Charset=UTF-8" is recognised as well.
    """
    content_type = page.info().get("Content-Type")
    if not content_type:
        return None
    for param in content_type.split(";"):
        (name, sep, value) = param.partition("=")
        if sep and name.strip().lower() == "charset":
            charset = value.replace("'", "").replace('"', "").strip()
            # An empty value carries no information; report it as missing.
            return charset or None
    return None


def get_page_base(page, html):
    """ Returns the url that relative references of the document resolve against.

    The href of the first base tag wins; without one the url reported by
    page.geturl() is used.
    """
    base_tag = html.find("base")
    if not base_tag or not base_tag.has_key("href"):
        return page.geturl()
    return base_tag["href"]


def get_body_or_html(html):
    """ Returns the body of the document, or the document itself when it has no body. """
    if not html:
        return html
    body = html.body
    if body:
        return body
    return html


# Case-insensitive pattern for the word "refresh". It is not referenced in
# this part of the file - presumably it is meant for matching
# <meta http-equiv="refresh"> tags; TODO confirm against its users.
refresh_reqex = re.compile("refresh", re.IGNORECASE)


def get_feed_title(xml):
    """ Returns the title of a parsed feed.

    The title is the direct title child of the first channel (RSS) or feed
    (Atom) element; "untitled" is returned when either of them is missing.
    """
    container = xml.find(["channel", "feed"])
    if not container:
        return "untitled"
    title_tag = container.find("title", recursive = False)
    if not title_tag:
        return "untitled"
    return title_tag.string


def get_feed_details(url, icon_url = None, page = None):
    """ Collects title and icon of a feed.

    url - feed url; it is downloaded unless page is given.
    icon_url - icon to use when the feed itself does not name one.
    page - already opened response for url; it is read but not closed here.

    The icon is taken from the first icon, logo or image element of the feed.
    When neither the feed nor icon_url provides one, the html page the feed
    links to is downloaded and searched for a <link rel="...icon...">.

    Returns (url, title, icon file name, icon data); the last two are "" and
    None when no icon was found. Download errors are not caught here.
    """
    if not page:
        # Open outside of try/finally: if open() itself fails there is nothing
        # to close, and calling close() on None used to hide the real error.
        page = urlopener.open(url)
        try:
            xml = parse_xml(page)
        finally:
            page.close()
    else:
        xml = parse_xml(page)
    title = get_feed_title(xml)
    icon_tag = xml.find(["icon", "logo", "image"])
    if icon_tag:
        # RSS <image> wraps the address in a <url> child, Atom <icon>/<logo>
        # carry it as text.
        if icon_tag.url:
            icon_url = icon_tag.url.string
        else:
            icon_url = icon_tag.string
    if not icon_url:
        html_url = None
        html_tag = xml.find(name = "link", attrs = {"rel": "alternate"})
        if html_tag:
            # A link without href is treated as absent instead of raising KeyError.
            html_url = html_tag.get("href")
        else:
            html_tag = xml.find("link")
            if html_tag:
                html_url = html_tag.string
        if html_url:
            html_page = urlopener.open(html_url)
            try:
                html = parse_html(html_page, get_page_encoding(html_page))
                base = get_page_base(html_page, html)
            finally:
                html_page.close()
            # Case-insensitive, like the same lookup in discover_feed().
            icon_tag = html.find(name = "link", attrs = {"rel": re.compile("icon", flags = re.I)})
            if icon_tag and icon_tag.get("href"):
                icon_url = urljoin(base, icon_tag["href"])
    icon_filename = ""
    icon_data = None
    if icon_url:
        icon_filename = urlparse(icon_url)[2].split("/")[-1]
        icon = urlopener.open(icon_url)
        try:
            icon_data = icon.read()
        finally:
            icon.close()
    return (url, title, icon_filename, icon_data)


def discover_feed(url):
    """ Returns RSS or Atom feed details for a url entered by the user.

    The url may point either to a feed or to an html page. For an html page
    the icon link and the first alternate link of type application/rss+xml
    (or, failing that, application/atom+xml) are looked up and the details of
    that feed are returned. In every other case url itself is treated as the
    feed. The result is the tuple produced by get_feed_details().
    """
    page = urlopener.open(url)
    try:
        icon_url = None
        headers = page.info()
        content_type = None
        if headers.has_key("Content-Type"):
            content_type = headers["Content-Type"]
        if content_type and "html" in content_type:
            html = parse_html(page, get_page_encoding(page))
            base = get_page_base(page, html)
            icon_tag = html.find(name = "link", attrs = {"rel": re.compile("icon", flags = re.I)})
            if icon_tag:
                icon_url = urljoin(base, icon_tag["href"])
            # RSS is preferred over Atom when the page offers both.
            feed_tag = None
            for mime in ("application/rss+xml", "application/atom+xml"):
                feed_tag = html.find(name = "link", attrs = {"rel": "alternate", "type": mime})
                if feed_tag:
                    break
            if feed_tag:
                return get_feed_details(urljoin(base, feed_tag["href"]), icon_url)
        # NOTE(review): for an html page without a feed link the response has
        # already been read by parse_html() above - confirm that passing it on
        # is intended.
        return get_feed_details(url, icon_url, page)
    finally:
        page.close()


class PageDownloader(Thread):
    """ Downloads one page in a background thread.

    The thread is a daemon thread. download() starts it and polls it from the
    calling thread, invoking the callback between polls so the caller can
    report progress or keep a UI alive.
    """


    def __init__(self, callback, url, **kwargs):
        """ callback - called without arguments about every 0.25 s while the
        download is running; may be None.
        url - page address; its path is quoted before the request is made.
        kwargs - passed to the urllib2.Request constructor (e.g. headers).
        """
        Thread.__init__(self)
        self.daemon = True
        self.__callback = callback
        self.__url = url
        self.__kwargs = kwargs
        self.__result = None
        self.__error = None


    def __fetch(self):
        """ Performs the request and returns (response, decoded body). """
        request_kwargs = dict(self.__kwargs)
        # Work on a copy so the headers dict supplied by the caller is not modified.
        headers = dict(request_kwargs.get("headers") or {})
        headers["Accept-Encoding"] = "gzip"
        request_kwargs["headers"] = headers
        page = urlopener.open(urllib2.Request(url=quote_url_path(self.__url), **request_kwargs))
        try:
            content = page.read()
        except Exception:
            # Nobody else gets hold of the response when reading fails.
            page.close()
            raise
        data = content
        if page.info().get("Content-Encoding") == "gzip":
            sio = StringIO.StringIO(content)
            try:
                gzf = gzip.GzipFile(fileobj = sio)
                try:
                    data = gzf.read()
                finally:
                    gzf.close()
            finally:
                sio.close()
        return (page, data)


    def run(self):
        """ Thread body: stores either the result or the error for download(). """
        try:
            self.__result = self.__fetch()
        except Exception:
            # An exception raised here would only be printed by the threading
            # machinery; keep it so download() can raise it in the caller.
            self.__error = sys.exc_info()[1]


    def download(self):
        """ Runs the download and waits for it.

        Returns (response, body); the response is still open and has to be
        closed by the caller. An error that occurred in the thread is raised
        here instead of returning None.
        """
        self.start()
        while self.isAlive():
            if self.__callback:
                self.__callback()
            self.join(.25)
        if self.__error is not None:
            raise self.__error
        return self.__result


class DownloadQueue:
    """ This class helps managing multithreading downloads. """

    def __init__(self, callback):
        self.__threads = []
        self.__callback = callback
        self.__cached = {}


    def queue(self, url, abs_path, rel_path):
        if self.__cached.has_key(url):
            return self.__cached[url]
        filename = uuid.uuid1().hex + urllib.unquote(urlparse(url)[2].split("/")[-1])
        path_for_url = os.path.join(rel_path, filename)
        self.__cached[url] = path_for_url
        thread = Thread(target = self.__retrieve, args = (quote_url_path(url), os.path.join(abs_path, filename)))
        thread.daemon = True
        self.__threads.append(thread)
        thread.start()
        return path_for_url


    def __retrieve(self, url, filename):
        try:
            urllib.urlretrieve(url, filename)
        except Exception, ex:
            if self.__callback:
                self.__callback(exception = ex)


    def wait_completion(self):
        for thread in self.__threads:
            while thread.isAlive():
                if self.__callback:
                    self.__callback()
                thread.join(.25)


class Item:
    """ This class groups feed element related operations together.

    This is really not related to OOP best practices. It simply handy
    to have all this fields accessible via self.
    """

    def __init__(self, url = None, tag = None, base = None, path = "", cache_path = "", callback = None,
        print_version_keywords = print_version_keywords_default,
        single_page_keywords = single_page_keywords_default,
        next_page_keywords = next_page_keywords_default,
        page_template = None, allow_scripts = False, content_kwargs = None,
        reformatter_url = None, download_queue = None, ignore_images = False):
        """ Stores the settings; nothing is downloaded here.

        Item is represented either by its content, when tag is specified, or by
        reference (url is used in this case).
        base - html document base used to resolve relative references.
        path - base directory used to store cached item.
        cache_path - subdirectory of path used to cache related resources (such as images).
        callback - for reporting progress; it receives one tuple
        (action, item, None, details).
        print_version_keywords - a list of phrases to scan html document searching
        for reference to printable version.
        single_page_keywords - a list of names of links which leads to the single page version.
        next_page_keywords - a list of names of links which leads to the next page
        of document if any.
        page_template - used for creating cached items; page_template_default when empty.
        allow_scripts - do not remove script tags from item before caching.
        content_kwargs - it is passed to BeautifulSoup's find method. It is usually
        tag name and its attributes.
        reformatter_url - format string (%s is replaced with url from item). Used to
        process html document through online services such as Skweezer.
        download_queue - queue used to download related resources; a private
        one is created when omitted.
        ignore_images - strip images from the item.
        """
        # Where the content comes from and where it is stored.
        self.__url = url
        self.__tag = tag
        self.__base = base
        self.__path = path
        self.__cache_path = cache_path
        self.__page_template = page_template or page_template_default
        # Captions of the links worth following.
        self.__print_version_keywords = print_version_keywords
        self.__single_page_keywords = single_page_keywords
        self.__next_page_keywords = next_page_keywords
        self.__callback = callback
        # (tag, attribute) pairs: references made absolute / downloaded into the cache.
        self.__attrs_for_fix = [("a", "href"), ("iframe", "src"), ("form", "action")]
        self.__attrs_for_cache = [("script", "src"), ("img", "src"), ("input", "src")]
        # Tags dropped from the content, depending on the options.
        removal = ["head", "object", "applet", "embed"]
        self.__allow_scripts = allow_scripts
        if not allow_scripts:
            removal.append("script")
        self.__ignore_images = ignore_images
        if ignore_images:
            removal.append("img")
        self.__tags_for_removal = removal
        self.title = "untitled"
        self.__content_kwargs = content_kwargs
        self.__reformatter_url = reformatter_url
        self.__next_page_a = None
        self.__download_queue = download_queue or DownloadQueue(self.__report)


    def __report(self, action = None, item = None, exception = None):
        """ Passes a progress or error notification to the callback.

        The callback gets one tuple (action, item, None, details). For an
        exception action is "error", item is the exception and details is the
        formatted traceback of the exception being handled; otherwise details
        is None. Nothing happens when no callback was given.
        """
        details = None
        if exception:
            (action, item) = ("error", exception)
            details = ''.join(traceback.format_tb(sys.exc_info()[2]))
        if not self.__callback:
            return
        self.__callback((action, item, None, details))


    def __calc_the_tag(self, tag):
        """ Searches for tag that contains useful content such as article, news etc.

        Here is the primary FeedCircuit heuristic, sorta "know how".

        The score of a tag is the length of its own (non comment) text plus
        half of the summed scores of its children; a and script children are
        not counted. A tag can only win when its children contributed to its
        score.
        Returns (score of tag, best score in the subtree, tag with that score).
        """
        own_text = sum(len(chunk.strip()) for chunk in
            tag.findAll(text = lambda text: not isinstance(text, Comment), recursive = False))
        nested = 0
        best_score = 0
        best_node = None
        for child in tag.findAll(lambda t: t.name not in ["a", "script"], recursive = False):
            (child_score, child_best, child_node) = self.__calc_the_tag(child)
            nested += child_score
            if child_best > best_score:
                best_score = child_best
                best_node = child_node
        score = own_text + nested / 2
        if nested and score > best_score:
            best_score = score
            best_node = tag
        return (score, best_score, best_node)


    def __find_the_tag(self, tag):
        """ Returns the best scored tag below tag, or None when there is none. """
        (score, best_score, node) = self.__calc_the_tag(tag)
        return node


    def __get_cache_directory(self):
        """ Returns the directory for cached resources, creating it when needed. """
        directory = os.path.join(self.__path, self.__cache_path)
        if os.path.exists(directory):
            return directory
        os.makedirs(directory)
        return directory


    def __get_page_html(self, url, **kwargs):
        """ Retrieves and parses html page.

        kwargs are passed to Request object constructor.
        "downloading" and "parsing" are reported to the callback.
        Returns (parsed document, base for relative references, final url
        as reported by the response).
        """
        self.__report("downloading", url)
        (page, data) = PageDownloader(self.__report, url, **kwargs).download()
        try:
            self.__report("parsing", url)
            encoding = get_page_encoding(page)
            html = parse_html(data, encoding)
            result = (html, get_page_base(page, html), page.geturl())
        finally:
            page.close()
        return result


    def __cache_next_page(self, html, base):
        try:
            text = html.find(text=lambda t: t.strip().lower() in self.__next_page_keywords)
            if text:
                a = text.findParent(lambda t: t.name == "a" and t.has_key("href"))
                if a:
                    url = urljoin(base, a["href"])
                    item = Item(url = url, path = self.__path, cache_path = self.__cache_path,
                        callback = self.__callback, page_template = self.__page_template,
                        allow_scripts = self.__allow_scripts, next_page_keywords = self.__next_page_keywords,
                        content_kwargs = self.__content_kwargs)
                    item.process()
                    a["href"] = item.save()
                    return a
        except Exception, ex:
            self.__report(exception = ex)


    def __find_single_page(self, html, base, referer):
        try:
            text = html.find(text=lambda t: t.strip().lower() in self.__single_page_keywords)
            if text:
                a = text.findParent(lambda t: t.name == "a" and t.has_key("href"))
                if a:
                    url = urljoin(base, a["href"])
                    (new_html, new_base, real_url) = self.__get_page_html(url, headers={"Referer": referer})
                    return (new_html, new_base, True)
        except Exception, ex:
            self.__report(exception = ex)
        return (html, base, False)


    def __find_print_version(self, html, base, referer):
        try:
            text = html.find(text=lambda t: t.strip().lower() in self.__print_version_keywords)
            if text:
                a = text.findParent(lambda t: t.name == "a" and t.has_key("href"))
                if a:
                    url = urljoin(base, a["href"])
                    (new_html, new_base, real_url) = self.__get_page_html(url, headers={"Referer": referer})
                    return (new_html, new_base, get_body_or_html(new_html))
        except Exception, ex:
            self.__report(exception = ex)
        return (html, base, None)


    def get_url_for_reformatter(self, url):
        """ Returns the address to download for url.

        With a reformatter configured that is the reformatter format string
        filled with the quoted url ("%" is left unquoted); otherwise url itself.
        """
        if not self.__reformatter_url:
            return url
        return self.__reformatter_url % urllib.quote(url, "%")


    def __extract_tag(self):
        """ This performs main Item's job.

        By extracting html document and reformatting it:
        1. the page is downloaded (through the reformatter when configured);
        2. a printable version is preferred when the page links to one;
        3. otherwise a single page version is tried, and failing that the
           next page is cached;
        4. the content tag is chosen: the tag matching content_kwargs, then
           the heuristic choice, then the whole body. With a reformatter the
           whole body is used unless a printable version was found.
        The document title, when it has a non blank text, becomes self.title.
        """
        (html, self.__base, real_url) = self.__get_page_html(self.get_url_for_reformatter(self.__url))
        if self.__print_version_keywords:
            (html, self.__base, self.__tag) = self.__find_print_version(html, self.__base, real_url)
        sp = False
        if not self.__tag and self.__single_page_keywords:
            (html, self.__base, sp) = self.__find_single_page(html, self.__base, real_url)
        if not sp and not self.__tag and self.__next_page_keywords:
            self.__next_page_a = self.__cache_next_page(html, self.__base)
        body = get_body_or_html(html)
        if not self.__reformatter_url:
            if self.__content_kwargs:
                self.__tag = body.find(**self.__content_kwargs)
            if not self.__tag:
                self.__tag = self.__find_the_tag(body)
        if not self.__tag:
            self.__tag = body
        title = html.find("title")
        # title.string is None for an empty title or one with nested markup;
        # calling strip() on it used to abort the whole item.
        if title and title.string:
            text = title.string.strip()
            if text:
                self.title = text


    def __fix_url(self, tag, attr, base):
        """ Makes the reference in attribute attr of tag absolute; tags without it are skipped. """
        if not tag.has_key(attr):
            return
        tag[attr] = urljoin(base, tag[attr])


    def __fix_urls(self, tags, attr, base):
        """ Makes the references of all tags absolute.

        The href of the cached next page link is restored afterwards: it
        already points to the saved file and must not be rewritten.
        """
        next_link = self.__next_page_a
        saved_href = None
        if next_link:
            saved_href = next_link["href"]
        for tag in tags:
            self.__fix_url(tag, attr, base)
        if next_link:
            next_link["href"] = saved_href


    def __cache_url(self, tag, attr, base):
        if tag.has_key(attr):
            try:
                url = urljoin(base, tag[attr])
                tag[attr] = self.__download_queue.queue(url, 
                    self.__get_cache_directory(), self.__cache_path)
            except Exception, ex:
                self.__report(exception = ex)


    def __cache_urls(self, tags, attr, base):
        """ Queues the resources referenced by all tags for download. """
        for tag in tags:
            self.__cache_url(tag, attr, base)


    def __fixup_tag(self):
        """ Collection of misc postprocessing tasks.

        1. Removing scripts (and the other unwanted tags).
        2. Caching resources and changing references to them.
        3. Keeping consistency by preventing extracting tags from context,
        e.g. td should always be nested into tr etc.

        Returns the list of nodes that make up the item: the tag itself for
        tables and lists, the enclosing table for table parts, the children of
        the tag otherwise. The cached next page link is appended when it is
        not part of the content already.
        """
        for unwanted in self.__tag.findAll(self.__tags_for_removal):
            unwanted.extract()
        for (name, attr) in self.__attrs_for_fix:
            self.__fix_urls(self.__tag.findAll(name), attr, self.__base)
        for (name, attr) in self.__attrs_for_cache:
            self.__cache_urls(self.__tag.findAll(name), attr, self.__base)
        if self.__tag.name in ["table", "dir", "menu", "dl", "ul", "ol"]:
            content = [self.__tag]
        elif self.__tag.name in ["tr", "caption", "colgroup", "thead", "tfoot", "tbody"]:
            table = self.__tag.findParent("table")
            if table is None:
                # Table part without a table around it: keep the part itself
                # rather than putting None into the content.
                table = self.__tag
            content = [table]
        else:
            content = list(self.__tag.contents)
        if self.__next_page_a and not tag_contains(self.__tag, self.__next_page_a):
            self.__next_page_a.extract()
            content.append(self.__next_page_a)
        return content


    def save(self, title = None, filename = None):
        """ Saves item into cache.

        The processed content (see process()) is placed into a copy of the
        page template and written into the item directory.
        title - page title; the title of the item when omitted.
        filename - name of the file to write; a unique <uuid>.html when omitted.
        Returns the name of the written file (without directory).
        """
        html = parse_html(self.__page_template)
        if not title:
            title = self.title
        get_or_create_tag(html, ["head", "title"]).append(NavigableString(title))
        root = get_feedcircuit_root(html)
        for tag in self.__contents:
            root.append(tag)
        if not filename:
            filename = uuid.uuid1().hex + ".html"
        if not os.path.exists(self.__path):
            os.makedirs(self.__path)
        f = open(os.path.join(self.__path, filename), "w")
        try:
            f.write(html.prettify())
        finally:
            # Close the file even when rendering or writing fails.
            f.close()
        return filename


    def process(self):
        """ Entry point.

        Downloads the document when the item was given by url, then
        postprocesses the content tag. Returns the list of content nodes,
        which is also what save() writes.
        """
        self.__contents = []
        if self.__url:
            self.__extract_tag()
        contents = self.__fixup_tag()
        self.__contents = contents
        return contents


class Feed:
    """ Collection of feed related operations. """
    
    def __init__(self, url, path = "", cache_path = "", callback = None, title = None, filename = None,
        feed_template = None, feed_item_template = None, feed_update_template = None, page_template = None,
        print_version_keywords = print_version_keywords_default,
        single_page_keywords = single_page_keywords_default,
        next_page_keywords = next_page_keywords_default,
        delete_before_date = None, allow_scripts = False, inline_items = False, cache_items = False,
        include = None, exclude = None, content_kwargs = None, reformatter_url = None,
        new_window = False, always_clear_cache = False, ignore_images = False):
        """ Constructor.

        url - feed url.
        path - base directory used to store the feed file and cached items.
        cache_path - subdirectory of path used to cache related resources (such as images).
        callback - called with a (action, item, percent, details) tuple to report progress.
        title - feed title, it is tried to figure out automatically if not provided.
        filename - name of the feed file, title.html is used if not provided.
        feed_template - html used to create the feed file when it does not exist yet,
        defaults to page_template_default.
        feed_item_template - html used to construct feed item,
        defaults to feed_item_template_default.
        feed_update_template - html of the header of every update, %title and %timestamp
        are substituted in it, defaults to feed_update_template_default.
        page_template - html passed to items as their page template,
        defaults to page_template_default.
        print_version_keywords - a list of phrases to scan html document searching for
        reference to printable version.
        single_page_keywords - a list of names of links which leads to the single page version.
        next_page_keywords - a list of names of links which leads to the next page
        of document if any.
        delete_before_date - all items before this date are deleted during feed update.
        allow_scripts - do not remove script tags from item before caching.
        inline_items - inline referenced resources into feed html.
        cache_items - download and cache referenced resources.
        include - regexp for item urls to include.
        exclude - regexp for item urls to exclude.
        content_kwargs - it is passed to BeautifulSoup's find method. It is usually
        tag name and its attributes.
        reformatter_url - format string (%s is replaced with url from item).
        Used to process html document through online services such as Skweezer.
        new_window - True if you want all links in the feed to open new browser window.
        always_clear_cache - True to delete all previous updates from an existing
        feed file on every update.
        ignore_images - True if you want to strip images from the feed (not articles).
        """
        # Where the feed comes from and where it is stored.
        self.__url = url
        self.__path = path
        self.__cache_path = cache_path
        self.__callback = callback
        self.__title = title
        self.__filename = filename
        # Templates fall back to module defaults when empty or omitted.
        self.__feed_template = feed_template or page_template_default
        self.__feed_item_template = feed_item_template or feed_item_template_default
        self.__page_template = page_template or page_template_default
        self.__feed_update_template = feed_update_template or feed_update_template_default
        # Options handed over to every Item.
        self.__print_version_keywords = print_version_keywords
        self.__single_page_keywords = single_page_keywords
        self.__next_page_keywords = next_page_keywords
        self.__inline_items = inline_items
        self.__cache_items = cache_items
        self.__delete_before_date = delete_before_date
        self.__allow_scripts = allow_scripts
        self.__ignore_images = ignore_images
        # Url filters are compiled once, None means no filtering.
        self.__include = re.compile(include) if include else None
        self.__exclude = re.compile(exclude) if exclude else None
        self.__content_kwargs = content_kwargs
        self.__reformatter_url = reformatter_url
        self.__new_window = new_window
        self.__always_clear_cache = always_clear_cache
        self.__download_queue = DownloadQueue(self.__report)
        self.new_item_count = 0


    def __report(self, action = None, item = None, percent = None, exception = None):
        details = None
        if exception:
            action = "error"
            item = exception
            details = ''.join(traceback.format_tb(sys.exc_info()[2]))
        if self.__callback:
            self.__callback((action, item, percent, details))


    def __unquote_special(self, str):
        if str:
            return str.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", '"').replace("&amp;", "&")
        return str


    def __get_item_title(self, item):
        if item.title.string:
            return self.__unquote_special(item.title.string.strip())
        for node in item.title:
            if node.string:
                str = node.string.strip()
                if str:
                    return str
        return ""


    def __get_item_link(self, item):
        if item.guid:
            if not item.guid.has_key("ispermalink") or item.guid["ispermalink"].lower() == "true":
                return item.guid.string
        if item.link:
            if item.link.has_key("href"):
                return item.link["href"] #Atom uses href to store links
            elif item.link.string:
                return item.link.string
            else:
                # Once there was a feed with url in CData section
                for node in item.link:
                    if node.string:
                        str = node.string.strip()
                        if str:
                            return str


    def __get_item_date(self, item):
        tag = item.find(["pubdate", "published"])
        if tag and tag.string:
            try:
                (date, zone) = parse_date(tag.string)
                return date
            except Exception:
                self.__report("warning", "failed to parse " + tag.string)
        return None


    def __get_item_description(self, item):
        """ Returns description of the feed item as parsed html.

        The text is taken from description or content tag. It is unquoted
        unless it is a CData section. The result of parse_html() without
        arguments is returned when there is no text.
        """
        desc = item.find(["description", "content"])
        text = None
        if desc:
            text = desc.string
        if not text:
            return parse_html()
        if not isinstance(text, CData):
            text = self.__unquote_special(text)
        return parse_html_fragment(text)


    def __item_exists(self, tag, link):
        """ Tells if the item comment for the link is among the existing ones.

        tag is not used.
        """
        marker = item_comment_prefix + link
        return marker in self.__existing_items


    def __outdated(self, item):
        date = self.__get_item_date(item)
        if date and self.__delete_before_date:
            return date.date() < self.__delete_before_date
        return False


    def __match_link(self, link):
        """ Filter out link using include and exclude patterns. """
        if self.__include and not self.__include.search(link):
            return False
        if self.__exclude and self.__exclude.search(link):
            return False
        return True


    def __create_feed_item(self, url = None, tag = None, base = None, path = None,
        cache_path = None, ignore_images = False):
        """ This is just not to type all this code when you need new Item instance.

        url, tag, base and ignore_images are passed to Item as is.
        path and cache_path default to the ones of the feed when they are
        None (empty string is a valid value and is kept).
        Everything else is taken from the feed settings.
        """
        if path is None:
            path = self.__path
        if cache_path is None:
            cache_path = self.__cache_path
        return Item(url = url, tag = tag, base = base, path = path, cache_path = cache_path,
            callback = self.__callback,
            print_version_keywords = self.__print_version_keywords,
            single_page_keywords = self.__single_page_keywords,
            next_page_keywords = self.__next_page_keywords,
            allow_scripts = self.__allow_scripts, content_kwargs = self.__content_kwargs,
            reformatter_url = self.__reformatter_url,
            download_queue = self.__download_queue, page_template = self.__page_template,
            ignore_images = ignore_images)
        

    def __get_update_html(self, html):
        """ Creates div tag which holds all items of a single feed update.

        The div gets feedcircuit-update class and an id made of the current
        time (YYYYmmddHHMMSS, a dot and microseconds). Its content is the
        feed update template with %title replaced by the feed title and
        %timestamp replaced by the same time in locale's format.
        html is the document the div is created for.
        """
        # Single reading of the clock, so that the id and the displayed
        # timestamp always refer to the same moment.
        now = datetime.today()
        div = Tag(html, "div")
        div["class"] = "feedcircuit-update"
        div["id"] = now.strftime("%Y%m%d%H%M%S") + "." + str(now.microsecond)
        timestamp = unicode(now.strftime("%c").decode(locale.getpreferredencoding()))
        template = self.__feed_update_template.replace("%title", self.__title.strip())
        template = template.replace("%timestamp", timestamp)
        div.append(parse_html_fragment(template))
        return div


    def __get_item_html(self):
        """ Builds html of a feed item out of the feed item template.

        Returns (item, link, body) tuple: the wrapping div of
        feedcircuit-item class, the a tag for the item link and the div
        for the item body. For both of the latter the tag marked with the
        corresponding class is used; if there is none, the first tag of
        that kind gets the class; if there is none either, a new tag is
        created (the link is put first, the body is put last).
        """
        fragment = parse_html_fragment(self.__feed_item_template)
        wrapper = Tag(fragment, "div")
        wrapper["class"] = "feedcircuit-item"
        wrapper.append(fragment)

        def ensure(name, css_class, attach):
            # Find the tag by class, then by name, then create it.
            found = wrapper.find(name, {"class": css_class})
            if found:
                return found
            found = wrapper.find(name)
            if not found:
                found = Tag(fragment, name)
                attach(found)
            found["class"] = css_class
            return found

        link = ensure("a", "feedcircuit-item-link", lambda node: wrapper.insert(0, node))
        body = ensure("div", "feedcircuit-item-body", wrapper.append)
        return (wrapper, link, body)


    def __add_item(self, div, item, percent):
        """ Here Item is get included to feed after executing all dependant operations.

        div - update div the item is appended to, its id (if any) names
        the subdirectory of the cache used for the item.
        item - feed item (xml tag).
        percent - progress value passed to the callback.

        With inline_items the body of the item is the processed document
        the link points to. Otherwise the body is the processed
        description from the feed and, with cache_items, the document is
        processed and saved too and the link is redirected to the saved file.
        A comment with the original link is appended before the item,
        it is what marks the item as existing on later updates.
        """
        link = self.__get_item_link(item)
        title = self.__get_item_title(item)
        self.__report("processing", title, percent)
        cache_path = self.__cache_path
        if div.has_key("id"):
            cache_path = os.path.join(cache_path, div["id"])
        (item_html, item_a, item_div) = self.__get_item_html()
        item_a["href"] = link
        item_a.append(NavigableString(title))
        inline = self.__inline_items
        if inline:
            body_item = self.__create_feed_item(url = link,
                ignore_images = self.__ignore_images, cache_path = cache_path)
        else:
            body_item = self.__create_feed_item(tag = self.__get_item_description(item),
                base = self._real_url, ignore_images = self.__ignore_images,
                cache_path = cache_path)
        for node in body_item.process():
            item_div.append(node)
        if not inline and self.__cache_items:
            page_dir = os.path.join(self.__path, cache_path)
            page_item = self.__create_feed_item(url = link, path = page_dir, cache_path = "")
            page_item.process()
            item_a["href"] = os.path.join(cache_path, page_item.save(title))
            # NOTE(review): new_window has effect for cached items only, while
            # the constructor docstring speaks of all links - confirm intent.
            if self.__new_window:
                item_a["target"] = "_blank"
        div.append(Comment(NavigableString(item_comment_prefix + link)))
        div.append(item_html)


    def __delete_div(self, div):
        if div.has_key("id"):
            div_path = os.path.join(os.path.join(self.__path, self.__cache_path), div["id"])
            if os.path.exists(div_path):
                shutil.rmtree(div_path, True)
        div.extract()


    def __delete_all_divs(self, html):
        """ Deletes every div placed directly in the feedcircuit root of html. """
        root = get_feedcircuit_root(html)
        updates = root.findAll(name = "div", recursive = False)
        for update_div in updates:
            self.__delete_div(update_div)


    def __delete_before(self, html):
        if self.__delete_before_date:
            divs = filter(lambda div:
                div.has_key("id") and datetime.strptime(div["id"][:14], "%Y%m%d%H%M%S").date() < self.__delete_before_date,
                get_feedcircuit_root(html).findAll("div", recursive = False))
            [self.__delete_div(div) for div in divs]


    def __save_html(self, path, html):
        f = open(path, "w")
        f.write(html.prettify())
        f.close()


    def update(self):
        """ Updates feed and returns numbers of new items.

        When there are now new items feed file stays untouched.
        Outdated items are deleted only when there are some new ones.
        """
        self.new_item_count = 0
        div = None
        try:
            self.__report("downloading", self.__url, 1)
            downloader = PageDownloader(self.__report, self.__url)
            (page, data) = downloader.download()
            try:
                self._real_url = page.geturl()
                self.__report("parsing", self.__url, 10)
                xml = parse_xml(data)
            finally:
                page.close()
            if not self.__title:
                self.__title = get_feed_title(xml)
            path = os.path.join(self.__path, self.__title + ".html")
            if self.__filename:
                path = os.path.join(self.__path, self.__filename)
            if os.path.exists(path):
                f = open(path, "r")
                html = parse_html(f)
                if self.__always_clear_cache:
                    self.__report("deleting", "", 15)
                    self.__delete_all_divs(html)
                f.close()
            else:
                html = parse_html(self.__feed_template)
                get_or_create_tag(html, ["head", "title"]).append(NavigableString(self.__title))
            self.__existing_items = html.findAll(text = lambda text:
                isinstance(text, Comment) and text.startswith(item_comment_prefix))
            div = self.__get_update_html(html)
            get_feedcircuit_root(html).insert(0, div)
            to_process = []
            for item in xml.findAll(["item", "entry"]):
                link = self.__get_item_link(item)
                if self.__match_link(link) and not self.__outdated(item) \
                    and not self.__item_exists(div.parent, link):
                    to_process.append(item)
            count = len(to_process)
            if count:
                self.__save_html(path, html)
            for i in range(count):
                try:
                    self.__add_item(div, to_process[i], 20 + (i * 60 / count))
                    self.new_item_count += 1
                except Exception, ex:
                    self.__report(exception = ex)
            self.__report("downloading images", "", 90)
            self.__download_queue.wait_completion()
        except BaseException, ex:
            self.__report(exception = ex)
            raise
        finally:
            if self.new_item_count:
                if not self.__always_clear_cache:
                    self.__report("deleting", "", 80)
                    self.__delete_before(html)
                self.__save_html(path, html)


    def delete(self):
        """ Deletes everything related to feed from cache. """
        if self.__filename:
            path = os.path.join(self.__path, self.__filename)
        else:
            path = os.path.join(self.__path, self.__title + ".html")
        if os.path.exists(path):
            f = open(path, "r")
            html = parse_html(f)
            f.close()
            self.__delete_all_divs(html)
            os.remove(path)


    def test(self):
        start = datetime.today()
        for i in range(1000):
            self.__get_item_html()
        print "self.__get_item_html() 1000 times", datetime.today() - start
