first commit, builds already

2010-04-28 02:48:29 +02:00 · 2010-04-28 02:48:29 +02:00 · 87271ebae9
commit 87271ebae9
5 changed files with 2252 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+out/
+*.pyc
--- a/PyRSS2Gen.py
+++ b/PyRSS2Gen.py
@ -0,0 +1,440 @@
+"""PyRSS2Gen - A Python library for generating RSS 2.0 feeds."""
+
+# NOTE(review): rebinding __name__ replaces the module's real import name for
+# everything below in this file -- confirm this is intended.
+__name__ = "PyRSS2Gen"
+__version__ = (1, 0, 0)
+__author__ = "Andrew Dalke <dalke@dalkescientific.com>"
+
+# "<name>-<dotted version>"; used as the default "generator" value of RSS2.
+_generator_name = __name__ + "-" + ".".join(map(str, __version__))
+
+import datetime
+
+# Could make this the base class; will need to add 'publish'
+class WriteXmlMixin:
+    def write_xml(self, outfile, encoding = "iso-8859-1"):
+        from xml.sax import saxutils
+        handler = saxutils.XMLGenerator(outfile, encoding)
+        handler.startDocument()
+        self.publish(handler)
+        handler.endDocument()
+
+    def to_xml(self, encoding = "iso-8859-1"):
+        import io
+        f = io.StringIO()
+        self.write_xml(f, encoding)
+        return f.getvalue()
+
+
+def _element(handler, name, obj, d = {}):
+    if isinstance(obj, str) or obj is None:
+        # special-case handling to make the API easier
+        # to use for the common case.
+        handler.startElement(name, d)
+        if obj is not None:
+            handler.characters(obj)
+        handler.endElement(name)
+    else:
+        # It better know how to emit the correct XML.
+        obj.publish(handler)
+
+def _opt_element(handler, name, obj):
+    if obj is None:
+        return
+    _element(handler, name, obj)
+
+
+def _format_date(dt):
+    """convert a datetime into an RFC 822 formatted date
+
+    Input date must be in GMT.
+    """
+    # Looks like:
+    #   Sat, 07 Sep 2002 00:00:01 GMT
+    # Can't use strftime because that's locale dependent
+    #
+    # Isn't there a standard way to do this for Python?  The
+    # rfc822 and email.Utils modules assume a timestamp.  The
+    # following is based on the rfc822 module.
+    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
+            ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()],
+            dt.day,
+            ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
+             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][dt.month-1],
+            dt.year, dt.hour, dt.minute, dt.second)
+
+        
+##
+# A couple simple wrapper objects for the fields which
+# take a simple value other than a string.
+class IntElement:
+    """implements the 'publish' API for integers
+
+    Takes the tag name and the integer value to publish.
+    
+    (Could be used for anything which uses str() to be published
+    to text for XML.)
+    """
+    element_attrs = {}
+    def __init__(self, name, val):
+        self.name = name
+        self.val = val
+    def publish(self, handler):
+        handler.startElement(self.name, self.element_attrs)
+        handler.characters(str(self.val))
+        handler.endElement(self.name)
+
+class DateElement:
+    """implements the 'publish' API for a datetime.datetime
+
+    Takes the tag name and the datetime to publish.
+
+    Converts the datetime to RFC 2822 timestamp (4-digit year).
+    """
+    def __init__(self, name, dt):
+        self.name = name
+        self.dt = dt
+    def publish(self, handler):
+        _element(handler, self.name, _format_date(self.dt))
+####
+
+class Category:
+    """Publish a category element"""
+    def __init__(self, category, domain = None):
+        self.category = category
+        self.domain = domain
+    def publish(self, handler):
+        d = {}
+        if self.domain is not None:
+            d["domain"] = self.domain
+        _element(handler, "category", self.category, d)
+
+class Cloud:
+    """Publish a cloud"""
+    def __init__(self, domain, port, path,
+                 registerProcedure, protocol):
+        self.domain = domain
+        self.port = port
+        self.path = path
+        self.registerProcedure = registerProcedure
+        self.protocol = protocol
+    def publish(self, handler):
+        _element(handler, "cloud", None, {
+            "domain": self.domain,
+            "port": str(self.port),
+            "path": self.path,
+            "registerProcedure": self.registerProcedure,
+            "protocol": self.protocol})
+
+class Image:
+    """Publish a channel Image"""
+    element_attrs = {}
+    def __init__(self, url, title, link,
+                 width = None, height = None, description = None):
+        self.url = url
+        self.title = title
+        self.link = link
+        self.width = width
+        self.height = height
+        self.description = description
+        
+    def publish(self, handler):
+        handler.startElement("image", self.element_attrs)
+
+        _element(handler, "url", self.url)
+        _element(handler, "title", self.title)
+        _element(handler, "link", self.link)
+
+        width = self.width
+        if isinstance(width, int):
+            width = IntElement("width", width)
+        _opt_element(handler, "width", width)
+        
+        height = self.height
+        if isinstance(height, int):
+            height = IntElement("height", height)
+        _opt_element(handler, "height", height)
+
+        _opt_element(handler, "description", self.description)
+
+        handler.endElement("image")
+
+class Guid:
+    """Publish a guid
+
+    Defaults to being a permalink, which is the assumption if it's
+    omitted.  Hence strings are always permalinks.
+    """
+    def __init__(self, guid, isPermaLink = 1):
+        self.guid = guid
+        self.isPermaLink = isPermaLink
+    def publish(self, handler):
+        d = {}
+        if self.isPermaLink:
+            d["isPermaLink"] = "true"
+        else:
+            d["isPermaLink"] = "false"
+        _element(handler, "guid", self.guid, d)
+
+class TextInput:
+    """Publish a textInput
+
+    Apparently this is rarely used.
+    """
+    element_attrs = {}
+    def __init__(self, title, description, name, link):
+        self.title = title
+        self.description = description
+        self.name = name
+        self.link = link
+
+    def publish(self, handler):
+        handler.startElement("textInput", self.element_attrs)
+        _element(handler, "title", self.title)
+        _element(handler, "description", self.description)
+        _element(handler, "name", self.name)
+        _element(handler, "link", self.link)
+        handler.endElement("textInput")
+        
+
+class Enclosure:
+    """Publish an enclosure"""
+    def __init__(self, url, length, type):
+        self.url = url
+        self.length = length
+        self.type = type
+    def publish(self, handler):
+        _element(handler, "enclosure", None,
+                 {"url": self.url,
+                  "length": str(self.length),
+                  "type": self.type,
+                  })
+
+class Source:
+    """Publish the item's original source, used by aggregators"""
+    def __init__(self, name, url):
+        self.name = name
+        self.url = url
+    def publish(self, handler):
+        _element(handler, "source", self.name, {"url": self.url})
+
+class SkipHours:
+    """Publish the skipHours
+
+    This takes a list of hours, as integers.
+    """
+    element_attrs = {}
+    def __init__(self, hours):
+        self.hours = hours
+    def publish(self, handler):
+        if self.hours:
+            handler.startElement("skipHours", self.element_attrs)
+            for hour in self.hours:
+                _element(handler, "hour", str(hour))
+            handler.endElement("skipHours")
+
+class SkipDays:
+    """Publish the skipDays
+
+    This takes a list of days as strings.
+    """
+    element_attrs = {}
+    def __init__(self, days):
+        self.days = days
+    def publish(self, handler):
+        if self.days:
+            handler.startElement("skipDays", self.element_attrs)
+            for day in self.days:
+                _element(handler, "day", day)
+            handler.endElement("skipDays")
+
+class RSS2(WriteXmlMixin):
+    """The main RSS class.
+
+    Stores the channel attributes, with the "category" elements under
+    ".categories" and the RSS items under ".items".
+    """
+    
+    rss_attrs = {"version": "2.0"}
+    element_attrs = {}
+    def __init__(self,
+                 title,
+                 link,
+                 description,
+
+                 language = None,
+                 copyright = None,
+                 managingEditor = None,
+                 webMaster = None,
+                 pubDate = None,  # a datetime, *in* *GMT*
+                 lastBuildDate = None, # a datetime
+                 
+                 categories = None, # list of strings or Category
+                 generator = _generator_name,
+                 docs = "http://blogs.law.harvard.edu/tech/rss",
+                 cloud = None,    # a Cloud
+                 ttl = None,      # integer number of minutes
+
+                 image = None,     # an Image
+                 rating = None,    # a string; I don't know how it's used
+                 textInput = None, # a TextInput
+                 skipHours = None, # a SkipHours with a list of integers
+                 skipDays = None,  # a SkipDays with a list of strings
+
+                 items = None,     # list of RSSItems
+                 ):
+        self.title = title
+        self.link = link
+        self.description = description
+        self.language = language
+        self.copyright = copyright
+        self.managingEditor = managingEditor
+
+        self.webMaster = webMaster
+        self.pubDate = pubDate
+        self.lastBuildDate = lastBuildDate
+        
+        if categories is None:
+            categories = []
+        self.categories = categories
+        self.generator = generator
+        self.docs = docs
+        self.cloud = cloud
+        self.ttl = ttl
+        self.image = image
+        self.rating = rating
+        self.textInput = textInput
+        self.skipHours = skipHours
+        self.skipDays = skipDays
+
+        if items is None:
+            items = []
+        self.items = items
+
+    def publish(self, handler):
+        handler.startElement("rss", self.rss_attrs)
+        handler.startElement("channel", self.element_attrs)
+        _element(handler, "title", self.title)
+        _element(handler, "link", self.link)
+        _element(handler, "description", self.description)
+
+        self.publish_extensions(handler)
+        
+        _opt_element(handler, "language", self.language)
+        _opt_element(handler, "copyright", self.copyright)
+        _opt_element(handler, "managingEditor", self.managingEditor)
+        _opt_element(handler, "webMaster", self.webMaster)
+
+        pubDate = self.pubDate
+        if isinstance(pubDate, datetime.datetime):
+            pubDate = DateElement("pubDate", pubDate)
+        _opt_element(handler, "pubDate", pubDate)
+
+        lastBuildDate = self.lastBuildDate
+        if isinstance(lastBuildDate, datetime.datetime):
+            lastBuildDate = DateElement("lastBuildDate", lastBuildDate)
+        _opt_element(handler, "lastBuildDate", lastBuildDate)
+
+        for category in self.categories:
+            if isinstance(category, str):
+                category = Category(category)
+            category.publish(handler)
+
+        _opt_element(handler, "generator", self.generator)
+        _opt_element(handler, "docs", self.docs)
+
+        if self.cloud is not None:
+            self.cloud.publish(handler)
+
+        ttl = self.ttl
+        if isinstance(self.ttl, int):
+            ttl = IntElement("ttl", ttl)
+        _opt_element(handler, "tt", ttl)
+
+        if self.image is not None:
+            self.image.publish(handler)
+
+        _opt_element(handler, "rating", self.rating)
+        if self.textInput is not None:
+            self.textInput.publish(handler)
+        if self.skipHours is not None:
+            self.skipHours.publish(handler)
+        if self.skipDays is not None:
+            self.skipDays.publish(handler)
+
+        for item in self.items:
+            item.publish(handler)
+
+        handler.endElement("channel")
+        handler.endElement("rss")
+
+    def publish_extensions(self, handler):
+        # Derived classes can hook into this to insert
+        # output after the three required fields.
+        pass
+
+    
+    
+class RSSItem(WriteXmlMixin):
+    """Publish an RSS Item"""
+    element_attrs = {}
+    def __init__(self,
+                 title = None,  # string
+                 link = None,   # url as string
+                 description = None, # string
+                 author = None,      # email address as string
+                 categories = None,  # list of string or Category
+                 comments = None,  # url as string
+                 enclosure = None, # an Enclosure
+                 guid = None,    # a unique string
+                 pubDate = None, # a datetime
+                 source = None,  # a Source
+                 ):
+        
+        if title is None and description is None:
+            raise TypeError(
+                "must define at least one of 'title' or 'description'")
+        self.title = title
+        self.link = link
+        self.description = description
+        self.author = author
+        if categories is None:
+            categories = []
+        self.categories = categories
+        self.comments = comments
+        self.enclosure = enclosure
+        self.guid = guid
+        self.pubDate = pubDate
+        self.source = source
+        # It sure does get tedious typing these names three times...
+
+    def publish(self, handler):
+        handler.startElement("item", self.element_attrs)
+        _opt_element(handler, "title", self.title)
+        _opt_element(handler, "link", self.link)
+        self.publish_extensions(handler)
+        _opt_element(handler, "description", self.description)
+        _opt_element(handler, "author", self.author)
+
+        for category in self.categories:
+            if isinstance(category, str):
+                category = Category(category)
+            category.publish(handler)
+        
+        _opt_element(handler, "comments", self.comments)
+        if self.enclosure is not None:
+            self.enclosure.publish(handler)
+        _opt_element(handler, "guid", self.guid)
+
+        pubDate = self.pubDate
+        if isinstance(pubDate, datetime.datetime):
+            pubDate = DateElement("pubDate", pubDate)
+        _opt_element(handler, "pubDate", pubDate)
+
+        if self.source is not None:
+            self.source.publish(handler)
+        
+        handler.endElement("item")
+
+    def publish_extensions(self, handler):
+        # Derived classes can hook into this to insert
+        # output after the title and link elements
+        pass
--- a/37
+++ b/37
@ -0,0 +1,37 @@
+Ceci n'est pas un lisez-moi.
+
+Just a few short notes on how this stuff works, so I don't forget it over time.
+
+Structure
+---------
+    layout -> templates
+    out -> result, upload this
+    src -> all content files
+    styles -> css files
+
+Each directory in src is a separate (sub-)category. It contains each content
+page as a separate text file (*.pdc) of the following form:
+    % title
+    [content in (pandoc-enhanced) Markdown markup]
+
+Each directory also contains a meta.yaml specifying the necessary metadata, like
+the category name, some layout data and so on. Note that each directory inherits
+all unset data from its parent.
+
+Any additional data, like images, resides in the same dir as its .pdc and is
+just copied along with it. (Everything except *.pdc and meta.yaml is copied.)
+
+Additionally, changelog.pdc is used to build the rss.xml.
+
+Process
+-------
+webifier.py goes through all directories in src, reads meta.yaml, generates all
+necessary metadata out of it and then takes every *.pdc, pipes it through pandoc
+and tidy (adding data if necessary) and finally puts the result into out. 
+
+Also, an index.html is generated for every category.
+
+Finally, changelog.html is parsed, transformed into an RSS feed and put at
+out/rss.xml.
+
+Done!
--- a/clevercss.py
+++ b/clevercss.py
--- a/webifier.py
+++ b/webifier.py
@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+# Copyright muflax <mail@muflax.com>, 2010
+# License: GNU GPL 3 <http://www.gnu.org/copyleft/gpl.html>
+
+import datetime
+import glob
+import hashlib
+import os
+import os.path
+import re
+import subprocess
+
+import PyRSS2Gen as RSS2
+import clevercss
+import yaml
+try:
+    from yaml import CLoader as Loader
+except ImportError:
+    from yaml import Loader
+    
+def _breadcrumbtagify(file, name=None, depth=0):
+    """turn an address and name into a proper link"""
+    if not name:
+        name = file
+    relpath = "../" * depth
+    r = "<a href='{}{}' class='crumb'>{}</a>".format(relpath, file, name)
+    return r
+
+def make_breadcrumb(file, meta):
+    """turn current path into breadcrumb navigation""" 
+    crumbs = []
+    depth = len(meta["cats"])
+    for catfile, cat in meta["cats"]:
+        crumbs.append(_breadcrumbtagify(catfile, cat, depth=depth))
+        depth -= 1
+    
+    crumbs.append(_breadcrumbtagify(os.path.basename(file), "<>"))
+    return " &#187; ".join(crumbs)
+
+def templatify(file, meta, out):
+    """templatify file using meta and save it at out"""
+    print("\ttemplatifying {}...".format(file))
+    dest = os.path.join(out, os.path.basename(file).replace(".pdc", ".html"))
+    breadcrumb = make_breadcrumb(dest, meta)
+    
+    pandoc = ["pandoc",
+              "--template", os.path.join("layout", meta["layout"]),
+              "--css", os.path.join("style", meta["style"]),
+              "--variable", "breadcrumb:{}".format(breadcrumb),
+              "-o", dest,
+              file
+             ]
+    subprocess.call(pandoc)
+    print("\tsaving as {}...".format(dest))
+
+def make_html_files(src, out, meta=None):
+    """turn all *.pdc in src into html files in out"""
+    
+    # we'll have to manually walk this shit...
+    # read the metadata and update the old one   
+    meta = {} if meta == None else meta.copy() 
+    print("reading metadata in {}...".format(src))
+    meta_file = os.path.join(src, "meta.yaml")
+    with open(meta_file, "r") as f:
+        data = yaml.load(f, Loader=Loader)
+        meta.update(data)
+
+    # add breadcrumb information to metadata
+    if "cats" in meta:
+        crumb = (os.path.basename(out), meta["title"])
+        meta["cats"].append(crumb)
+    else: # root path, needs to be renamed
+        crumb = ("", meta["title"])
+        meta["cats"] = [crumb]
+        
+    # templatify all files here
+    if not os.path.exists(out):
+        os.mkdir(out)
+    for file in glob.glob(src+"/*.pdc"):
+        templatify(file, meta, out)
+
+    # generate an index files
+    #TODO
+    
+    # do the same for all subdirectories 
+    for dir in [d for d in os.listdir(src) 
+                if os.path.isdir(os.path.join(src, d))]:
+        make_html_files(src=os.path.join(src, dir), 
+                        out=os.path.join(out, dir),
+                        meta=meta)
+
+def make_css(src, out):
+    if not os.path.exists(out):
+        os.mkdir(out)
+    for file in glob.glob(os.path.join(src, "*.clevercss")):
+        print("cssifying {}...".format(file))
+        with open(file, "r") as f:
+            conv = clevercss.convert(f.read())
+        dest = os.path.join(out, os.path.basename(file).replace(".clevercss",
+                                                                ".css"))
+        with open(dest, "w") as f:
+            print("\tsaving as {}...".format(dest))
+            f.write(conv)
+
+def make_rss_feed(changelog):
+    """generate an RSS feed out of the Changelog"""
+
+    with open(changelog, "r") as f:
+        print("parsing {}...".format(changelog))
+        txt = f.read()
+    relist = re.compile("""
+                        <li>
+                        (?P<y>\d+) / (?P<m>\d+) / (?P<d>\d+):\ 
+                        (?P<desc>.+?)
+                        </li>
+                        """, re.X|re.S)
+        
+    items = []
+    for entry in relist.finditer(txt):
+        items.append(
+            RSS2.RSSItem(
+                title = "omg new stuff!!w!",
+                link = "http://www.muflax.com/changelog.html",
+                description = entry.group("desc"),
+                pubDate = datetime.datetime(
+                    int(entry.group("y")),
+                    int(entry.group("m")),
+                    int(entry.group("d"))
+                ),
+                guid = RSS2.Guid(
+                    hashlib.md5(entry.group("desc").encode("utf8")).hexdigest()
+                )
+            )
+        )
+    
+    feed = RSS2.RSS2(
+        title = "muflax.com",
+        link = "http://www.muflax.com",
+        description = "lies and wonderland",
+        lastBuildDate = datetime.datetime.now(),
+        items = items[:10]
+    )
+
+    with open("out/rss.xml", "w") as f:
+        print("writing RSS feed...")
+        feed.write_xml(f, encoding="utf8")
+
+def tidy_up(dir):
+    """clean up all the (ht|x)ml we generated earlier..."""
+
+    for root, dirs, files in os.walk(dir):
+        for f in files:
+            if re.match(".*\.xml", f):
+                subprocess.call(["tidy", "-i", "-xml", "-m", "-q", "-utf8",
+                                 os.path.join(root, f)])
+            elif re.match(".*\.html", f):
+                subprocess.call(["tidy", "-i", "--tidy-mark", "f", "-m", "-q", "-utf8",
+                                 os.path.join(root, f)])
+    
+def main():
+    """build the site: html pages, stylesheets, RSS feed, then tidy the output"""
+    make_html_files("src", "out")
+    make_css("styles", "out/styles")
+    # reads out/changelog.html, so the html files have to be generated first
+    make_rss_feed("out/changelog.html")
+    tidy_up("out")
+
+if __name__ == "__main__":
+    main()