search: enable tipue_search as default search engine

The tipue_search plugin generates tipuesearch_content.json, which the Tipue Search engine uses to search the static pages. The sitemap plugin is also required: it generates sitemap.xml, which provides the URL for each result. A basic search.html template is also included to display the results.
author: Victor Toso <victortoso@redhat.com> 2015-10-30 11:49:46 +0100
committer: Victor Toso <victortoso@redhat.com> 2015-10-30 11:49:46 +0100
commit: e4d5e93c0174577988c25d2de89158def5054e4f (patch)
tree: 5a3595af33eb3e85abdeb11da22282db780e740f /plugins
parent: 9b36a32b5208455bc91fcce64186869be62888bb (diff)
6 files changed, 513 insertions, 0 deletions
diff --git a/plugins/sitemap/Readme.rst b/plugins/sitemap/Readme.rst
new file mode 100644
index 0000000..719c38b
--- /dev/null
+++ b/plugins/sitemap/Readme.rst
@@ -0,0 +1,74 @@
+Sitemap
+-------
+
+This plugin generates plain-text or XML sitemaps. You can use the ``SITEMAP``
+variable in your settings file to configure the behavior of the plugin.
+
+The ``SITEMAP`` variable must be a Python dictionary and can contain these keys:
+
+- ``format``, which sets the output format of the plugin (``xml`` or ``txt``)
+
+- ``priorities``, which is a dictionary with three keys:
+
+  - ``articles``, the priority for the URLs of the articles and their
+    translations
+
+  - ``pages``, the priority for the URLs of the static pages
+
+  - ``indexes``, the priority for the URLs of the index pages, such as tags,
+    author pages, category indexes, archives, etc.
+
+  All the values of this dictionary must be decimal numbers between ``0`` and ``1``.
+
+- ``changefreqs``, which is a dictionary with three items:
+
+  - ``articles``, the update frequency of the articles
+
+  - ``pages``, the update frequency of the pages
+
+  - ``indexes``, the update frequency of the index pages
+
+  Valid frequency values are ``always``, ``hourly``, ``daily``, ``weekly``, ``monthly``,
+  ``yearly`` and ``never``.
+
+You can exclude URLs from being included in the sitemap via regular expressions.
+For example, to exclude all URLs containing ``tag/`` or ``category/`` you can
+use the following ``SITEMAP`` setting.
+
+.. code-block:: python
+
+    SITEMAP = {
+        'exclude': ['tag/', 'category/']
+    }
+
+If a key is missing or a value is incorrect, it will be replaced with the
+default value.
+
+The sitemap is saved in ``<output_path>/sitemap.<format>``.
+
+.. note::
+   ``priorities`` and ``changefreqs`` are information for search engines.
+   They are only used in the XML sitemaps.
+   For more information: <http://www.sitemaps.org/protocol.html#xmlTagDefinitions>
+
+**Example**
+
+Here is an example configuration (these are also the default settings):
+
+.. code-block:: python
+
+    PLUGINS=['pelican.plugins.sitemap',]
+
+    SITEMAP = {
+        'format': 'xml',
+        'priorities': {
+            'articles': 0.5,
+            'indexes': 0.5,
+            'pages': 0.5
+        },
+        'changefreqs': {
+            'articles': 'monthly',
+            'indexes': 'daily',
+            'pages': 'monthly'
+        }
+    }
diff --git a/plugins/sitemap/__init__.py b/plugins/sitemap/__init__.py
new file mode 100644
index 0000000..6523d3a
--- /dev/null
+++ b/plugins/sitemap/__init__.py
@@ -0,0 +1 @@
+from .sitemap import *
+\ No newline at end of file
diff --git a/plugins/sitemap/sitemap.py b/plugins/sitemap/sitemap.py
new file mode 100644
index 0000000..ccd9bfc
--- /dev/null
+++ b/plugins/sitemap/sitemap.py
@@ -0,0 +1,254 @@
+# -*- coding: utf-8 -*-
+'''
+Sitemap
+-------
+
+The sitemap plugin generates plain-text or XML sitemaps.
+'''
+
+from __future__ import unicode_literals
+
+import re
+import collections
+import os.path
+
+from datetime import datetime
+from logging import warning, info
+from codecs import open
+from pytz import timezone
+
+from pelican import signals, contents
+from pelican.utils import get_date
+
+TXT_HEADER = """{0}/index.html
+{0}/archives.html
+{0}/tags.html
+{0}/categories.html
+"""  # {0} is filled with the site URL when the txt sitemap is written
+
+XML_HEADER = """<?xml version="1.0" encoding="utf-8"?>
+<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
+xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+"""  # written once at the top of an XML sitemap
+
+XML_URL = """
+<url>
+<loc>{0}/{1}</loc>
+<lastmod>{2}</lastmod>
+<changefreq>{3}</changefreq>
+<priority>{4}</priority>
+</url>
+"""  # fields: {0} site URL, {1} page URL, {2} lastmod, {3} changefreq, {4} priority
+
+XML_FOOTER = """
+</urlset>
+"""  # closes the <urlset> opened by XML_HEADER
+
+
+def format_date(date):
+    """Return *date* as ``YYYY-MM-DDTHH:MM:SS`` followed by a ``+hh:mm`` style offset."""
+    stamp = date.strftime("%Y-%m-%dT%H:%M:%S")
+    if not date.tzinfo:
+        return stamp + "-00:00"  # naive datetime: no offset information
+    offset = date.strftime('%z')  # e.g. "+0100"; the colon is inserted below
+    return stamp + offset[:-2] + ':' + offset[-2:]
+
+class SitemapGenerator(object):
+
+    def __init__(self, context, settings, path, theme, output_path, *null):
+        """Read SITEURL, TIMEZONE and the optional SITEMAP dict from *settings*."""
+        self.output_path = output_path
+        self.context = context
+        self.now = datetime.now()
+        self.siteurl = settings.get('SITEURL')
+
+        # A ``timezone`` attribute already present on the object wins over TIMEZONE.
+        self.default_timezone = settings.get('TIMEZONE', 'UTC')
+        self.timezone = getattr(self, 'timezone', self.default_timezone)
+        self.timezone = timezone(self.timezone)
+
+        self.format = 'xml'
+
+        self.changefreqs = {
+            'articles': 'monthly',
+            'indexes': 'daily',
+            'pages': 'monthly'
+        }
+
+        self.priorities = {
+            'articles': 0.5,
+            'indexes': 0.5,
+            'pages': 0.5
+        }
+
+        self.sitemapExclude = []
+
+        config = settings.get('SITEMAP', {})
+
+        if not isinstance(config, dict):
+            warning("sitemap plugin: the SITEMAP setting must be a dict")
+        else:
+            fmt = config.get('format', self.format)  # absent key: default, no warning
+            pris = config.get('priorities')
+            chfreqs = config.get('changefreqs')
+            self.sitemapExclude = config.get('exclude', [])
+
+            if fmt not in ('xml', 'txt'):
+                warning("sitemap plugin: SITEMAP['format'] must be `txt' or `xml'")
+                warning("sitemap plugin: Setting SITEMAP['format'] on `xml'")
+            elif fmt == 'txt':
+                self.format = fmt
+                return
+
+            valid_keys = ('articles', 'indexes', 'pages')
+            valid_chfreqs = ('always', 'hourly', 'daily', 'weekly', 'monthly',
+                    'yearly', 'never')
+
+            if isinstance(pris, dict):
+                pris = dict(pris)  # copy, so defaults are never written into settings
+                for k, v in pris.items():
+                    if k in valid_keys and not isinstance(v, (int, float)):
+                        default = self.priorities[k]
+                        warning("sitemap plugin: priorities must be numbers")
+                        warning("sitemap plugin: setting SITEMAP['priorities']"
+                                "['{0}'] on {1}".format(k, default))
+                        pris[k] = default
+                self.priorities.update(pris)
+            elif pris is not None:
+                warning("sitemap plugin: SITEMAP['priorities'] must be a dict")
+                warning("sitemap plugin: using the default values")
+
+            if isinstance(chfreqs, dict):
+                chfreqs = dict(chfreqs)  # copy, so defaults are never written into settings
+                for k, v in chfreqs.items():
+                    if k in valid_keys and v not in valid_chfreqs:
+                        default = self.changefreqs[k]
+                        warning("sitemap plugin: invalid changefreq `{0}'".format(v))
+                        warning("sitemap plugin: setting SITEMAP['changefreqs']"
+                                "['{0}'] on '{1}'".format(k, default))
+                        chfreqs[k] = default
+                self.changefreqs.update(chfreqs)
+            elif chfreqs is not None:
+                warning("sitemap plugin: SITEMAP['changefreqs'] must be a dict")
+                warning("sitemap plugin: using the default values")
+
+    def write_url(self, page, fd):
+        """Write the sitemap entry for *page* to *fd*.
+
+        Unpublished, unsaved, missing-on-disk and excluded pages are skipped.
+        """
+        if getattr(page, 'status', 'published') != 'published':
+            return
+
+        # We can disable categories/authors/etc by using False instead of ''
+        if not page.save_as:
+            return
+
+        page_path = os.path.join(self.output_path, page.save_as)
+        if not os.path.exists(page_path):
+            return
+
+        lastdate = getattr(page, 'date', self.now)
+        try:
+            lastdate = self.get_date_modified(page, lastdate)
+        except ValueError:
+            warning("sitemap plugin: " + page.save_as + " has invalid modification date,")
+            warning("sitemap plugin: using date value as lastmod.")
+        lastmod = format_date(lastdate)
+
+        if isinstance(page, contents.Article):
+            pri = self.priorities['articles']
+            chfreq = self.changefreqs['articles']
+        elif isinstance(page, contents.Page):
+            pri = self.priorities['pages']
+            chfreq = self.changefreqs['pages']
+        else:
+            pri = self.priorities['indexes']
+            chfreq = self.changefreqs['indexes']
+
+        pageurl = '' if page.url == 'index.html' else page.url
+
+        # 'exclude' patterns match anywhere in the URL and apply to both formats.
+        if any(re.search(regstr, pageurl) for regstr in self.sitemapExclude):
+            return
+
+        if self.format == 'xml':
+            fd.write(XML_URL.format(self.siteurl, pageurl, lastmod, chfreq, pri))
+        else:
+            fd.write(self.siteurl + '/' + pageurl + '\n')
+
+    def get_date_modified(self, page, default):
+        """Return the page's ``modified`` value, or *default* when it has none."""
+        if not hasattr(page, 'modified'):
+            return default
+        stamp = page.modified
+        # Values that are not datetimes yet are handed to pelican's get_date().
+        return stamp if isinstance(stamp, datetime) else get_date(stamp)
+
+    def set_url_wrappers_modification_date(self, wrappers):
+        """Set ``modified`` on each wrapper to the newest date among its articles."""
+        for (wrapper, articles) in wrappers:
+            newest = datetime.min.replace(tzinfo=self.timezone)
+            for article in articles:
+                newest = max(newest, article.date.replace(tzinfo=self.timezone))
+                try:
+                    changed = self.get_date_modified(article, datetime.min)
+                    newest = max(newest, changed.replace(tzinfo=self.timezone))
+                except ValueError:
+                    pass  # Suppressed: user will be notified.
+            wrapper.modified = str(newest)
+
+    def generate_output(self, writer):
+        """Write ``sitemap.<format>`` below ``output_path``.
+
+        Four standard index pages are offered first, followed by every page,
+        article, category, tag, author and article translation found in the
+        context; ``write_url`` decides which of them actually get an entry.
+        *writer* is not used.
+        """
+        path = os.path.join(self.output_path, 'sitemap.{0}'.format(self.format))
+        is_xml = self.format == 'xml'
+        wrapper_groups = ('categories', 'tags', 'authors')
+
+        pages = self.context['pages'] + self.context['articles']
+        for group in wrapper_groups:
+            pages += [wrapper for (wrapper, _) in self.context[group]]
+
+        # Give each category/tag/author a ``modified`` date from its articles.
+        for group in wrapper_groups:
+            self.set_url_wrappers_modification_date(self.context[group])
+
+        for article in self.context['articles']:
+            pages.extend(article.translations)
+
+        info('writing {0}'.format(path))
+
+        with open(path, 'w', encoding='utf-8') as fd:
+            fd.write(XML_HEADER if is_xml else TXT_HEADER.format(self.siteurl))
+
+            # The standard pages are not content objects, so lightweight
+            # stand-ins carrying the attributes write_url() reads are used.
+            FakePage = collections.namedtuple(
+                'FakePage', ['status', 'date', 'url', 'save_as'])
+            standard_pages = [
+                FakePage(status='published', date=self.now, url=name, save_as=name)
+                for name in ('index.html', 'archives.html',
+                             'tags.html', 'categories.html')
+            ]
+
+            for page in standard_pages:
+                self.write_url(page, fd)
+            for page in pages:
+                self.write_url(page, fd)
+
+            if is_xml:
+                fd.write(XML_FOOTER)
+
+
+def get_generators(generators):  # connected to signals.get_generators by register()
+    return SitemapGenerator  # `generators` is unused; the class itself is returned
+
+
+def register():  # connects this module's get_generators() to pelican's signal
+    signals.get_generators.connect(get_generators)
diff --git a/plugins/tipue_search/README.md b/plugins/tipue_search/README.md
new file mode 100644
index 0000000..1a2d615
--- /dev/null
+++ b/plugins/tipue_search/README.md
@@ -0,0 +1,67 @@
+Tipue Search
+============
+
+A Pelican plugin to serialize generated HTML to JSON that can be used by the jQuery plugin Tipue Search.
+
+Copyright (c) Talha Mansoor
+
+Author          | Talha Mansoor
+----------------|-----
+Author Email    | talha131@gmail.com 
+Author Homepage | http://onCrashReboot.com 
+Github Account  | https://github.com/talha131 
+
+Why do you need it?
+===================
+
+Static sites do not offer a search feature out of the box. [Tipue Search](http://www.tipue.com/search/)
+is a jQuery plugin that searches the static site without using any third-party service, like DuckDuckGo or Google.
+
+Tipue Search offers 4 search modes. Its [JSON search mode](http://www.tipue.com/search/docs/json/) is the best search mode
+especially for large sites.
+
+Tipue's JSON search mode requires the textual content of the site in JSON format.
+
+Requirements
+============
+
+Tipue Search requires BeautifulSoup.
+
+```bash
+pip install beautifulsoup4
+```
+
+How Tipue Search works
+=========================
+
+Tipue Search serializes the generated HTML into JSON. The format of the JSON is as follows:
+
+```json
+{
+    "pages": [
+        { 
+            "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue semper porta. Mauris massa. Vestibulum lacinia arcu eget nulla. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Curabitur sodales ligula in libero.",
+            "tags": "Example Category",
+            "url" : "http://oncrashreboot.com/plugin-example.html",
+            "title": "Everything you want to know about Lorem Ipsum"
+        },
+        { 
+            "text": "Sed dignissim lacinia nunc. Curabitur tortor. Pellentesque nibh. Aenean quam. In scelerisque sem at dolor. Maecenas mattis. Sed convallis tristique sem. Proin ut ligula vel nunc egestas porttitor. Morbi lectus risus, iaculis vel, suscipit quis, luctus non, massa. Fusce ac turpis quis ligula lacinia aliquet. Mauris ipsum. Nulla metus metus, ullamcorper vel, tincidunt sed, euismod in, nibh.",
+            "tags": "Example Category",
+            "url" : "http://oncrashreboot.com/plugin-example-2.html",
+            "title": "Review of the book Lorem Ipsum"
+        }
+    ]
+}
+```
+
+The JSON is written to the file `tipuesearch_content.json`, which is created in the root of the `output` directory.
+
+How to use
+==========
+
+To utilize JSON Search mode, your theme needs to have Tipue Search properly configured in it. [Official documentation](http://www.tipue.com/search/docs/#json) has the required details.
+
+Pelican [Elegant Theme](https://github.com/talha131/pelican-elegant) and [Plumage
+theme](https://github.com/kdeldycke/plumage) have Tipue Search configured. You can view their
+code to understand the configuration.
diff --git a/plugins/tipue_search/__init__.py b/plugins/tipue_search/__init__.py
new file mode 100644
index 0000000..ebd6c06
--- /dev/null
+++ b/plugins/tipue_search/__init__.py
@@ -0,0 +1 @@
+from .tipue_search import *
diff --git a/plugins/tipue_search/tipue_search.py b/plugins/tipue_search/tipue_search.py
new file mode 100644
index 0000000..b27ef77
--- /dev/null
+++ b/plugins/tipue_search/tipue_search.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+"""
+Tipue Search
+============
+
+A Pelican plugin to serialize generated HTML to JSON
+that can be used by jQuery plugin - Tipue Search.
+
+Copyright (c) Talha Mansoor
+"""
+
+from __future__ import unicode_literals
+
+import os.path
+import json
+from bs4 import BeautifulSoup
+from codecs import open
+try:
+    from urlparse import urljoin
+except ImportError:
+    from urllib.parse import urljoin
+
+from pelican import signals
+
+
+class Tipue_Search_JSON_Generator(object):
+
+    def __init__(self, context, settings, path, theme, output_path, *null):
+        """Keep the build context and the settings needed to emit the index."""
+        self.output_path = output_path
+        self.context = context
+        self.siteurl = settings.get('SITEURL')
+        # TEMPLATE_PAGES may be unset or None; generate_output() iterates it.
+        self.tpages = settings.get('TEMPLATE_PAGES') or {}
+        self.json_nodes = []
+
+
+    def create_json_node(self, page):
+        """Append a search node (title, text, tags, url) for a published *page*."""
+        if getattr(page, 'status', 'published') != 'published':
+            return
+
+        # Strip markup from the title and replace typographic quotes and carets.
+        title_html = page.title.replace('&nbsp;', ' ')
+        page_title = BeautifulSoup(title_html, 'html.parser').get_text(' ', strip=True)
+        for old, new in (('“', '"'), ('”', '"'), ('’', "'"), ('^', '&#94;')):
+            page_title = page_title.replace(old, new)
+
+        # Same for the body, additionally dropping pilcrows and squeezing whitespace.
+        page_text = BeautifulSoup(page.content, 'html.parser').get_text(' ', strip=True)
+        for old, new in (('“', '"'), ('”', '"'), ('’', "'"), ('¶', ' '), ('^', '&#94;')):
+            page_text = page_text.replace(old, new)
+        page_text = ' '.join(page_text.split())
+
+        uncategorised = getattr(page, 'category', 'None') == 'None'
+        page_category = '' if uncategorised else page.category.name
+        page_url = self.siteurl + '/' + page.url
+
+        self.json_nodes.append({'title': page_title,
+                                'text': page_text,
+                                'tags': page_category,
+                                'url': page_url,
+                                'loc': page_url})
+
+
+    def create_tpage_node(self, srclink):
+        """Append a search node for the rendered template page *srclink*."""
+        # Parse inside ``with`` so the file handle is closed instead of leaked.
+        with open(os.path.join(self.output_path, self.tpages[srclink]), encoding='utf-8') as srcfile:
+            soup = BeautifulSoup(srcfile, 'html.parser')
+        page_text = soup.get_text()
+
+        # What happens if there is not a title: fall back to an empty string
+        # (also when ``.string`` is None, so no null ends up in the JSON).
+        if soup.title is not None:
+            page_title = soup.title.string or ''
+        else:
+            page_title = ''
+
+        # Should set default category?
+        page_category = ''
+
+        page_url = urljoin(self.siteurl, self.tpages[srclink])
+
+        self.json_nodes.append({'title': page_title,
+                                'text': page_text,
+                                'tags': page_category,
+                                'url': page_url,
+                                'loc': page_url})
+
+
+    def generate_output(self, writer):
+        """Serialise template pages, pages and articles to the search index file."""
+        index_path = os.path.join(self.output_path, 'tipuesearch_content.json')
+
+        # Article translations get their own nodes next to pages and articles.
+        documents = self.context['pages'] + self.context['articles']
+        for article in self.context['articles']:
+            documents.extend(article.translations)
+
+        for srclink in self.tpages:
+            self.create_tpage_node(srclink)
+        for document in documents:
+            self.create_json_node(document)
+
+        with open(index_path, 'w', encoding='utf-8') as fd:
+            json.dump({'pages': self.json_nodes}, fd,
+                      separators=(',', ':'), ensure_ascii=False)
+
+
+def get_generators(generators):  # connected to signals.get_generators by register()
+    return Tipue_Search_JSON_Generator  # `generators` is unused; the class is returned
+
+
+def register():  # connects this module's get_generators() to pelican's signal
+    signals.get_generators.connect(get_generators)
author	Victor Toso <victortoso@redhat.com>	2015-10-30 11:49:46 +0100
committer	Victor Toso <victortoso@redhat.com>	2015-10-30 11:49:46 +0100
commit	e4d5e93c0174577988c25d2de89158def5054e4f (patch)
tree	5a3595af33eb3e85abdeb11da22282db780e740f /plugins
parent	9b36a32b5208455bc91fcce64186869be62888bb (diff)