# Source code for pywiktionary.wiktionary

"""Wiktionary class for IPA extraction from XML dump or MediaWiki API.
"""

from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function

import json
try:
    from urllib import urlencode, urlopen
except ImportError:
    from urllib.parse import urlencode
    from urllib.request import urlopen

import mwxml
from .parser import Parser


class Wiktionary(object):
    """Wiktionary class for IPA extraction from XML dump or MediaWiki API.

    To extract IPA for a certain language, specify the ``lang`` parameter;
    the default is to extract IPA for all available languages.

    To convert IPA text to X-SAMPA text, use the ``XSAMPA`` parameter.

    Parameters
    ----------
    lang : string
        String of language type.
    XSAMPA : boolean
        Option for IPA to X-SAMPA conversion.
    """
    def __init__(self, lang=None, XSAMPA=False):
        self.lang = lang
        self.XSAMPA = XSAMPA
        # The parser is built from ``lang`` and ``XSAMPA``, so both must be
        # assigned before this call.
        self.set_parser()
        self.api = "https://en.wiktionary.org/w/api.php"
        # Query template sent by ``lookup``; "titles" is filled in per call.
        self.param = {
            "action": "query",
            "titles": None,
            "prop": "revisions",
            "rvprop": "content",
            "rvlimit": 1,
            "format": "json"
        }

    def set_lang(self, lang):
        """Set language and rebuild the parser.

        Parameters
        ----------
        lang : string
            String of language name.
        """
        self.lang = lang
        self.set_parser()

    def set_XSAMPA(self, XSAMPA):
        """Set X-SAMPA conversion option and rebuild the parser.

        Parameters
        ----------
        XSAMPA : boolean
            Option for IPA to X-SAMPA conversion.
        """
        self.XSAMPA = XSAMPA
        self.set_parser()

    def set_parser(self):
        """Set parser for Wiktionary.

        Creates a new ``Parser`` from the current ``lang`` and ``XSAMPA``
        attributes and stores it in ``self.parser``.
        """
        self.parser = Parser(
            lang=self.lang,
            XSAMPA=self.XSAMPA,
        )

    def get_entry_pronunciation(self, wiki_text, title=None):
        """Extract IPA for an entry in Wiktionary XML dump.

        Parameters
        ----------
        wiki_text : string
            String of XML entry wiki text.
        title : string
            String of wiki entry title.

        Returns
        -------
        object
            When ``lang`` is unset, the parser's full result (per the
            original documentation, a dict with key: language name and
            value: list of IPA text). When ``lang`` is set, only the item
            stored under ``self.lang`` in that result.
        """
        result = self.parser.parse(wiki_text, title=title)
        if self.lang:
            # A language was requested: unwrap that language's entry.
            return result[self.lang]
        return result

    def extract_IPA(self, dump_file):
        """Extract IPA list from Wiktionary XML dump.

        One result is appended for every revision whose page is in
        namespace 0. The dump file is closed before returning, including
        when an error is raised part-way through.

        Parameters
        ----------
        dump_file : string
            Path of Wiktionary XML dump file.

        Returns
        -------
        list
            List of extracted IPA results in
            ``{"id": "", "title": "", "pronunciation": ""}`` format.
        """
        lst = []
        # Keep the handle open for the whole loop (in case mwxml reads the
        # dump lazily while iterating), and close it when done.
        with open(dump_file, "rb") as dump_stream:
            dump = mwxml.Dump.from_file(dump_stream)
            for page in dump:
                for revision in page:
                    if revision.page.namespace == 0:
                        pronunciation = self.get_entry_pronunciation(
                            revision.text,
                            title=revision.page.title,
                        )
                        lst.append({
                            "id": revision.page.id,
                            "title": revision.page.title,
                            "pronunciation": pronunciation,
                        })
        return lst

    def lookup(self, word):
        """Look up IPA of word through Wiktionary API.

        Sends ``self.param`` (with "titles" set to the UTF-8 encoded word)
        to ``self.api`` and parses the wiki text of the first returned page.
        As a side effect, ``self.param["titles"]`` keeps the encoded word.
        Errors from ``urlopen`` or from JSON decoding are not caught here.

        Parameters
        ----------
        word : string
            String of a word to be looked up.

        Returns
        -------
        dict or string
            Result of ``get_entry_pronunciation`` for the word, or the
            string ``"Word not found."`` when the response holds no
            revision content.
        """
        self.param["titles"] = word.encode("utf-8")
        param = urlencode(self.param).encode()
        response = urlopen(self.api, param)
        # try/finally rather than ``with``: the Python 2 ``urllib.urlopen``
        # result is not a context manager.
        try:
            res = response.read()
        finally:
            response.close()
        content = json.loads(res.decode("utf-8"))
        try:
            val = list(content["query"]["pages"].values())
            wiki_text = val[0]["revisions"][0]["*"]
        except (KeyError, IndexError):
            return "Word not found."
        return self.get_entry_pronunciation(wiki_text, title=word)