Source code for pywiktionary.parser

# pylint: disable=anomalous-backslash-in-string
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
"""Parser to extract IPA text from pronunciation section in wiki text.
"""

from __future__ import absolute_import
from __future__ import unicode_literals

import json
try:
    from urllib import urlencode, urlopen
except ImportError:
    from urllib.parse import urlencode
    from urllib.request import urlopen

import regex as re
from bs4 import BeautifulSoup
from .IPA import IPA
from .IPA import fr_pron
from .IPA import ru_pron
from .IPA import hi_pron
from .IPA import es_pron
from .IPA import cmn_pron


class Parser(object):
    """Wiktionary parser to extract IPA text from pronunciation sections.

    To extract IPA for a certain language, specify the ``lang`` parameter;
    by default IPA is extracted for every language section in the entry.

    To convert IPA text to X-SAMPA text, use the ``XSAMPA`` parameter.

    Parameters
    ----------
    lang : string
        Language section to extract. It is compared verbatim against the
        text of the level-2 (``==...==``) headers of the entry.
    XSAMPA : boolean
        Option for IPA to X-SAMPA conversion.
    """

    # Templates handled by _convert_locally(); any other ``*-IPA`` /
    # ``*-pron`` template is expanded through the Wiktionary API.
    # NOTE(review): "es-IPA" is not listed, so the es-IPA branch of
    # _convert_locally() is never reached and Spanish templates go through
    # the API. The list is kept as it was; confirm whether the omission is
    # intentional before adding "es-IPA" here.
    _LOCAL_TAGS = ("IPA", "fr-IPA", "ru-IPA", "hi-IPA", "zh-pron")

    def __init__(self, lang=None, XSAMPA=False):
        self.lang = lang
        self.XSAMPA = XSAMPA
        # Overwritten by parse(); initialised here so parse_detail() and
        # parse_pronunciation() can also be called on their own.
        self.title = None
        self.api = "https://en.wiktionary.org/w/api.php"
        self.param = {
            "action": "expandtemplates",
            "text": None,
            "prop": "wikitext",
            "format": "json"
        }
        # Raw strings keep the backslashes for the regex engine and avoid
        # Python's invalid-escape-sequence warnings.
        self.regex = {
            "lang": re.compile(r"\|lang=([^\|]+)"),
            "node": re.compile(r"(?<brackets>{{(?:[^{}]+|(?&brackets))*}})"),
            "IPA-node": re.compile(r"^(([\w]+\-)?(IPA|pron))(?=\||\n|\Z)"),
            # Every header pattern accepts start-of-text: the newline that
            # ends a parent header is consumed by the parent's match, so a
            # sub-header directly below it has no leading "\n" left.
            "h2": re.compile(r"(?:\A|\n)={2}([\p{L}0-9 -]+)={2}\n"),
            "h3": re.compile(r"(?:\A|\n)={3}([\p{L}0-9 -]+)={3}\n"),
            "h4": re.compile(r"(?:\A|\n)={4}([\p{L}0-9 -]+)={4}\n"),
            "IPA": re.compile(r"<span[^>]*>([^<]+)<\/span>")
        }

    def expand_template(self, text):
        """Expand IPA Template through Wiktionary API.

        Used to expand ``{{*-IPA}}`` template in parser and return IPA list.
        Performs one HTTP request to ``self.api`` per call.

        Parameters
        ----------
        text : string
            String of template text, including the surrounding "{{" and
            "}}".

        Returns
        -------
        list of string
            List of expanded IPA text.

        Examples
        --------
        >>> parser = Parser()
        >>> template = "{{la-IPA|eccl=yes|thēsaurus}}"
        >>> parser.expand_template(template)
        ['/tʰeːˈsau̯.rus/', '[tʰeːˈsau̯.rʊs]', '/teˈsau̯.rus/']
        """
        # Work on a copy so the shared request defaults are never mutated.
        params = dict(self.param)
        params["text"] = text.encode("utf-8")
        response = urlopen(self.api, urlencode(params).encode())
        try:
            raw = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        content = json.loads(raw.decode("utf-8"))
        html = content["expandtemplates"]["wikitext"]
        # BeautifulSoup is used here instead of the raw "IPA" regex.
        soup = BeautifulSoup(html, "html.parser")
        return [span.text for span in soup.find_all("span", {"class": "IPA"})]

    def parse(self, wiki_text, title=None):
        """Parse Wiktionary wiki text.

        Split Wiktionary wiki text into different languages and return
        parsed IPA result.

        Parameters
        ----------
        wiki_text : string
            String of Wiktionary wiki text, from XML dump or Wiktionary API.
        title: string
            String of wiki entry title.

        Returns
        -------
        dict
            Dict of parsed IPA results.
            Key: language name; Value: list of IPA text, or the string
            ``"IPA not found."`` when the section yields nothing. When
            ``lang`` is set but has no section, the dict is
            ``{lang: "Language not found."}``.
        """
        self.title = title
        # With a single capture group, split() returns
        # [preamble, header, body, header, body, ...]; pairing by position
        # keeps a preamble or body that merely equals a header name from
        # being mistaken for a header.
        sections = self.regex["h2"].split(wiki_text)
        headers = sections[1::2]
        bodies = sections[2::2]
        if self.lang and self.lang not in headers:
            return {self.lang: "Language not found."}
        parse_result = {}
        for header, body in zip(headers, bodies):
            if self.lang and header != self.lang:
                continue
            parse_result[header] = self.parse_detail(body) or "IPA not found."
        return parse_result

    def parse_detail(self, wiki_text, depth=3):
        """Parse the section of a certain language in wiki text.

        Parse pronunciation section of the certain language recursively:
        "Pronunciation" sections are parsed directly, and numbered
        etymology sections (any header containing "etymology" other than
        plain "Etymology") are searched one header level deeper.

        Parameters
        ----------
        wiki_text : string
            String of wiki text in a language section.
        depth : int
            Header level (number of ``=`` signs) to split on.

        Returns
        -------
        list of dict
            List of extracted IPA text in
            ``{"IPA": "", "X-SAMPA": "", "lang": ""}`` format.
            If the text splits into more than 99999 pieces, the string
            ``"Maximum recursion depth exceeded in wiki text."`` is
            returned instead.
        """
        sections = self.regex["h" + str(depth)].split(wiki_text)
        # To avoid maximum recursion depth exceeded.
        if len(sections) > 99999:
            return "Maximum recursion depth exceeded in wiki text."
        parse_result = []
        # Odd indices are header names, each followed by its body.
        for header, body in zip(sections[1::2], sections[2::2]):
            header_name = header.lower()
            if header_name == "pronunciation":
                parse_result += self.parse_pronunciation(body)
            elif "etymology" in header_name and header_name != "etymology":
                nested = self.parse_detail(body, depth=4)
                if not isinstance(nested, list):
                    # Pass the error message up unchanged rather than
                    # splicing its characters into the result list.
                    return nested
                parse_result += nested
        return parse_result

    def parse_pronunciation(self, wiki_text):
        """Parse pronunciation section in wiki text.

        Parse IPA text from pronunciation section and convert to X-SAMPA
        when the ``XSAMPA`` option is set.

        Parameters
        ----------
        wiki_text : string
            String of pronunciation section in wiki text.

        Returns
        -------
        list of dict
            List of extracted IPA text in
            ``{"IPA": "", "X-SAMPA": "", "lang": ""}`` format. The
            ``"X-SAMPA"`` key is only present when ``XSAMPA`` is set.
        """
        parse_result = []
        for template in re.findall(self.regex["node"], wiki_text):
            node = template[2:-2]  # drop the surrounding "{{" and "}}"
            found = re.findall(self.regex["IPA-node"], node)
            if not found:
                continue
            tag = found[0][0]
            if tag in self._LOCAL_TAGS:
                parse_result.extend(self._convert_locally(tag, node))
            else:
                parse_result.extend(self._expand_remotely(tag, node))
        if self.XSAMPA:
            for item in parse_result:
                item["X-SAMPA"] = IPA.IPA_to_XSAMPA(item["IPA"])
        return parse_result

    @staticmethod
    def _pop_param(node, name):
        """Strip ``|name=value`` from node; return ``(value, node)``.

        ``value`` is the first non-empty value found, or ``""``.
        """
        pattern = r"\|" + name + r"=([^\|]+)"
        found = re.findall(pattern, node)
        return (found[0] if found else ""), re.sub(pattern, "", node)

    def _positional_args(self, node, use_title=True):
        """Return the non-empty ``|``-separated arguments of node.

        When there are no arguments at all and ``use_title`` is set, the
        entry title (if known) is used as the only argument.
        """
        node = node[1:]  # skip the "|" that follows the template name
        if use_title and not node and self.title:
            node = self.title
        return [arg for arg in node.split("|") if arg]

    def _convert_locally(self, tag, node):
        """Turn one template body into IPA entries without network access.

        ``node`` is the template text without braces; ``tag`` is its name.
        """
        node = node.replace("\n", "")
        node = re.sub(self.regex["IPA-node"], "", node)
        # FIXME: nested templates are dropped wholesale, not interpreted.
        node = re.sub(self.regex["node"], "", node)
        lang = re.findall(self.regex["lang"], node)
        lang = lang[0] if lang else "Unknown"
        node = re.sub(self.regex["lang"], "", node)
        node = re.sub(r"\|qual\d?=[^\|]*", "", node)
        node = re.sub(r"\|n\d?=[^\|]*", "", node)

        if tag == "IPA":
            # Arguments already are IPA; the title is never substituted.
            return [
                {"IPA": arg, "lang": lang}
                for arg in self._positional_args(node, use_title=False)
            ]
        if tag == "fr-IPA":
            pos, node = self._pop_param(node, "pos")
            return [
                {"IPA": fr_pron.to_IPA(arg, pos=pos), "lang": "fr"}
                for arg in self._positional_args(node)
            ]
        if tag == "ru-IPA":
            # Keep the separator so the respelling stays an argument of
            # its own and does not lose its first character below.
            node = node.replace("|phon=", "|")
            noadj, node = self._pop_param(node, "noadj")
            # noshto, raw and ann are stripped but not forwarded.
            _, node = self._pop_param(node, "noshto")
            gem, node = self._pop_param(node, "gem")
            pos, node = self._pop_param(node, "pos")
            _, node = self._pop_param(node, "raw")
            _, node = self._pop_param(node, "ann")
            bracket, node = self._pop_param(node, "bracket")
            # NOTE(review): the template's "noadj" value is forwarded as
            # ``adj``; confirm against ru_pron.to_IPA that this is meant.
            return [
                {
                    "IPA": ru_pron.to_IPA(
                        arg,
                        adj=noadj,
                        gem=gem,
                        bracket=bracket,
                        pos=pos
                    ),
                    "lang": "ru",
                }
                for arg in self._positional_args(node)
            ]
        if tag == "hi-IPA":
            return [
                {"IPA": hi_pron.to_IPA(arg), "lang": "hi"}
                for arg in self._positional_args(node)
            ]
        if tag == "es-IPA":
            # Not reachable while "es-IPA" is absent from _LOCAL_TAGS.
            return [
                {"IPA": es_pron.to_IPA(arg), "lang": "es"}
                for arg in self._positional_args(node)
            ]
        if tag == "zh-pron":
            # Only the ``m=`` parameter is read; it may hold several
            # comma-separated readings.
            found = re.findall(r"\|m=([^\|]+)", node)
            readings = found[0].split(",") if found else []
            return [
                {"IPA": cmn_pron.to_IPA(reading), "lang": "zh"}
                for reading in readings
                if reading and "=" not in reading
            ]
        return []

    def _expand_remotely(self, tag, node):
        """Expand one template through the Wiktionary API.

        The language code is taken from the part of ``tag`` before "-".
        """
        if "|" not in node and self.title:
            # A bare template refers to the entry itself, so pass the
            # title explicitly (never the literal text "None").
            node = "{}|{}".format(node, self.title)
        lang = tag.split("-")[0]
        return [
            {"IPA": each_ipa, "lang": lang}
            for each_ipa in self.expand_template("{{%s}}" % node)
        ]