naruhodo.backends.cabocha module
from naruhodo.utils.dicts import ProDict, MeaninglessDict, VerbLikeFuncDict, VerbLikeExclude from naruhodo.utils.misc import preprocessText import re class CaboChunk(object): """Class for cabocha chunks""" def __init__(self, chunk_id, parent): """Initialize a chunk.""" self.id = chunk_id """ id of the chunk. """ self.parent = parent """ parent id of this chunk. """ self.children = None """ list of children of this chunk. """ self.nouns = list() """ list of nouns 名詞 """ self.verbs = list() """ list of verbs 動詞 """ self.adjs = list() """ list of adjectives 形容詞 """ self.postps = list() """ list of postpositions 助詞 """ self.auxvs = list() """ list of auxilary verbs 助動詞 """ self.conjs = list() """ list of conjection 接続詞 """ self.interjs = list() """ list of interjections 感動詞 """ self.signs = list() """ list of signs 記号 """ self.advs = list() """ list of adverbs 副詞 """ self.connects = list() """ list of connects 連体詞 """ self.headings = list() """ list of headings 接頭詞 """ self.main = "" """ Main component of the chunk. """ self.main_surface = "" """ Surface of the main component. """ self.func = "" """ Functional component of the chunk. """ self.surface = "" """ Original surface of the chunk. """ self.negative = 0 """ If chunk is negative 1, elif chunk double negtive(strongly positive) -1, else 0 """ self.passive = 0 """ If chunk is passive 1, else 0. """ self.compulsory = 0 """ If chunk is compulsory 1, else 0. """ self.question = 0 """ If chunk contains ? 1, else 0. """ self.yomi = "" """ Contains the yomi of this chunk's surface. """ self.tense = 0 """ If chunk has no tense or present 0, elif past -1, elif present continuous 1 """ self.type = -1 """ Type of this chunk. ------------------- -1: unknown type 0: noun 1: adjective 2: verb 3: conjective 4: interjection 5: adverb 6: connect """ self.type2 = -1 """ 2nd type of this chunk. ----------------------- -1: no 2nd type 0: noun 1: adjective 2: verb """ self.NE = 0 """ Named entity type of this chunk. The name of NE type can be retrieved using 'NEList' in naruhodo.utils.dicts like NEtype = NEList[NE]. -------------------------------- 0: no named entity(or unknown) 1: person 2: location 3: organization 4: number 5: general """ self.pro = -1 """ Pronoun type of this chunk. --------------------------- -1: no pronoun(or unknown) 0: demonstrative-loc 1: demonstrative-obj 2: personal(1st) 3: personal(2nd) 4: personal(3rd) 5: indefinite 6: inclusive 7: omitted *This type is assigned by naruhodo.core.KnowledgeCoreJa. """ self.npro = 0 """ Rank of this pronoun in the sentence it appears. """ self.meaning = "" """ If the main of this chunk is in MeaninglessDict, this variable will be set to the main of its child node that contains its meaning. """ def add(self, inp): """Add components to chunk lists.""" # if inp[1] != "記号" or inp[0] == "?": # self.surface += inp[0] self.surface += inp[0] if len(inp) > 8: self.yomi += inp[8] else: self.yomi += inp[0] elem = { 'surface': inp[0], 'lemma' : inp[7], 'labels': inp[2:7], } if inp[1] == "名詞": self.nouns.append(elem) elif inp[1] == "動詞": self.verbs.append(elem) elif inp[1] == "形容詞": self.adjs.append(elem) elif inp[1] == "助詞": self.postps.append(elem) elif inp[1] == "助動詞": self.auxvs.append(elem) elif inp[1] == "接続詞": self.conjs.append(elem) elif inp[1] == "感動詞": self.interjs.append(elem) elif inp[1] == "記号": self.signs.append(elem) elif inp[1] == "副詞": self.advs.append(elem) elif inp[1] == "連体詞": self.connects.append(elem) elif inp[1] == "接頭詞": self.headings.append(elem) else: pass def _cleanUp(self): """Clean up all the lists stored in the object that is no longer needed.""" del self.nouns del self.verbs del self.adjs del self.postps del self.auxvs del self.conjs del self.interjs del self.signs del self.advs del self.connects del self.headings def _getMain(self): """Get the main component of the chunk.""" if len(self.nouns) > 0 and self.nouns[0]['labels'][0] not in ['非自立', '接尾']: self.main = "".join([x['lemma'] for x in self.nouns if x['labels'][0] != '非自立']) self.main_surface = "".join([x['surface'] for x in self.nouns if x['labels'][0] != '非自立']) self.type = 0 if len(self.adjs) > 0: if self.adjs[0]['lemma'] == "ない": self.negative = 1 # Corrections for special patterns. if self.nouns[0]['labels'][0] == 'サ変接続': if len(self.nouns) > 1 and len(self.verbs) == 0: self.type = 0 else: self.type = 2 self.type2 = 0 elif self.nouns[0]['labels'][0] == '形容動詞語幹': if len(self.nouns) > 1: self.type = 0 else: self.type = 1 self.type2 = 2 # NE recognition. elif self.nouns[0]['labels'][0] == '固有名詞': if self.nouns[0]['labels'][1] == '人名': self.NE = 1 elif self.nouns[0]['labels'][1] == '地域': self.NE = 2 elif self.nouns[0]['labels'][1] == '組織': self.NE = 3 elif self.nouns[0]['labels'][1] == '一般': self.NE = 5 else: pass # Pronoun identification(for correference analysis.) elif self.nouns[0]['labels'][0] == '代名詞': if self.nouns[0]['lemma'] in ProDict['demonstrative-loc']: self.pro = 0 elif self.nouns[0]['lemma'] in ProDict['demonstrative-obj']: self.pro = 1 elif self.nouns[0]['lemma'] in ProDict['personal1st']: self.pro = 2 elif self.nouns[0]['lemma'] in ProDict['personal2nd']: self.pro = 3 elif self.nouns[0]['lemma'] in ProDict['personal3rd']: self.pro = 4 elif self.nouns[0]['lemma'] in ProDict['indefinite']: self.pro = 5 elif self.nouns[0]['lemma'] in ProDict['inclusive']: self.pro = 6 else: pass elif self.nouns[0]['labels'][0] == '数': self.main = "".join([x['lemma'] for x in self.nouns]) self.main_surface = "".join([x['surface'] for x in self.nouns]) self.NE = 4 else: pass elif len(self.nouns) > 0 and self.nouns[0]['lemma'] in MeaninglessDict: if len(self.verbs) > 0: self.main = self.verbs[0]['surface'] self.main_surface = self.verbs[0]['surface'] self.main += self.nouns[0]['lemma'] self.main_surface += self.nouns[0]['surface'] self.type = 0 elif len(self.adjs) > 0: self.main = self.adjs[0]['lemma'] self.main_surface = self.adjs[0]['surface'] self.type = 1 if self.adjs[0]['lemma'] == "ない": self.negative = 1 elif len(self.verbs) > 0: self.main = self.verbs[0]['lemma'] self.main_surface = self.verbs[0]['surface'] self.type = 2 elif len(self.advs) > 0: self.main = self.advs[0]['lemma'] self.main_surface = self.advs[0]['surface'] self.type = 5 elif len(self.conjs) > 0: self.main = self.conjs[0]['lemma'] self.main_surface = self.conjs[0]['surface'] self.type = 3 elif len(self.interjs) > 0: self.main = self.interjs[0]['lemma'] self.main_surface = self.interjs[0]['surface'] self.type = 4 elif len(self.connects) > 0: self.main = self.connects[0]['lemma'] self.main_surface = self.connects[0]['surface'] self.type = 6 elif len(self.postps) > 0: self.main = self.postps[0]['lemma'] self.main_surface = self.postps[0]['surface'] elif len(self.auxvs) > 0: self.main = self.auxvs[0]['lemma'] self.main_surface = self.auxvs[0]['surface'] elif len(self.signs) > 0: if len(self.nouns) > 0: self.main = self.nouns[0]['lemma'] self.main_surface = self.nouns[0]['surface'] else: self.main = self.signs[0]['lemma'] self.main_surface = self.signs[0]['surface'] elif len(self.nouns) > 0 and self.nouns[0]['labels'][0] == '非自立': self.main = self.nouns[0]['lemma'] self.main_surface = self.nouns[0]['surface'] self.type = 0 else: self.main = 'UNKNOWN' if len(self.headings) > 0: self.main = "\n".join([x['lemma'] for x in self.headings]) + self.main self.main_surface = "\n".join([x['surface'] for x in self.headings]) + self.main_surface # Convert main with no lemma to surface if self.main.find("*") != -1: self.main = self.main_surface def _getFunc(self): """Get the func component of the chunk.""" # Get func by excluding main from surface. self.func = self.surface.replace(self.main_surface, "") # Process func to get properties if len(self.verbs) > 0: for item in self.verbs: if item['labels'][0] == '接尾': if item['lemma'] == "れる" or item['lemma'] == "られる": self.passive = 1 elif item['lemma'] == "させる": self.compulsory = 1 elif item['labels'][0] == "非自立": if item['lemma'] == "いる": self.tense = 1 if len(self.postps) > 0: if self.parent == -1: for item in self.postps: if item['lemma'] in ["の", "なの", "か"]: self.question = 1 if len(self.auxvs) > 0: neg = sum([ [x['lemma'] for x in self.auxvs].count('ん'), [x['lemma'] for x in self.auxvs].count('ない'), [x['lemma'] for x in self.auxvs].count('ぬ'), [x['lemma'] for x in self.auxvs].count('まい') ]) if neg == 1: if len(self.signs) > 0 and any([self.signs[x]['surface'] == '?' for x in range(len(self.signs))]): pass else: self.negative = 1 elif neg > 1: if neg % 2 == 0: self.negative = -1 else: self.negative = 1 else: pass if any([self.auxvs[x]['lemma'] == "た" for x in range(len(self.auxvs))]): self.tense = -1 # Fix for nouns used as verbs. for item in VerbLikeFuncDict: if self.func.find(item) != -1 and self.func not in VerbLikeExclude: self.type = 2 if len(self.signs) > 0: for item in self.signs: if item['surface'] == '?': self.question = 1 # Fix for special words. if self.main == "できる" and self.func not in ["た", "ます", "いるて"]: self.type = 5 def processChunk(self, pos, npro): """Process the chunk to get main and func component of it.""" self._getMain() self._getFunc() # Modify pronouns if self.pro != -1: self.main += "[{0}@{1}]".format(pos, npro) self.npro = npro # Add tense label to main if self.tense == -1: self.main += "\n(過去)" elif self.tense == 1: self.main += "\n(現在)" # Add compulsory label to main if self.compulsory == 1: self.main += "\n(強制)" if self.passive == 1: self.main += "\n(被動)" # Add question label to main if self.question == 1: self.main += "\n(質問)" # Add negative label to main if self.negative == 1: self.main += "\n(否定)" elif self.negative == -1: self.main += "\n(二重否定)" self._cleanUp() class CabochaClient(object): """Class for CaboCha backend.""" def __init__(self): """Initialize a native database.""" self.rsplit = re.compile(r'[,]+|\t') self.chunks = list() self.root = None self.npro = 0 def add(self, inp, pos=0): """Takes in the block output from CaboCha and add it to native database.""" ck = None for elem in inp.splitlines(): if elem[0] == '*': if ck is not None: ck.processChunk(pos, self.npro) if ck.pro != -1: self.npro += 1 self.chunks.append(ck) ck = CaboChunk(*self._processHead(elem)) else: ck.add(self.rsplit.split(elem)) ck.processChunk(pos, self.npro) if ck.pro != -1: self.npro += 1 self.chunks.append(ck) # Get children list and store in self.childrenList self._getChildrenList() self._processMeaningless() self._processNegative() def _processHead(self, inp): """Takes in the head of the chunk and process ids / parents.""" elem = inp.split() return int(elem[1]), int(elem[2][:-1]) def _getChildrenList(self): """Process to get the list of children for each chunk.""" nck = len(self.chunks) self.childrenList = [list() for x in range(nck)] for i in range(nck): pid = self.chunks[i].parent if pid == -1: self.root = i else: self.childrenList[pid].append(i) for i in range(nck): self.chunks[i].children = self.childrenList[i] def _processMeaningless(self): """This function makes meaningless words tagged with its meaning.""" nck = len(self.chunks) for i in range(nck): if preprocessText(self.chunks[i].main) in MeaninglessDict: if len(self.childrenList[i]) > 0: self.chunks[i].meaning = self.chunks[self.childrenList[i][-1]].main self.chunks[i].main = "({0})\n{1}".format( self.chunks[self.childrenList[i][-1]].surface, self.chunks[i].main ) def _processNegative(self): """This function makes the words that has negative child tagged negative.""" nck = len(self.chunks) for i in range(nck): if preprocessText(self.chunks[i].main) in ["ない", ]: if len(self.childrenList[i]) > 0: self.chunks[self.childrenList[i][-1]].main += "\n(否定)" self.chunks[self.childrenList[i][-1]].negative = 1 self.chunks[i].meaning = self.chunks[self.childrenList[i][-1]].main self.chunks[i].main = self.chunks[i].main.replace("\n(否定)", "")
Module variables
var MeaninglessDict
var ProDict
var VerbLikeExclude
var VerbLikeFuncDict
Classes
class CaboChunk
Class for cabocha chunks
class CaboChunk(object): """Class for cabocha chunks""" def __init__(self, chunk_id, parent): """Initialize a chunk.""" self.id = chunk_id """ id of the chunk. """ self.parent = parent """ parent id of this chunk. """ self.children = None """ list of children of this chunk. """ self.nouns = list() """ list of nouns 名詞 """ self.verbs = list() """ list of verbs 動詞 """ self.adjs = list() """ list of adjectives 形容詞 """ self.postps = list() """ list of postpositions 助詞 """ self.auxvs = list() """ list of auxilary verbs 助動詞 """ self.conjs = list() """ list of conjection 接続詞 """ self.interjs = list() """ list of interjections 感動詞 """ self.signs = list() """ list of signs 記号 """ self.advs = list() """ list of adverbs 副詞 """ self.connects = list() """ list of connects 連体詞 """ self.headings = list() """ list of headings 接頭詞 """ self.main = "" """ Main component of the chunk. """ self.main_surface = "" """ Surface of the main component. """ self.func = "" """ Functional component of the chunk. """ self.surface = "" """ Original surface of the chunk. """ self.negative = 0 """ If chunk is negative 1, elif chunk double negtive(strongly positive) -1, else 0 """ self.passive = 0 """ If chunk is passive 1, else 0. """ self.compulsory = 0 """ If chunk is compulsory 1, else 0. """ self.question = 0 """ If chunk contains ? 1, else 0. """ self.yomi = "" """ Contains the yomi of this chunk's surface. """ self.tense = 0 """ If chunk has no tense or present 0, elif past -1, elif present continuous 1 """ self.type = -1 """ Type of this chunk. ------------------- -1: unknown type 0: noun 1: adjective 2: verb 3: conjective 4: interjection 5: adverb 6: connect """ self.type2 = -1 """ 2nd type of this chunk. ----------------------- -1: no 2nd type 0: noun 1: adjective 2: verb """ self.NE = 0 """ Named entity type of this chunk. The name of NE type can be retrieved using 'NEList' in naruhodo.utils.dicts like NEtype = NEList[NE]. -------------------------------- 0: no named entity(or unknown) 1: person 2: location 3: organization 4: number 5: general """ self.pro = -1 """ Pronoun type of this chunk. --------------------------- -1: no pronoun(or unknown) 0: demonstrative-loc 1: demonstrative-obj 2: personal(1st) 3: personal(2nd) 4: personal(3rd) 5: indefinite 6: inclusive 7: omitted *This type is assigned by naruhodo.core.KnowledgeCoreJa. """ self.npro = 0 """ Rank of this pronoun in the sentence it appears. """ self.meaning = "" """ If the main of this chunk is in MeaninglessDict, this variable will be set to the main of its child node that contains its meaning. """ def add(self, inp): """Add components to chunk lists.""" # if inp[1] != "記号" or inp[0] == "?": # self.surface += inp[0] self.surface += inp[0] if len(inp) > 8: self.yomi += inp[8] else: self.yomi += inp[0] elem = { 'surface': inp[0], 'lemma' : inp[7], 'labels': inp[2:7], } if inp[1] == "名詞": self.nouns.append(elem) elif inp[1] == "動詞": self.verbs.append(elem) elif inp[1] == "形容詞": self.adjs.append(elem) elif inp[1] == "助詞": self.postps.append(elem) elif inp[1] == "助動詞": self.auxvs.append(elem) elif inp[1] == "接続詞": self.conjs.append(elem) elif inp[1] == "感動詞": self.interjs.append(elem) elif inp[1] == "記号": self.signs.append(elem) elif inp[1] == "副詞": self.advs.append(elem) elif inp[1] == "連体詞": self.connects.append(elem) elif inp[1] == "接頭詞": self.headings.append(elem) else: pass def _cleanUp(self): """Clean up all the lists stored in the object that is no longer needed.""" del self.nouns del self.verbs del self.adjs del self.postps del self.auxvs del self.conjs del self.interjs del self.signs del self.advs del self.connects del self.headings def _getMain(self): """Get the main component of the chunk.""" if len(self.nouns) > 0 and self.nouns[0]['labels'][0] not in ['非自立', '接尾']: self.main = "".join([x['lemma'] for x in self.nouns if x['labels'][0] != '非自立']) self.main_surface = "".join([x['surface'] for x in self.nouns if x['labels'][0] != '非自立']) self.type = 0 if len(self.adjs) > 0: if self.adjs[0]['lemma'] == "ない": self.negative = 1 # Corrections for special patterns. if self.nouns[0]['labels'][0] == 'サ変接続': if len(self.nouns) > 1 and len(self.verbs) == 0: self.type = 0 else: self.type = 2 self.type2 = 0 elif self.nouns[0]['labels'][0] == '形容動詞語幹': if len(self.nouns) > 1: self.type = 0 else: self.type = 1 self.type2 = 2 # NE recognition. elif self.nouns[0]['labels'][0] == '固有名詞': if self.nouns[0]['labels'][1] == '人名': self.NE = 1 elif self.nouns[0]['labels'][1] == '地域': self.NE = 2 elif self.nouns[0]['labels'][1] == '組織': self.NE = 3 elif self.nouns[0]['labels'][1] == '一般': self.NE = 5 else: pass # Pronoun identification(for correference analysis.) elif self.nouns[0]['labels'][0] == '代名詞': if self.nouns[0]['lemma'] in ProDict['demonstrative-loc']: self.pro = 0 elif self.nouns[0]['lemma'] in ProDict['demonstrative-obj']: self.pro = 1 elif self.nouns[0]['lemma'] in ProDict['personal1st']: self.pro = 2 elif self.nouns[0]['lemma'] in ProDict['personal2nd']: self.pro = 3 elif self.nouns[0]['lemma'] in ProDict['personal3rd']: self.pro = 4 elif self.nouns[0]['lemma'] in ProDict['indefinite']: self.pro = 5 elif self.nouns[0]['lemma'] in ProDict['inclusive']: self.pro = 6 else: pass elif self.nouns[0]['labels'][0] == '数': self.main = "".join([x['lemma'] for x in self.nouns]) self.main_surface = "".join([x['surface'] for x in self.nouns]) self.NE = 4 else: pass elif len(self.nouns) > 0 and self.nouns[0]['lemma'] in MeaninglessDict: if len(self.verbs) > 0: self.main = self.verbs[0]['surface'] self.main_surface = self.verbs[0]['surface'] self.main += self.nouns[0]['lemma'] self.main_surface += self.nouns[0]['surface'] self.type = 0 elif len(self.adjs) > 0: self.main = self.adjs[0]['lemma'] self.main_surface = self.adjs[0]['surface'] self.type = 1 if self.adjs[0]['lemma'] == "ない": self.negative = 1 elif len(self.verbs) > 0: self.main = self.verbs[0]['lemma'] self.main_surface = self.verbs[0]['surface'] self.type = 2 elif len(self.advs) > 0: self.main = self.advs[0]['lemma'] self.main_surface = self.advs[0]['surface'] self.type = 5 elif len(self.conjs) > 0: self.main = self.conjs[0]['lemma'] self.main_surface = self.conjs[0]['surface'] self.type = 3 elif len(self.interjs) > 0: self.main = self.interjs[0]['lemma'] self.main_surface = self.interjs[0]['surface'] self.type = 4 elif len(self.connects) > 0: self.main = self.connects[0]['lemma'] self.main_surface = self.connects[0]['surface'] self.type = 6 elif len(self.postps) > 0: self.main = self.postps[0]['lemma'] self.main_surface = self.postps[0]['surface'] elif len(self.auxvs) > 0: self.main = self.auxvs[0]['lemma'] self.main_surface = self.auxvs[0]['surface'] elif len(self.signs) > 0: if len(self.nouns) > 0: self.main = self.nouns[0]['lemma'] self.main_surface = self.nouns[0]['surface'] else: self.main = self.signs[0]['lemma'] self.main_surface = self.signs[0]['surface'] elif len(self.nouns) > 0 and self.nouns[0]['labels'][0] == '非自立': self.main = self.nouns[0]['lemma'] self.main_surface = self.nouns[0]['surface'] self.type = 0 else: self.main = 'UNKNOWN' if len(self.headings) > 0: self.main = "\n".join([x['lemma'] for x in self.headings]) + self.main self.main_surface = "\n".join([x['surface'] for x in self.headings]) + self.main_surface # Convert main with no lemma to surface if self.main.find("*") != -1: self.main = self.main_surface def _getFunc(self): """Get the func component of the chunk.""" # Get func by excluding main from surface. self.func = self.surface.replace(self.main_surface, "") # Process func to get properties if len(self.verbs) > 0: for item in self.verbs: if item['labels'][0] == '接尾': if item['lemma'] == "れる" or item['lemma'] == "られる": self.passive = 1 elif item['lemma'] == "させる": self.compulsory = 1 elif item['labels'][0] == "非自立": if item['lemma'] == "いる": self.tense = 1 if len(self.postps) > 0: if self.parent == -1: for item in self.postps: if item['lemma'] in ["の", "なの", "か"]: self.question = 1 if len(self.auxvs) > 0: neg = sum([ [x['lemma'] for x in self.auxvs].count('ん'), [x['lemma'] for x in self.auxvs].count('ない'), [x['lemma'] for x in self.auxvs].count('ぬ'), [x['lemma'] for x in self.auxvs].count('まい') ]) if neg == 1: if len(self.signs) > 0 and any([self.signs[x]['surface'] == '?' for x in range(len(self.signs))]): pass else: self.negative = 1 elif neg > 1: if neg % 2 == 0: self.negative = -1 else: self.negative = 1 else: pass if any([self.auxvs[x]['lemma'] == "た" for x in range(len(self.auxvs))]): self.tense = -1 # Fix for nouns used as verbs. for item in VerbLikeFuncDict: if self.func.find(item) != -1 and self.func not in VerbLikeExclude: self.type = 2 if len(self.signs) > 0: for item in self.signs: if item['surface'] == '?': self.question = 1 # Fix for special words. if self.main == "できる" and self.func not in ["た", "ます", "いるて"]: self.type = 5 def processChunk(self, pos, npro): """Process the chunk to get main and func component of it.""" self._getMain() self._getFunc() # Modify pronouns if self.pro != -1: self.main += "[{0}@{1}]".format(pos, npro) self.npro = npro # Add tense label to main if self.tense == -1: self.main += "\n(過去)" elif self.tense == 1: self.main += "\n(現在)" # Add compulsory label to main if self.compulsory == 1: self.main += "\n(強制)" if self.passive == 1: self.main += "\n(被動)" # Add question label to main if self.question == 1: self.main += "\n(質問)" # Add negative label to main if self.negative == 1: self.main += "\n(否定)" elif self.negative == -1: self.main += "\n(二重否定)" self._cleanUp()
Ancestors (in MRO)
- CaboChunk
- builtins.object
Static methods
def __init__(
self, chunk_id, parent)
Initialize a chunk.
def __init__(self, chunk_id, parent): """Initialize a chunk.""" self.id = chunk_id """ id of the chunk. """ self.parent = parent """ parent id of this chunk. """ self.children = None """ list of children of this chunk. """ self.nouns = list() """ list of nouns 名詞 """ self.verbs = list() """ list of verbs 動詞 """ self.adjs = list() """ list of adjectives 形容詞 """ self.postps = list() """ list of postpositions 助詞 """ self.auxvs = list() """ list of auxilary verbs 助動詞 """ self.conjs = list() """ list of conjection 接続詞 """ self.interjs = list() """ list of interjections 感動詞 """ self.signs = list() """ list of signs 記号 """ self.advs = list() """ list of adverbs 副詞 """ self.connects = list() """ list of connects 連体詞 """ self.headings = list() """ list of headings 接頭詞 """ self.main = "" """ Main component of the chunk. """ self.main_surface = "" """ Surface of the main component. """ self.func = "" """ Functional component of the chunk. """ self.surface = "" """ Original surface of the chunk. """ self.negative = 0 """ If chunk is negative 1, elif chunk double negtive(strongly positive) -1, else 0 """ self.passive = 0 """ If chunk is passive 1, else 0. """ self.compulsory = 0 """ If chunk is compulsory 1, else 0. """ self.question = 0 """ If chunk contains ? 1, else 0. """ self.yomi = "" """ Contains the yomi of this chunk's surface. """ self.tense = 0 """ If chunk has no tense or present 0, elif past -1, elif present continuous 1 """ self.type = -1 """ Type of this chunk. ------------------- -1: unknown type 0: noun 1: adjective 2: verb 3: conjective 4: interjection 5: adverb 6: connect """ self.type2 = -1 """ 2nd type of this chunk. ----------------------- -1: no 2nd type 0: noun 1: adjective 2: verb """ self.NE = 0 """ Named entity type of this chunk. The name of NE type can be retrieved using 'NEList' in naruhodo.utils.dicts like NEtype = NEList[NE]. -------------------------------- 0: no named entity(or unknown) 1: person 2: location 3: organization 4: number 5: general """ self.pro = -1 """ Pronoun type of this chunk. --------------------------- -1: no pronoun(or unknown) 0: demonstrative-loc 1: demonstrative-obj 2: personal(1st) 3: personal(2nd) 4: personal(3rd) 5: indefinite 6: inclusive 7: omitted *This type is assigned by naruhodo.core.KnowledgeCoreJa. """ self.npro = 0 """ Rank of this pronoun in the sentence it appears. """ self.meaning = "" """ If the main of this chunk is in MeaninglessDict, this variable will be set to the main of its child node that contains its meaning. """
def add(
self, inp)
Add components to chunk lists.
def add(self, inp): """Add components to chunk lists.""" # if inp[1] != "記号" or inp[0] == "?": # self.surface += inp[0] self.surface += inp[0] if len(inp) > 8: self.yomi += inp[8] else: self.yomi += inp[0] elem = { 'surface': inp[0], 'lemma' : inp[7], 'labels': inp[2:7], } if inp[1] == "名詞": self.nouns.append(elem) elif inp[1] == "動詞": self.verbs.append(elem) elif inp[1] == "形容詞": self.adjs.append(elem) elif inp[1] == "助詞": self.postps.append(elem) elif inp[1] == "助動詞": self.auxvs.append(elem) elif inp[1] == "接続詞": self.conjs.append(elem) elif inp[1] == "感動詞": self.interjs.append(elem) elif inp[1] == "記号": self.signs.append(elem) elif inp[1] == "副詞": self.advs.append(elem) elif inp[1] == "連体詞": self.connects.append(elem) elif inp[1] == "接頭詞": self.headings.append(elem) else: pass
def processChunk(
self, pos, npro)
Process the chunk to get main and func component of it.
def processChunk(self, pos, npro): """Process the chunk to get main and func component of it.""" self._getMain() self._getFunc() # Modify pronouns if self.pro != -1: self.main += "[{0}@{1}]".format(pos, npro) self.npro = npro # Add tense label to main if self.tense == -1: self.main += "\n(過去)" elif self.tense == 1: self.main += "\n(現在)" # Add compulsory label to main if self.compulsory == 1: self.main += "\n(強制)" if self.passive == 1: self.main += "\n(被動)" # Add question label to main if self.question == 1: self.main += "\n(質問)" # Add negative label to main if self.negative == 1: self.main += "\n(否定)" elif self.negative == -1: self.main += "\n(二重否定)" self._cleanUp()
Instance variables
var NE
Named entity type of this chunk. The name of NE type can be retrieved using 'NEList' in naruhodo.utils.dicts like NEtype = NEList[NE].
0: no named entity(or unknown) 1: person 2: location 3: organization 4: number 5: general
var adjs
list of adjectives 形容詞
var advs
list of adverbs 副詞
var auxvs
list of auxilary verbs 助動詞
var children
list of children of this chunk.
var compulsory
If chunk is compulsory 1, else 0.
var conjs
list of conjection 接続詞
var connects
list of connects 連体詞
var func
Functional component of the chunk.
var headings
list of headings 接頭詞
var id
id of the chunk.
var interjs
list of interjections 感動詞
var main
Main component of the chunk.
var main_surface
Surface of the main component.
var meaning
If the main of this chunk is in MeaninglessDict, this variable will be set to the main of its child node that contains its meaning.
var negative
If chunk is negative 1, elif chunk double negtive(strongly positive) -1, else 0
var nouns
list of nouns 名詞
var npro
Rank of this pronoun in the sentence it appears.
var parent
parent id of this chunk.
var passive
If chunk is passive 1, else 0.
var postps
list of postpositions 助詞
var pro
Pronoun type of this chunk.
-1: no pronoun(or unknown) 0: demonstrative-loc 1: demonstrative-obj 2: personal(1st) 3: personal(2nd) 4: personal(3rd) 5: indefinite 6: inclusive 7: omitted *This type is assigned by naruhodo.core.KnowledgeCoreJa.
var question
If chunk contains ? 1, else 0.
var signs
list of signs 記号
var surface
Original surface of the chunk.
var tense
If chunk has no tense or present 0, elif past -1, elif present continuous 1
var type
Type of this chunk.
-1: unknown type 0: noun 1: adjective 2: verb 3: conjective 4: interjection 5: adverb 6: connect
var type2
2nd type of this chunk.
-1: no 2nd type 0: noun 1: adjective 2: verb
var verbs
list of verbs 動詞
var yomi
Contains the yomi of this chunk's surface.
class CabochaClient
Class for CaboCha backend.
class CabochaClient(object): """Class for CaboCha backend.""" def __init__(self): """Initialize a native database.""" self.rsplit = re.compile(r'[,]+|\t') self.chunks = list() self.root = None self.npro = 0 def add(self, inp, pos=0): """Takes in the block output from CaboCha and add it to native database.""" ck = None for elem in inp.splitlines(): if elem[0] == '*': if ck is not None: ck.processChunk(pos, self.npro) if ck.pro != -1: self.npro += 1 self.chunks.append(ck) ck = CaboChunk(*self._processHead(elem)) else: ck.add(self.rsplit.split(elem)) ck.processChunk(pos, self.npro) if ck.pro != -1: self.npro += 1 self.chunks.append(ck) # Get children list and store in self.childrenList self._getChildrenList() self._processMeaningless() self._processNegative() def _processHead(self, inp): """Takes in the head of the chunk and process ids / parents.""" elem = inp.split() return int(elem[1]), int(elem[2][:-1]) def _getChildrenList(self): """Process to get the list of children for each chunk.""" nck = len(self.chunks) self.childrenList = [list() for x in range(nck)] for i in range(nck): pid = self.chunks[i].parent if pid == -1: self.root = i else: self.childrenList[pid].append(i) for i in range(nck): self.chunks[i].children = self.childrenList[i] def _processMeaningless(self): """This function makes meaningless words tagged with its meaning.""" nck = len(self.chunks) for i in range(nck): if preprocessText(self.chunks[i].main) in MeaninglessDict: if len(self.childrenList[i]) > 0: self.chunks[i].meaning = self.chunks[self.childrenList[i][-1]].main self.chunks[i].main = "({0})\n{1}".format( self.chunks[self.childrenList[i][-1]].surface, self.chunks[i].main ) def _processNegative(self): """This function makes the words that has negative child tagged negative.""" nck = len(self.chunks) for i in range(nck): if preprocessText(self.chunks[i].main) in ["ない", ]: if len(self.childrenList[i]) > 0: self.chunks[self.childrenList[i][-1]].main += "\n(否定)" self.chunks[self.childrenList[i][-1]].negative = 1 self.chunks[i].meaning = self.chunks[self.childrenList[i][-1]].main self.chunks[i].main = self.chunks[i].main.replace("\n(否定)", "")
Ancestors (in MRO)
- CabochaClient
- builtins.object
Static methods
def __init__(
self)
Initialize a native database.
def __init__(self): """Initialize a native database.""" self.rsplit = re.compile(r'[,]+|\t') self.chunks = list() self.root = None self.npro = 0
def add(
self, inp, pos=0)
Takes in the block output from CaboCha and add it to native database.
def add(self, inp, pos=0): """Takes in the block output from CaboCha and add it to native database.""" ck = None for elem in inp.splitlines(): if elem[0] == '*': if ck is not None: ck.processChunk(pos, self.npro) if ck.pro != -1: self.npro += 1 self.chunks.append(ck) ck = CaboChunk(*self._processHead(elem)) else: ck.add(self.rsplit.split(elem)) ck.processChunk(pos, self.npro) if ck.pro != -1: self.npro += 1 self.chunks.append(ck) # Get children list and store in self.childrenList self._getChildrenList() self._processMeaningless() self._processNegative()
Instance variables
var chunks
var npro
var root
var rsplit