naruhodo.utils.misc module
Module for miscellaneous utility functions.
""" Module for miscellaneous utility functions. """ import re import json from math import sqrt import numpy as np import networkx as nx from nxpd import draw from naruhodo.utils.dicts import NodeType2StyleDict, NodeType2ColorDict, NodeType2FontColorDict, EdgeType2StyleDict, EdgeType2ColorDict _re_sent = re.compile(r'([^ !?。]*[!?。])') """ Precompiled regular expression for separating sentences. """ _re1 = re.compile(r'\(.*?\)') _re2 = re.compile(r'\[.*?\]') _re3 = re.compile(r'\(.*?\)') _re4 = re.compile(r'\<.*?\>') """ Precompiled regular expressions for getting rid of parenthesis. """ def preprocessText(text): """Get rid of weird parts from the text that interferes analysis.""" text = text.replace("\n", "").replace("|", "、").replace(" ", "").strip() text = _re1.sub("", text) text = _re2.sub("", text) text = _re3.sub("", text) text = _re4.sub("", text) return text def parseToSents(context): """Parse given context into list of individual sentences.""" return [sent.strip().replace('*', "-") for sent in _re_sent.split(context) if sent.strip() != ""] def exportToJsonObj(G): """Export given networkx graph to JSON object(dict object in python).""" return nx.node_link_data(G) def exportToJsonFile(G, filename): """Export given networkx graph to JSON file.""" with open(filename, 'w') as outfile: json.dump(exportToJsonObj(G), outfile) def getNodeProperties(info, depth=False): """Convert node properties for node drawing using nxpd.""" ret = dict() ret['shape'] = NodeType2StyleDict[info['type']] ret['fillcolor'] = NodeType2ColorDict[info['type']] ret['fontcolor'] = NodeType2FontColorDict[info['type']] ret['label'] = info['label'] ret['style'] = 'filled' ret['fixedsize'] = True ret['fontsize'] = (5.0 + 20.0 / len(info['label'])) * info['count'] ret['width'] = info['count']*0.75 ret['count'] = info['count'] if depth: d = np.average(info['depth']) # Average depth of the node d = min(d, 5.) # Normalize d to a range of [0, 6] cs = [255, 80, 0] # Base reference color at start ct = [255, 255, 255] # Base reference color at end cn = [0, 0, 0] # Average depth scaled node color for i in range(3): cn[i] = cs[i] + int((ct[i] - cs[i]) / 5. * d) ret['fillcolor'] = rgb2Hex(cn) ret['fontcolor'] = '#000000' return ret def getEdgeProperties(info): """Convert edge properties for node drawing using nxpd.""" ret = dict() ret['label'] = info['label'] ret['penwidth'] = info['weight'] * 2.0 ret['weight'] = info['weight'] ret['style'] = EdgeType2StyleDict[info['type']] ret['color'] = EdgeType2ColorDict[info['type']] return ret def inclusive(A, B): """Find if one of string A and B includes the other.""" if len(A) > len(B): if A.find(B) != -1: ret = 1 else: ret = 0 elif len(A) < len(B): if B.find(A) != -1: ret = -1 else: ret = 0 else: ret = 0 return ret def cosSimilarity(A, B): """Compute the cosine similarity between vectors A and B.""" return np.dot(A, B) / sqrt(np.dot(A, A) * np.dot(B, B)) def harmonicSim(AG, B): """Return the harmonic distance between a group of vectors AG and vector B.""" size = len(AG) ret = 0. for i in range(size): ret += 1. / cosSimilarity(AG[i], B) return float(size) / ret def decorate(G, depth, rankdir): """Generate temporal graph with drawing properties added for nxpd.""" ret = nx.DiGraph() ret.graph['rankdir'] = rankdir for key, val in G.nodes.items(): ret.add_node(key, **getNodeProperties(val, depth)) for key, val in G.edges.items(): ret.add_edge(*key, **getEdgeProperties(val)) return ret def show(G, depth=False, rankdir='TB'): """Decorate and draw given graph using nxpd in notebook.""" return draw(decorate(G, depth, rankdir), show='ipynb') def plotToFile(G, filename, depth=False, rankdir='TB'): """Output given graph to a png file using nxpd.""" return draw(decorate(G, depth, rankdir), filename=filename) def _mergeGraph(A, B): """Return the merged graph of A and B.""" for key, val in B.nodes.items(): if A.has_node(key): A.nodes[key]['count'] += val['count'] for i in range(len(val['pos'])): if val['pos'][i] not in A.nodes[key]['pos']: A.nodes[key]['pos'].append(val['pos'][i]) A.nodes[key]['lpos'].append(val['lpos'][i]) A.nodes[key]['func'].append(val['func'][i]) A.nodes[key]['surface'].append(val['surface'][i]) A.nodes[key]['yomi'].append(val['yomi'][i]) if 'depth' in A.nodes[key]: A.nodes[key]['depth'].append(val['depth'][i]) else: A.add_node(key, **val) for key, val in B.edges.items(): if A.has_edge(*key): A.edges[key[0], key[1]]['weight'] += val['weight'] else: A.add_edge(*key, **val) return A def _mergeEntityList(A, B): """Return merged entityList os A and B.""" for i in range(len(B)): for key, val in B[i].items(): if key in A[i]: for item in val: A[i][key].append(item) else: A[i][key] = val return A def _mergeProList(A, B): """Return merged proList os A and B.""" for item in B: A.append(item) return A def _mergeAll(A, B): """Return merged result of graph, entity list and pronoun list.""" A[0] = _mergeGraph(A[0], B[0]) A[1] = _mergeEntityList(A[1], B[1]) A[2] = _mergeProList(A[2], B[2]) return A def hex2Rgb(c): """ Convert hex color in #XXXXXX format to RGB list. """ return [int(c.lstrip("#")[i:i+2], 16) for i in (0, 2, 4)] def rgb2Hex(c): """ Convert color in RGB format to hex format. """ return "#{0:02x}{1:02x}{2:02x}".format(clamp(c[0]), clamp(c[1]), clamp(c[2])) def clamp(x): """ Clamp x to 0 <= x <= 255. """ return max(0, min(x, 255))
Module variables
var EdgeType2ColorDict
var EdgeType2StyleDict
var NodeType2ColorDict
var NodeType2FontColorDict
var NodeType2StyleDict
Functions
def clamp(
x)
Clamp x to 0 <= x <= 255.
def clamp(x): """ Clamp x to 0 <= x <= 255. """ return max(0, min(x, 255))
def cosSimilarity(
A, B)
Compute the cosine similarity between vectors A and B.
def cosSimilarity(A, B): """Compute the cosine similarity between vectors A and B.""" return np.dot(A, B) / sqrt(np.dot(A, A) * np.dot(B, B))
def decorate(
G, depth, rankdir)
Generate temporal graph with drawing properties added for nxpd.
def decorate(G, depth, rankdir): """Generate temporal graph with drawing properties added for nxpd.""" ret = nx.DiGraph() ret.graph['rankdir'] = rankdir for key, val in G.nodes.items(): ret.add_node(key, **getNodeProperties(val, depth)) for key, val in G.edges.items(): ret.add_edge(*key, **getEdgeProperties(val)) return ret
def exportToJsonFile(
G, filename)
Export given networkx graph to JSON file.
def exportToJsonFile(G, filename): """Export given networkx graph to JSON file.""" with open(filename, 'w') as outfile: json.dump(exportToJsonObj(G), outfile)
def exportToJsonObj(
G)
Export given networkx graph to JSON object(dict object in python).
def exportToJsonObj(G): """Export given networkx graph to JSON object(dict object in python).""" return nx.node_link_data(G)
def getEdgeProperties(
info)
Convert edge properties for node drawing using nxpd.
def getEdgeProperties(info): """Convert edge properties for node drawing using nxpd.""" ret = dict() ret['label'] = info['label'] ret['penwidth'] = info['weight'] * 2.0 ret['weight'] = info['weight'] ret['style'] = EdgeType2StyleDict[info['type']] ret['color'] = EdgeType2ColorDict[info['type']] return ret
def getNodeProperties(
info, depth=False)
Convert node properties for node drawing using nxpd.
def getNodeProperties(info, depth=False): """Convert node properties for node drawing using nxpd.""" ret = dict() ret['shape'] = NodeType2StyleDict[info['type']] ret['fillcolor'] = NodeType2ColorDict[info['type']] ret['fontcolor'] = NodeType2FontColorDict[info['type']] ret['label'] = info['label'] ret['style'] = 'filled' ret['fixedsize'] = True ret['fontsize'] = (5.0 + 20.0 / len(info['label'])) * info['count'] ret['width'] = info['count']*0.75 ret['count'] = info['count'] if depth: d = np.average(info['depth']) # Average depth of the node d = min(d, 5.) # Normalize d to a range of [0, 6] cs = [255, 80, 0] # Base reference color at start ct = [255, 255, 255] # Base reference color at end cn = [0, 0, 0] # Average depth scaled node color for i in range(3): cn[i] = cs[i] + int((ct[i] - cs[i]) / 5. * d) ret['fillcolor'] = rgb2Hex(cn) ret['fontcolor'] = '#000000' return ret
def harmonicSim(
AG, B)
Return the harmonic distance between a group of vectors AG and vector B.
def harmonicSim(AG, B): """Return the harmonic distance between a group of vectors AG and vector B.""" size = len(AG) ret = 0. for i in range(size): ret += 1. / cosSimilarity(AG[i], B) return float(size) / ret
def hex2Rgb(
c)
Convert hex color in #XXXXXX format to RGB list.
def hex2Rgb(c): """ Convert hex color in #XXXXXX format to RGB list. """ return [int(c.lstrip("#")[i:i+2], 16) for i in (0, 2, 4)]
def inclusive(
A, B)
Find if one of string A and B includes the other.
def inclusive(A, B): """Find if one of string A and B includes the other.""" if len(A) > len(B): if A.find(B) != -1: ret = 1 else: ret = 0 elif len(A) < len(B): if B.find(A) != -1: ret = -1 else: ret = 0 else: ret = 0 return ret
def parseToSents(
context)
Parse given context into list of individual sentences.
def parseToSents(context): """Parse given context into list of individual sentences.""" return [sent.strip().replace('*', "-") for sent in _re_sent.split(context) if sent.strip() != ""]
def plotToFile(
G, filename, depth=False, rankdir='TB')
Output given graph to a png file using nxpd.
def plotToFile(G, filename, depth=False, rankdir='TB'): """Output given graph to a png file using nxpd.""" return draw(decorate(G, depth, rankdir), filename=filename)
def preprocessText(
text)
Get rid of weird parts from the text that interferes analysis.
def preprocessText(text): """Get rid of weird parts from the text that interferes analysis.""" text = text.replace("\n", "").replace("|", "、").replace(" ", "").strip() text = _re1.sub("", text) text = _re2.sub("", text) text = _re3.sub("", text) text = _re4.sub("", text) return text
def rgb2Hex(
c)
Convert color in RGB format to hex format.
def rgb2Hex(c): """ Convert color in RGB format to hex format. """ return "#{0:02x}{1:02x}{2:02x}".format(clamp(c[0]), clamp(c[1]), clamp(c[2]))
def show(
G, depth=False, rankdir='TB')
Decorate and draw given graph using nxpd in notebook.
def show(G, depth=False, rankdir='TB'): """Decorate and draw given graph using nxpd in notebook.""" return draw(decorate(G, depth, rankdir), show='ipynb')