In my current job we are building a Data Dictionary System, and one requirement is to make sure that the names of tables and attributes maintain a certain level of correctness.

The D.A. (Data Admin) team already has a glossary to use, but they don't trust that table, and with good reason: it contains duplicated terms and duplicated meanings.

Talking to my manager, I suggested building a glossary from a large set of words in our language (Portuguese in this case, because we are from Brazil) and defining a rule for generating the terms.

We came up with a simple rule:

  • A term should have a maximum of 4 letters;
  • Words that have more than 4 letters are cut down to their first 4;
  • Words with 4 letters or fewer are used in full;

During the discussion, they said that if a word with the same 4-letter term already exists, then we should take the next letter of the word and create a new term. This sounds good, but it became a little strange, especially when a well-known word collides with a lesser-known one, like this:

I'm putting a translation here:
DESCER => come down
DESCRICAO => Description


Notice that the first 4 letters of both words are DESC! "Description" is much more common in tech than "come down"... To solve this issue we can use NLTK's FreqDist object.

So, this is my implementation:

from unicodedata import normalize
from collections import defaultdict
import logging
import codecs
import nltk
import requests
from bs4 import BeautifulSoup
import json

# Words accepted as glossary candidates; filled from the dictionary files below.
dicionario = set()
# Word -> number of occurrences, accumulated over every corpus and scraped page.
palavras = nltk.FreqDist()

# Tokens that must never become glossary words: the Portuguese stopwords, the
# stopwords NLTK returns when no language is given, and lower-cased names.
# Kept as a set because it is probed once per corpus token; a list would make
# every probe a linear scan.
stopwords = set(nltk.corpus.stopwords.words('portuguese'))
stopwords.update(nltk.corpus.stopwords.words())
names = [name.lower() for name in nltk.corpus.names.words()]
stopwords.update(names)

def remover_non_alpha(text):
    """Return *text* with every non-alphabetic character removed.

    Digits, punctuation, whitespace and other numeric symbols (for example
    the fraction sign "½") are dropped; letters, including accented ones,
    are kept in their original order.
    """
    # A single isalpha() pass replaces the former "drop digits, keep isalnum"
    # pair, which let numeric-but-not-digit characters slip through.
    return ''.join(c for c in text if c.isalpha())

def remover_acentos(txt, codif='utf-8'):
    """Return *txt* as an ASCII byte string with all accents stripped.

    txt   -- encoded bytes or already-decoded text.
    codif -- encoding used to decode *txt* when it is given as bytes.

    The text is decomposed with NFKD so that each accented letter becomes a
    base letter plus combining marks; encoding to ASCII with 'ignore' then
    drops the marks (and any other non-ASCII character).
    """
    # Only byte strings need decoding; calling .decode() on text either fails
    # outright or triggers an implicit ASCII encode that raises on accents.
    if isinstance(txt, bytes):
        txt = txt.decode(codif)
    return normalize('NFKD', txt).encode('ASCII', 'ignore')

def import_machado_words():
    """Return a FreqDist of the words in NLTK's machado corpus.

    Every token of every file in the corpus is lower-cased and counted,
    except tokens found in the module-level ``stopwords`` collection.
    """
    ignored = set(stopwords)
    corpus = nltk.corpus.machado
    # Lower-case BEFORE the stopword test so capitalised stopwords are also
    # dropped, and count the token stream itself: rebuilding a FreqDist from
    # another FreqDist's keys would collapse every count to one.
    tokens = (
        token.lower()
        for fileid in corpus.fileids()
        for token in corpus.words(fileid)
    )
    return nltk.FreqDist(token for token in tokens if token not in ignored)

def import_macmorpho_words():
    """Return a FreqDist of the words in NLTK's mac_morpho corpus.

    Tokens are lower-cased and counted, except those found in the
    module-level ``stopwords`` collection.
    """
    ignored = set(stopwords)
    # Lower-case before filtering so capitalised stopwords are dropped too.
    lowered = (w.lower() for w in nltk.corpus.mac_morpho.words())
    return nltk.FreqDist(w for w in lowered if w not in ignored)

def import_floresta_words():
    """Return a FreqDist of the words in NLTK's floresta corpus.

    Tokens are lower-cased and counted, except those found in the
    module-level ``stopwords`` collection.
    """
    ignored = set(stopwords)
    # Lower-case before filtering so capitalised stopwords are dropped too.
    lowered = (w.lower() for w in nltk.corpus.floresta.words())
    return nltk.FreqDist(w for w in lowered if w not in ignored)

def generate_word_list(filename):
    """Read a UTF-8 word list (one entry per line) and return its words as a set.

    filename -- path of the file to read.

    For each line, anything from the first '/' onwards is discarded
    (presumably Hunspell-style affix flags -- TODO confirm against the .dic
    file), the remainder is stripped and lower-cased. Lines containing a
    hyphen and lines that end up empty are skipped.
    """
    palavras = set()
    with codecs.open(filename, 'r', 'utf-8') as dic:
        for line in dic:
            # Hyphenated entries are skipped entirely.
            if '-' in line:
                continue
            word = line.partition('/')[0].strip().lower()
            if word:
                palavras.add(word)
    return palavras

def generate_text(url):
    """Download *url* and return a FreqDist of the words in its <body>.

    url -- address of the HTML page to fetch.

    The body text is lower-cased and split on whitespace; each token is
    cleaned with ``remover_non_alpha`` and then counted unless it is empty
    or listed in the module-level ``stopwords`` collection.

    This is a best-effort scraper: a non-200 response, a page without a
    <body>, or any error while fetching/parsing yields an EMPTY FreqDist
    (errors are logged), so callers can always ``+=`` the result.
    """
    try:
        r = requests.get(url)
        if r.status_code != 200:
            return nltk.FreqDist()
        body = BeautifulSoup(r.text, 'lxml').find('body')
        if body is None:
            return nltk.FreqDist()
        ignored = set(stopwords)
        # Clean first, filter second: otherwise "de," passes the stopword
        # test and is only afterwards reduced to the stopword "de".
        cleaned = (remover_non_alpha(token)
                   for token in body.get_text().lower().split())
        return nltk.FreqDist(w for w in cleaned if w and w not in ignored)
    except Exception:
        logging.exception("Erro ao carregar url %s ", url)
        return nltk.FreqDist()

# Import all words from these 2 files
dicionario |= generate_word_list('pt-BR.dic')
dicionario |= generate_word_list('palavras.txt')
dicionario = set(w.lower() for w in dicionario if w not in stopwords)

# Word frequencies from the NLTK Portuguese corpora
palavras += import_machado_words()
palavras += import_macmorpho_words()
palavras += import_floresta_words()

# Word frequencies from scraped pages.
# NOTE(review): every URL literal below is empty in this copy of the script;
# the real addresses have to be restored before this section does anything.
palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')

palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')
palavras += generate_text('')

# NOTE(review): the URL template lost its text, including the %s placeholder
# for the page number; with an empty template the % formatting raises
# TypeError. Restore the template before running.
for i in range(0, 100):
    palavras += generate_text('' % (str(i)))

def run(dicionario, palavras):
    """Yield a one-item dict for each dictionary word seen in the frequency table.

    dicionario -- iterable of candidate words (text).
    palavras   -- mapping from word to its frequency.

    Words missing from *palavras* or shorter than 3 characters are skipped.
    Each yielded dict maps the accent-free form of the word to the tuple
    ``(word, frequency)``.
    """
    for termo in dicionario:
        if termo not in palavras or len(termo) <= 2:
            continue
        sem_acento = remover_acentos(termo.encode('utf-8'))
        frequencia = palavras[termo]
        yield {sem_acento: (termo, frequencia)}

# Generate a list with all words that appears in the FreqDist set
new_dict = list(run(dicionario, palavras))


def _sort_key(entry):
    """Ranking key for a single-item ``{ascii_word: (word, freq)}`` entry."""
    ascii_word, (_, freq) = next(iter(entry.items()))
    # Shortest ASCII form first, then highest frequency, then alphabetical.
    return (len(ascii_word), -freq, ascii_word)


# One keyed sort gives the same order as the former three stable passes
# (alphabetical, then frequency descending, then length ASCENDING).
new_dict.sort(key=_sort_key)

def varre_final_dict(key, final_dic):
    """Return True when *key* is already a term in *final_dic*, else False."""
    # Direct membership is a hash lookup; going through .keys() built and
    # scanned a list on Python 2.
    return key in final_dic

def get_valid_glossary(key, final_dic):
    """Return the glossary term for *key*: its first four characters.

    Keys of four characters or fewer are returned whole. *final_dic* is
    accepted but not consulted.
    """
    return key[:4]

final_dic = dict()
# For each word in the dict
for w in new_dict[0:2000]:
    # Each entry is a single-item dict: {ascii_word: (word, freq)}
    palavra, freq = next(iter(w.values()))
    # Get the accent-free word and derive its term from it
    key = remover_acentos(palavra.encode('utf-8'))
    key = get_valid_glossary(key, final_dic)
    # Words that collapse to the same term are grouped together under it,
    # in the order they are met
    final_dic.setdefault(key, []).append({'palavra': palavra, 'freq': freq})

# Order the terms by the frequency of the first word stored under each one;
# look 'freq' up by name, since the position of a dict value is not a
# reliable way to find it
final_dic = sorted(final_dic.items(), key=lambda x: x[1][0]['freq'], reverse=True)
# Generate a json file with all terms and words!
# NOTE(review): the body of this `with` was missing from the script; dumping
# final_dic with the already-imported json module is the presumed intent.
with open('palavras.json', 'wb') as js:
    json.dump(final_dic, js)

In the code above I'm not doing the complex part mentioned earlier; I'm just returning the first 4 letters of a word, and that's it: a new term is created.
In the next few weeks I'll improve the script, making it class-based or turning it into a ready-to-use app on Heroku.

