#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
import linguistic
from Nb_noy_voc import Nb_noy_voc


def words_weights(texts):
    """Compute a weight for each word found across `texts`.

    A word's weight grows with its frequency in a document and shrinks with
    its count in the noun vocabulary (`Nb_noy_voc`); the weight is averaged
    and decayed as more documents are processed.
    """
    weights = {}
    for i, text in enumerate(texts):
        counts, total = linguistic.count(text, linguistic.word_re_no_numbers)
        # Decrease the weight of words that are in the db but not in the
        # current document.
        for word in set(weights).difference(set(counts)):
            weights[word] /= Nb_noy_voc(word) + 1
        for word, count in counts.items():
            a = Nb_noy_voc(word)
            # Ignore single-letter words and words unknown to the vocabulary.
            if len(word) == 1 or a == 0:
                continue
            weight = count / ((a + 1) ** 2 * total) * 10000
            if word in weights:
                # The word is already in the db; average both weights.
                weights[word] = (weights[word] + weight) / 2
            else:
                # Take into account the number of texts already analyzed.
                weights[word] = weight / ((i + 1) ** (1 / 1.5))
    return weights
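
# Illustrative usage (hypothetical corpus; the resulting values depend entirely
# on the external `linguistic` tokeniser and the `Nb_noy_voc` database):
#
#   weights = words_weights(["the cat sat on the mat", "the dog chased the cat"])
#   top = sorted(weights, key=weights.get, reverse=True)  # highest weights first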


def find_stop_words(texts):
    """Detect likely stop words in `texts`.

    A word is flagged when its weight is far larger than its neighbour's
    while it is almost absent from the noun vocabulary.  Returns a dict
    mapping each detected word to the number of times it was flagged.
    """
    ret = {}
    weights = words_weights(texts)
    for text in texts:
        for w1, w2 in linguistic.iter_word_groups(text, linguistic.word_re_no_numbers, 2, 2):
            w1, w2 = w1.lower(), w2.lower()
            try:
                c1, c2 = weights[w1], weights[w2]
            except KeyError:
                # One of the words was filtered out by words_weights.
                continue
            if c1 > 1 and c1 > c2 * 1000 and Nb_noy_voc(w1) < 3:
                ret[w1] = ret.get(w1, 0) + 1
            # Symmetric check for the second word of the pair.
            if c2 > 1 and c2 > c1 * 1000 and Nb_noy_voc(w2) < 3:
                ret[w2] = ret.get(w2, 0) + 1
    return ret
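

# A minimal command-line sketch, assuming each argument is the path of a UTF-8
# text file to be treated as one document; it prints the detected stop words,
# most frequently flagged first.
if __name__ == "__main__":
    import sys

    documents = []
    for path in sys.argv[1:]:
        with open(path, encoding="utf-8") as handle:
            documents.append(handle.read())
    for word, hits in sorted(find_stop_words(documents).items(),
                             key=lambda item: item[1], reverse=True):
        print(word, hits)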