import tweepy
import csv
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
import en_core_web_sm
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from string import punctuation
import collections
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
df.head(2)
Text classification
- Cleaning the tweets: lemmatization, tokenization, removal of stop words, punctuation, hashtags and mentions
# remove the hashtags, mentions and unwanted characters from the tweet texts
def clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(
        lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
    return df
clean_tweets = clean_text(df, 'content')
clean_tweets.head(2)
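A quick sanity check on a synthetic tweet (hypothetical data, not from the dataset) shows what the regex strips:

demo = pd.DataFrame({'content': ["RT @user1: GDP grew 5%! https://t.co/abc"]})
clean_text(demo, 'content').content[0]
# -> ' gdp grew 5 ' (roughly: the leading "rt", the mention, the URL and the punctuation are removed)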
# remove stop words and punctuation, lemmatize and tokenize the words
# nlp = spacy.load("en_core_web_sm") #working with english only, no support for swahili
nlp = en_core_web_sm.load()
tokenizer = RegexpTokenizer(r'\w+')  # defined but not used below; furnished() tokenizes with w_tokenizer
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation) #already taken care of with the cleaning function.
stop.update(punctuation)
w_tokenizer = WhitespaceTokenizer()
def furnished(text):
    final_text = []
    # an alternative: for i in text.split():
    for i in w_tokenizer.tokenize(text):
        if i.lower() not in stop:
            word = lemmatizer.lemmatize(i)
            final_text.append(word.lower())
    return " ".join(final_text)
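Before applying it to the whole column, a quick check on a hypothetical sentence: stop words drop out and plurals are lemmatized back to singular:

furnished("The cats are chasing three dogs")
# -> 'cat chasing three dog' (roughly; 'the' and 'are' are stop words)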
df.content = df.content.apply(furnished)
- Defining the sets of words
economy_related_words = '''agriculture infrastructure capitalism trading service sector technology economical supply
industrialism efficiency frugality retrenchment downsizing credit debit value
economize save economically
economies sluggish rise rising spending conserve trend
low-management decline industry impact poor
profession surplus fall
declining accelerating interest sectors balance stability productivity increase rates
pushing expanding stabilize rate industrial borrowing struggling
deficit predicted increasing data
economizer analysts investment market-based economy debt free enterprise
medium exchange metric savepoint scarcity capital bank company stockholder fund business
asset treasury tourism incomes contraction employment jobs upturn deflation macroeconomics
bankruptcies exporters hyperinflation dollar entrepreneurship upswing marketplace commerce devaluation
quicksave deindustrialization stockmarket reflation downspin dollarization withholder bankroll venture capital
mutual fund plan economy mortgage lender unemployment rate credit crunch central bank financial institution
bank rate custom duties mass-production black-market developing-countries developing economic-growth gdp trade barter
distribution downturn economist'''
social_related_words = '''sociable, gregarious societal friendly society socialization political sociality
interpersonal ethnic socially party welfare public community socialist societies development
network humans socialism collective personal corporation social constructivism
relations volition citizenship brute attitude rights socio
socioeconomic ethics civic communal marital sociale socialized communities
policy unions
institutions values governmental organizations jamboree
festivity fairness support care
sides activism unsocial psychosocial
socializing psychological distributional demographic participation reunion
partygoer partyism festive power network gala housewarming celebration counterparty social-war
particularist interactional ideational asocial'''
culture_related_words = ''' ethnicity heritage modernity spirituality marxism material culture
ethos nationality humanism romanticism civilisation traditionalism genetics
kinship heredity marriage indigenous archeology acculturate
ontogenesis viniculture modern clothes rooted
cicero societies history roots influence geography historical folk origins
phenomenon teleology ancient aspects perspective liberalism nowadays community style unique prevalent describes
today origin modernity beliefs genre barbarian ethnic
colonization cultural universal organization western-civilization structuralism culture
heathen pagan transculturation culture peasant classicist nativism anarchy ungrown philosophic cult
consciousness islamist bro-culture evolve cultic diaspora aftergrowth native cultural-relativism
mongolian cosmopolitan epistemology lifestyles diversity chauvinism westernization materialism vernacular
homogeneity otherness holism tusculanae disputationes primitivism superficiality hedonism discourse
puritanism modernism intellectualism exclusiveness elitism colonialism
pentecostalism paganism nationwide expansion rural auxesis kimono
culturize alethophobia nettlebed japanification dongyi clannishness insularity hybridity
westernisation foreignness worldview exclusionism enculturation ethnocentrism confucianist vulgarization
shintoism westernism denominationalism deracination
eurocentrism cosmologies emotiveness bohemianism territorialism
philosophical-doctrine ethnic minority social-darwinism theory cultural evolution belief system folk music
traditional art house karl-marx theory media
film-theory art history museum studies cultural artifact'''
health_related_words = '''disease obesity world health organization medicine nutrition well-being exercise welfare wellness health care public health
nursing stress safety hygiene research social healthy condition aids epidemiology healthiness wellbeing
care illness medical diet education infectious disease environmental healthcare physical fitness hospitals
health care provider doctors healthy community design insurance sanitation human body patient mental health
medicare agriculture health science fitness health policy weight loss physical therapy psychology pharmacy
metabolic organism human lifestyle status unhealthy upbeat vaccination sleep condom alcohol smoking water family
eudaimonia eudaemonia air house prevention genetics public families poor needs treatment communicable disease
study protection malaria development food priority management healthful mental provide department administration
programs help assistance funding environment improving emergency need program affected schools private mental illness
treat diseases preparedness perinatal fertility sickness veterinary sanitary pharmacists behavioral midwives
gerontology infertility hospitalization midwifery cholesterol childcare pediatrician pediatrics medicaid asthma
pensions sicknesses push-up physical education body-mass-index eat well gymnastic apparatus tune up good morning
bathing low blood-pressure heart attack health club ride-bike you feel good eczema urticaria dermatitis sunburn overwork
manufacturing medical sociology need exercise run'''
- Preprocessing the sets: tokenizing them and removing stop words
nlp = en_core_web_sm.load()
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)
w_tokenizer = WhitespaceTokenizer()

# clean the sets of words
def furnished(text):
    final_text = []
    for i in text.split():
        if i.lower() not in stop:
            word = lemmatizer.lemmatize(i)
            final_text.append(word.lower())
    return " ".join(final_text)
economy = furnished(economy_related_words)
social = furnished(social_related_words)
culture = furnished(culture_related_words)
health = furnished(health_related_words)
Removing duplicates
# delete duplicates
string1 = economy
words = string1.split()
economy = " ".join(sorted(set(words), key=words.index))
string1 = social
words = string1.split()
social = " ".join(sorted(set(words), key=words.index))
string1 = culture
words = string1.split()
culture = " ".join(sorted(set(words), key=words.index))
string1 = health
words = string1.split()
health = " ".join(sorted(set(words), key=words.index))
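The four identical blocks above could be folded into one helper; a minimal sketch (the hypothetical dedupe below uses dict.fromkeys, which keeps first-occurrence order just like the sorted/set idiom):

def dedupe(text):
    # keep each word once, in order of first occurrence
    return " ".join(dict.fromkeys(text.split()))

# economy = dedupe(economy); social = dedupe(social); and so on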
- Vectorization and standardization. Needed if cosine similarity is going to be used
'''Vectorizing the sets of words with TF-IDF so that rare words are weighted
appropriately. Standardization matters because TF-IDF favours long documents,
and the tweets differ considerably in length from the sets of words; note that
TfidfVectorizer already L2-normalizes each row by default.'''
def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = TfidfVectorizer()
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()
social
'sociable, gregarious societal friendly society socialization political sociality interpersonal ethnic socially party welfare public community socialist development network human socialism collective personal corporation social constructivism relation volition citizenship brute attitude right socio socioeconomic ethic civic communal marital sociale socialized policy union institution value governmental organization jamboree festivity fairness support care side activism unsocial psychosocial socializing psychological distributional demographic participation reunion partygoer partyism festive power gala housewarming celebration counterparty social-war particularist interactional ideational asocial'
socialvector = get_vectors(social)
economic_vector = get_vectors(economy)
culture_vector = get_vectors(culture)
health_vector = get_vectors(health)
# note: these set vectors are not used further below; the analysis proceeds with Jaccard scores
## Vectorizing the tweets
tv = TfidfVectorizer()
tfidf_tweets = tv.fit_transform(df.content)
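cosine_similarity is imported at the top but never used; for completeness, a minimal sketch of the cosine route, assuming tv and tfidf_tweets from the cell above. The furnished sets must be transformed with the same fitted vectorizer so that all vectors share one feature space:

group_vecs = tv.transform([economy, social, culture, health])
cos_scores = cosine_similarity(tfidf_tweets, group_vecs)  # shape: (n_tweets, 4)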
'''Jaccard similarity suits cases where duplication does not matter; cosine
similarity suits cases where it does. For two product descriptions, Jaccard
similarity is the better choice, since repeating a word should not reduce
their similarity.'''
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection) / len(union)

def get_scores(group, tweets):
    # note: the tweets arrive as raw strings, so set() builds sets of
    # characters here; pass pre-split token lists for word-level Jaccard
    scores = []
    for tweet in tweets:
        s = jaccard_similarity(group, tweet)
        scores.append(s)
    return scores
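A quick word-level illustration on hypothetical strings: the repeated "dog" does not change the score, which is exactly the duplication-insensitivity described above:

jaccard_similarity('dog lion a dog'.split(), 'dog is cat'.split())
# intersection = {'dog'}, union = {'dog', 'lion', 'a', 'is', 'cat'} -> 1/5 = 0.2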
# economic scores
e_scores = get_scores(economy, df.content.to_list())
e_scores[-10:]
[0.7333333333333333, 0.7666666666666667, 0.7333333333333333, 0.6571428571428571, 0.7, 0.7333333333333333, 0.7666666666666667, 0.7333333333333333, 0.7666666666666667, 0.7333333333333333]
s_scores = get_scores(social, df.content.to_list())
s_scores[-10:]
[0.7, 0.7931034482758621, 0.7586206896551724, 0.6764705882352942, 0.7241379310344828, 0.7586206896551724, 0.7333333333333333, 0.7586206896551724, 0.7333333333333333, 0.7586206896551724]
c_scores = get_scores(culture, df.content.to_list())
c_scores[-10:]
[0.7333333333333333, 0.7666666666666667, 0.7333333333333333, 0.6571428571428571, 0.7, 0.7333333333333333, 0.7666666666666667, 0.7333333333333333, 0.7666666666666667, 0.7333333333333333]
h_scores = get_scores(health, df.content.to_list())
h_scores[:6]
[0.8076923076923077, 0.6153846153846154, 0.8076923076923077, 0.696969696969697, 0.6857142857142857, 0.7857142857142857]
'''A new dataframe with the usernames and the Jaccard scores for each group'''
data = {'names': df.username.to_list(),
        'economic_score': e_scores,
        'social_score': s_scores,
        'culture_score': c_scores,
        'health_scores': h_scores}
scores_df = pd.DataFrame(data)
scores_df.head(20)
'''Assigning classes to the tweets: a tweet gets a 1 in every category whose
score equals its maximum score'''
def get_clusters(l1, l2, l3, l4):
    econ = []
    socio = []
    cul = []
    heal = []
    for i, j, k, l in zip(l1, l2, l3, l4):
        m = max(i, j, k, l)
        econ.append(1 if m == i else 0)
        socio.append(1 if m == j else 0)
        cul.append(1 if m == k else 0)
        heal.append(1 if m == l else 0)
    return econ, socio, cul, heal
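With hypothetical scores, a tie on the maximum puts a tweet into more than one category, which is what the sharing step below corrects:

get_clusters([0.5], [0.7], [0.2], [0.7])
# -> ([0], [1], [0], [1]): the tweet lands in both social and health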
l1 = scores_df.economic_score.to_list()
l2 = scores_df.social_score.to_list()
l3 = scores_df.culture_score.to_list()
l4 = scores_df.health_scores.to_list()
econ, socio, cul, heal = get_clusters(l1, l2, l3, l4)
data = {'name': scores_df.names.to_list(),
        'economic': econ,
        'social': socio,
        'culture': cul,
        'health': heal}
cluster_df = pd.DataFrame(data)
cluster_df.head(5)
'''Because the economic, social and health vocabularies overlap, some tweets tie
on the maximum score and receive several categories. Rows whose category sum is
greater than 1 therefore share one unit equally among their categories; the
clustering step later handles the resulting fractions.'''
cols = ['economic', 'social', 'culture', 'health']
a = cluster_df[cols].sum(axis=1) > 1
c = cluster_df[cols].sum(axis=1)
# divide only the categories that are set: a [1, 1, 0, 1] row becomes
# [1/3, 1/3, 0, 1/3] (a plain `= 1/c` would overwrite the zeros as well)
cluster_df.loc[a, cols] = cluster_df.loc[a, cols].div(c[a], axis=0)
pivot_clusters = cluster_df.groupby(['name']).sum()
# note: astype(int) truncates any fractional shares left after summing
pivot_clusters['economic'] = pivot_clusters['economic'].astype(int)
pivot_clusters['social'] = pivot_clusters['social'].astype(int)
pivot_clusters['culture'] = pivot_clusters['culture'].astype(int)
pivot_clusters['health'] = pivot_clusters['health'].astype(int)
pivot_clusters['total'] = (pivot_clusters['economic'] + pivot_clusters['social']
                           + pivot_clusters['culture'] + pivot_clusters['health'])
pivot_clusters.loc["Total"] = pivot_clusters.sum()  # add a totals row
print(pivot_clusters.shape)
pivot_clusters.tail()
(705, 5)
'''A pie chart to show the total number of tweets in each category'''
fig = plt.figure(figsize =(10, 7))
a = pivot_clusters.drop(['total'], axis = 1)
plt.pie(a.loc['Total'], labels = a.columns)
plt.title('A pie chart showing the volumes of tweets under different categories.')
plt.show()
d = pivot_clusters.sort_values(by = 'total', ascending = False)
e = d.head(12)
e.drop(e.head(2).index, inplace=True)  # drop the "Total" row and the largest account
plt.figure(figsize=(12,10))
sns.barplot(x = e.index, y = e.total)
plt.title('A bar plot showing top tweeps based on volume of tweets')
plt.xticks(rotation=45)
plt.xlabel('screen names')
plt.ylabel('total tweets')
d = pivot_clusters.sort_values(by = 'economic', ascending = False)
e = d.head(11)
e.drop(e.head(1).index, inplace=True)  # drop the "Total" row
plt.figure(figsize=(12,10))
sns.barplot(x = e.index, y = e.economic)
plt.title('A bar plot showing top tweeps based on volume of economy tweets')
plt.xticks(rotation=45)
plt.xlabel('screen names')
plt.ylabel('economy tweets')
''' Users with most social tweets'''
d = pivot_clusters.sort_values(by = 'social', ascending = False)
e = d.head(12)
e.drop(e.head(2).index, inplace=True)  # drop the "Total" row and the largest account
plt.figure(figsize=(12,10))
sns.barplot(x = e.index, y = e.social)
plt.title('A bar plot showing top tweeps based on volume of social tweets')
plt.xticks(rotation=45)
plt.xlabel('screen names')
plt.ylabel('social tweets')
d = pivot_clusters.sort_values(by = 'culture', ascending = False)
e = d.head(11)
e.drop(e.head(1).index, inplace=True)  # drop the "Total" row
plt.figure(figsize=(12,10))
sns.barplot(x = e.index, y = e.culture)
plt.title('A bar plot showing top tweeps based on volume of culture tweets')
plt.xticks(rotation=45)
plt.xlabel('screen names')
plt.ylabel('culture tweets')
Clustering
from sklearn.cluster import KMeans
# X = pivot_clusters.iloc[:, [0,1]].values
X = pivot_clusters[['economic', 'social', 'culture', 'health']].values
# Elbow Method
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=300, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('wcss')
plt.show()
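The elbow plot is one heuristic; as an optional cross-check (not part of the original notebook), silhouette scores can be computed over the same range of k, where higher is better:

from sklearn.metrics import silhouette_score

for k in range(2, 11):
    km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300, random_state=0)
    labels = km.fit_predict(X)
    print(k, round(silhouette_score(X, labels), 3))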
# fitting k-means to the dataset with the k suggested by the elbow
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300, random_state=0)
Y_kmeans = kmeans.fit_predict(X)
# Visualising the clusters (only the economic and social dimensions of the 4-D feature space are plotted)
plt.scatter(X[Y_kmeans==0, 0], X[Y_kmeans==0, 1], s=100, c='violet', label= 'Cluster 1')
plt.scatter(X[Y_kmeans==1, 0], X[Y_kmeans==1, 1], s=100, c='cyan', label= 'Cluster 2')
plt.scatter(X[Y_kmeans==2, 0], X[Y_kmeans==2, 1], s=100, c='green', label= 'Cluster 3')
plt.scatter(X[Y_kmeans==3, 0], X[Y_kmeans==3, 1], s=100, c='blue', label= 'Cluster 4')
# plt.scatter(X[Y_kmeans==4, 0], X[Y_kmeans==4, 1], s=100, c='magenta', label= 'Cluster 5')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='black', label='Centroids' )
plt.title('Clusters of tweets in economic, culture, health and social groups')
plt.xlabel('economic tweets')
plt.ylabel('social tweets')
plt.legend()
plt.show()
That's all for this article. Some links:
1. https://github.com/pksohn/tweet-clustering
2. https://github.com/ada-k/TweetsClassification