In [20]:
#code for WP posting
<code><script>
var myUrl = 'http://analytics.business/?p=153&preview=true';
 
if(window.top.location.href !== myUrl) {
    window.top.location.href = myUrl;
}
</script></code>

#Analyze article clean, keywords, count, lexical dispersion & diversity April 11, 2017 Christa Taylor 
#ref article http://www.cbc.ca/news/technology/trump-climate-change-executive-order-1.4043650
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('genesis')
nltk.download('wordnet')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from nltk.collocations import *   #nltk.download()
import numpy
import pandas as pd
import matplotlib
import sys  #reload(sys) #sys.setdefaultencoding("utf-8")
import os
import pickle
#ref article http://www.cbc.ca/news/technology/trump-climate-change-executive-order-1.4043650

os.chdir('d:/text/')  #folder
with open('cbcnewstrump.txt', 'r') as f:   #filename
 sample = f.read()

sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

def extract_entity_names(t):
    entity_names = []

    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names

entity_names = []
for tree in chunked_sentences:
    # Print results per sentence
    # print extract_entity_names(tree)

    entity_names.extend(extract_entity_names(tree))

# Print unique entity names
#print set(entity_names)
set(['climate change', 'Trump', 'American', 'power plant', 'wind power industry', 'Environmental', 'coal miner', 'jobs', 'Environmental Protection Agency', 'America', 'Clean Power Plan', 'order'])
##Sentiment
##Using vader: http://www.nltk.org/_modules/nltk/sentiment/vader.html
sent = pd.DataFrame(index = range(0, 27),columns=["full_sentence","compound","negative","neutral","positive"])

sid = SentimentIntensityAnalyzer()
i=0
for sentence in sentences:
 sent["full_sentence"][i] = sentence
ss = sid.polarity_scores(sentence)
sent["compound"][i] = ss['compound']
sent["negative"][i] = ss['neg']
sent["positive"][i] = ss['pos']
sent["neutral"][i] =ss['neu']
i = i + 1

#print sent
print ("Mean Neutral Score:", sent["neutral"].mean() )
print ("Mean Positive Score:" , sent["positive"].mean())
print ("Mean Negative Score:" , sent["negative"].mean())
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Mean Neutral Score: 1.0
Mean Positive Score: 0.0
Mean Negative Score: 0.0
In [18]:
#NLTK text ref: http://www.nltk.org/book/ch01.html -- Chapter 1
from nltk.corpus import stopwords

sample = sample.lower() #lowercase everything
tokens = nltk.word_tokenize(sample) # tokenize it
mytext = nltk.Text(tokens) # turn text into a NLTK Text object
words = [w.lower() for w in mytext if w.isalpha()] #remove punctuation
filtered_words = [word for word in words if word not in stopwords.words('english')] #get rid of stop words

mytext2 = nltk.Text(filtered_words)

print (mytext2.dispersion_plot(["climate", "energy", "order", "coal", "trump"]))

#Get parts of speech
#Meanings: http://nishutayaltech.blogspot.in/2015/02/penn-treebank-pos-tags-in-natural.html
pos = nltk.pos_tag(mytext2)
pos_freq = nltk.FreqDist(tag for (word, tag) in pos)
pos_freq.most_common()

##In order:There are singular nouns, then adjectives, then plural nouns, 
# verb present participle, verb past tense, adverbs, Verbs non-3rd person singular present,
# verbs, verbs 3rd person singular present, verb past participle, cardinal number, preposition, modal and superlative

#Determine how many tokens there are
print ("Length of the Full Text:" , len(words))

#Determine how many tokens there are in filtered text
print ("How many tokens are in the Filtered Text:" , len(filtered_words))

#calculate a measure of the lexical richness of the text
from __future__ import division
None
Length of the Full Text: 977
How many tokens are in the Filtered Text: 636
In [12]:
def lexical_diversity(text):
 return len(set(text)) / len(text)

def percentage(count, total):
 return 100 * count / total

print ("This text has the following lexical diversity:" , lexical_diversity(mytext), "and Trump consitutes ", percentage(mytext.count("trump"), len(mytext)) , "% of the text")

#frequency distributions
fdist1 = nltk.FreqDist(filtered_words)
fdist1.most_common(25)
fdist1.plot(25, cumulative=True)

for word, frequency in fdist1.most_common(150):
 print(u'{};{}'.format(word, frequency))

fdist1.plot(25, cumulative=False)
This text has the following lexical diversity: 0.5204460966542751 and Trump consitutes  1.3011152416356877 % of the text
order;10
trump;7
climate;6
power;6
energy;6
said;6
clean;5
change;5
environmental;5
jobs;4
executive;4
emissions;3
paris;3
american;3
states;3
president;3
coal;3
undo;2
former;2
groups;2
plan;2
regulation;2
accord;2
oil;2
united;2
carbon;2
administration;2
green;2
per;2
obama;2
activist;2
rules;2
cent;2
broader;2
industries;2
rose;2
regulations;2
global;2
group;2
done;1
permitting;1
reached;1
intrusion;1
made;1
scorn;1
leasing;1
stage;1
repeatedly;1
generate;1
steps;1
executives;1
reckless;1
headquarters;1
legal;1
analysts;1
campaign;1
scientific;1
mining;1
tuesday;1
one;1
donald;1
economics;1
questioned;1
plentiful;1
arguing;1
going;1
target;1
epa;1
moves;1
hurled;1
reality;1
undoes;1
lands;1
since;1
whether;1
ignores;1
main;1
independence;1
values;1
big;1
industry;1
miners;1
yet;1
called;1
actions;1
fight;1
year;1
review;1
introduced;1
methane;1
drilling;1
renewable;1
helping;1
drillers;1
address;1
effect;1
policy;1
trip;1
christiana;1
lamented;1
presidential;1
supporting;1
goes;1
barack;1
curb;1
health;1
broker;1
endanger;1
decisions;1
technologies;1
trying;1
boldest;1
back;1
trend;1
sector;1
element;1
rolling;1
secretary;1
prosperity;1
assault;1
create;1
federal;1
safety;1
wind;1
restrictions;1
infrastructure;1
nextgen;1
meet;1
van;1
direct;1
reducing;1
last;1
study;1
countries;1
cancel;1
nearly;1
earthjustice;1
formal;1
remain;1
court;1
process;1
speaking;1
make;1
historic;1
commitments;1
decree;1
continues;1
reduces;1
critical;1
tom;1
In [16]:
##Look at bigrams & collocations
##http://www.nltk.org/howto/collocations.html
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
b = BigramCollocationFinder.from_words(mytext2)
b.apply_freq_filter(2)
b.nbest(bigram_measures.pmi, 20)

stemmer = nltk.stem.snowball.SnowballStemmer('english')
EC = pd.DataFrame(filtered_words,columns=["term"])
stems = [stemmer.stem(word) for word in filtered_words]
EC['stem'] = stems

print (EC.groupby(['stem']).agg(['count']))
            term
           count
stem            
accord         6
action         2
activist       2
address        1
administr      5
agenc          6
ahead          1
aim            2
air            3
almost         1
also           3
american       6
analyst        1
arab           1
archiv         1
argu           2
ask            2
assault        2
associ         2
back           1
ban            2
barack         1
barrel         1
becker         1
believ         1
big            1
bill           1
billionair     1
boldest        1
boom           1
...          ...
total          1
toward         1
track          1
transit        1
trend          1
tri            1
trigger        1
trip           1
trump         20
tuesday        1
tyler          1
u              1
undo           3
unit           4
use            1
valu           2
van            1
violent        1
vow            1
washington     2
websit         1
weigh          1
weight         2
whether        1
white          2
wind           1
would          1
year           1
yet            1
zink           1

[339 rows x 1 columns]