In [5]:
filename = "../Gutenberg/Books_EngFr/English/shakespeare/A Midsummer Night's Dream.txt"

with open(filename,'r') as current_file:
    text = current_file.read()
    
text = text.lower()

#get ride of punctuations
punctuation = [',','.','"',"'",";",':']
In [6]:
for punc in punctuation:
    text = text.replace(punc,"")
text = text.replace("\n","") #replace newlines
text = text.split() #split long string into words
In [7]:
text_word_count_dict = {}
for word in text:
    if word in text_word_count_dict:
        text_word_count_dict[word] +=1
    else:
        text_word_count_dict[word] = 1
In [8]:
print("There are",len(text_word_count_dict),"  words")
There are 4344   words
In [9]:
import pandas as pd
In [16]:
text_word_count_df = pd.DataFrame.from_dict(text_word_count_dict,orient = 'index',columns = ['word_count'])
In [ ]:
help(pd.DataFrame.from_dict)
In [ ]:
text_word_count_df.iloc[0:10]
In [20]:
text_word_count_df_sorted = text_word_count_df.sort_values(by = ['word_count'],ascending=False)
In [ ]:
text_word_count_df_sorted.iloc[0:10]
In [23]:
def count_words_in_book(filename):
    """
    
    """
    with open(filename,'r') as current_file:
        text = current_file.read()
    
    text = text.lower()

    #get ride of punctuations
    punctuation = [',','.','"',"'",";",':']
    for punc in punctuation:
        text = text.replace(punc,"")
    text = text.replace("\n","") #replace newlines
    text = text.split() #split long string into words
    
    text_word_count_dict = {}
    for word in text:
        if word in text_word_count_dict:
            text_word_count_dict[word] +=1
        else:
            text_word_count_dict[word] = 1
            
    text_word_count_df = pd.DataFrame.from_dict(text_word_count_dict,orient = 'index',columns = ['word_count'])
    text_word_count_df_sorted = text_word_count_df.sort_values(by = ['word_count'],ascending=False)
    
    return(text_word_count_df_sorted)
    
myfile = "../Gutenberg/Books_EngFr/English/shakespeare/Romeo and Juliet.txt"
   #path to file
book_df = count_words_in_book(myfile) #book_df is a dataframe of words sorted by count
book_df.iloc[0:10]
Out[23]:
word_count
the 834
and 777
to 606
i 577
of 527
a 509
in 374
is 367
my 360
that 355
In [ ]: