from collections import Counter

# Path to the plain-text book whose words are counted below.
filename = "../Gutenberg/Books_EngFr/English/shakespeare/A Midsummer Night's Dream.txt"

with open(filename,'r') as current_file:
    text = current_file.read()

text = text.lower()

# Get rid of punctuation in a single pass over the text.
punctuation = [',','.','"',"'",";",':']
text = text.translate(str.maketrans('', '', ''.join(punctuation)))

# split() with no argument splits on any whitespace, newlines included.
# Newlines must NOT be deleted beforehand: that would glue the last word of
# each line to the first word of the next one and corrupt the counts.
text = text.split()

# Map each distinct word to its number of occurrences (kept as a plain dict,
# in order of first appearance).
text_word_count_dict = dict(Counter(text))

# The number printed is the count of distinct words.
print("There are",len(text_word_count_dict),"  words")

# Output: There are 4344   words

import pandas as pd

# Build a one-column table of counts, indexed by word.
text_word_count_df = pd.DataFrame.from_dict(
    text_word_count_dict,
    orient="index",
    columns=["word_count"],
)

# Print the reference documentation of the constructor used above.
help(pd.DataFrame.from_dict)

# First ten rows, in the dictionary's insertion order.
text_word_count_df.head(10)

# Order the rows from the highest count to the lowest.
text_word_count_df_sorted = text_word_count_df.sort_values(
    by="word_count",
    ascending=False,
)

# Ten rows with the highest counts.
text_word_count_df_sorted.head(10)

def count_words_in_book(filename, encoding=None):
    """
    Count how often each word occurs in a plain-text book.

    The text is lower-cased, the characters , . " ' ; : are deleted, and the
    result is split on whitespace; every resulting token counts as a word.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with a single ``word_count`` column, sorted by
        count in descending order.
    """
    # Local import so the function does not rely on a file-level import.
    from collections import Counter

    with open(filename, 'r', encoding=encoding) as current_file:
        text = current_file.read()

    text = text.lower()

    # Get rid of punctuation in a single pass over the text.
    punctuation = [',', '.', '"', "'", ";", ':']
    text = text.translate(str.maketrans('', '', ''.join(punctuation)))

    # split() with no argument splits on any whitespace, newlines included.
    # Newlines must NOT be deleted beforehand: that would glue the last word
    # of each line to the first word of the next one and corrupt the counts.
    words = text.split()

    # Plain dict of word -> occurrences, in order of first appearance.
    text_word_count_dict = dict(Counter(words))

    text_word_count_df = pd.DataFrame.from_dict(
        text_word_count_dict, orient='index', columns=['word_count']
    )
    text_word_count_df_sorted = text_word_count_df.sort_values(
        by=['word_count'], ascending=False
    )

    return text_word_count_df_sorted
    
# Path to the book to analyse.
myfile = "../Gutenberg/Books_EngFr/English/shakespeare/Romeo and Juliet.txt"

# book_df is a dataframe of words sorted by count.
book_df = count_words_in_book(myfile)

# First ten rows of the sorted table.
book_df.head(10)

# Output (first ten rows of book_df):
#       word_count
# the   834
# and   777
# to    606
# i     577
# of    527
# a     509
# in    374
# is    367
# my    360
# that  355