In [ ]:
import pandas as pd  
import numpy as np
import os  
import re
from string import digits
from gensim.models import word2vec
from gensim.models import phrases 
In [ ]:
current_directory = os.getcwd()
print("current_dictory", current_directory)
In [3]:
df = pd.read_csv('dreams.csv')  # Load a CSV file named 'dreams.csv' 

Open dreams.csv and take a quick look. We can see that the dream reports are formatted as follows:

dreams_text
0 001 Nightmare in Cambodia. In the dream we are...
1 002 The enemy is above, in the sky...
2 003 We are on a firebase. It is night time...
3 004 We are on an LZ; I am. saying ...

Therefore, we need to extract the text in the second column "dreams_text" as our corpus.

In [4]:
#Initialize the corpus
corpus = []
# Iterate through each line of text in the CSV file
for text in df['dreams_text']:
    # Ensure the text is of string type
    if not isinstance(text, str):
        text = str(text)
    corpus.append(text)
In [5]:
# Check the length of the corpus
print(f"Total documents: {len(corpus)}")
Total documents: 30799

Clean the corpus

In [6]:
# Initialize the cleaned corpus
corpus_clean = []

The following loop transforms the raw text corpus into a cleaned list of sentences, each composed of lowercase words with punctuation and numbers removed.

In [7]:
for document in corpus:
    doc = re.sub(';?:!"', '.', document)  # Replace semicolons, colons, exclamation marks, and quotation marks with dots, as they will be used to split sentences
    doc = re.sub(r'[^\w\s.]', '', doc)  # Remove all remaining punctuation marks except for dots
    translation_table = str.maketrans('', '', digits)  # Create a translation table to remove digits
    doc = doc.translate(translation_table)  # Use the translation table to remove digits from the document
    doc = doc.lower()  # Convert all letters in the document to lowercase
    doc = re.sub(r'\s+', ' ', doc)  # Remove any extra spaces
    doc = doc.split('.')  # Split the document into sentences using dots as delimiters
    doc2 = [j.strip().split(' ') for j in doc]  # Split each sentence into words and strip surrounding whitespace
    doc2 = [[w for w in sentence if w] for sentence in doc2]  # Drop empty strings produced by splitting
    corpus_clean.extend(doc2)  # Extend the corpus_clean list with the processed sentences

Thus, we have obtained a list named corpus_clean, structured as a nested list: each top-level element is a sentence, and each sentence is itself a list of individual words.

Index Value
0 ['nightmare', 'in', 'cambodia']
1 ['in', 'the', 'dream', 'we', 'are', 'being', 'overrun', 'by', 'sappers', 'who', 'have', 'got', 'past', 'the', 'night', 'defensive', 'perimeter', 'trips', 'and', 'claymores', 'and', 'now', 'crawl', 'forward']
2 ['i', 'wake', 'up', 'and', 'see', 'a', 'boot', 'tread', 'close', 'to', 'my', 'face']
3 ['i', 'slowly', 'withdraw', 'my']
4 ['from', 'its', 'holster', 'pull', 'the', 'hammer', 'back', 'then', 'aim', 'it', 'at', 'the', 'boot']
5 ['just', 'then', 'the', 'cloudobscured', 'moon', 'comes', 'out', 'and', 'i', 'realize', 'the', 'boot', 'is', 'american', 'and', 'that', 'it', 'is', 'jerry', 'biecks', 'foot']
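
A preview like the listing above can be generated with a quick loop over the first few cleaned sentences (a minimal sketch; the exact entries depend on the cleaning steps applied):

In [ ]:
for index, sentence in enumerate(corpus_clean[:6]):
    print(index, sentence)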

Word embedding inputs are typically structured as lists of words nested within lists of sentences because this format preserves contextual information while offering computational efficiency. It maintains sentence boundaries, facilitating the capture of semantic relationships and enabling easy processing of varying sentence lengths.

This structure also supports efficient implementation of sliding window techniques used in many embedding algorithms, retains document structure, and provides flexibility for both sentence-level and corpus-wide operations.

Next, we can start training the model.

Before formally training the model, we can use a bigram_transformer to create a tool for detecting bigram features. A bigram model is a statistical language model that considers the joint probability of two adjacent words in a text. This means the model takes the order of words into account, thereby capturing dependencies between them. By using bigrams, we can identify word combinations that frequently occur together, which helps the model better understand the structure and context of the language.

phrases.Phrases is a class used to identify and create bigrams (or, applied repeatedly, trigrams). When you apply it to a text corpus, it analyzes the corpus and identifies frequently co-occurring word pairs. This process helps us discover common phrases and fixed collocations, thereby improving the model's understanding of language usage patterns. It can significantly enhance the effectiveness of text analysis and natural language processing tasks, especially when dealing with specialized terminology or fixed expressions.

In [8]:
bigram_transformer = phrases.Phrases(corpus_clean) 
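
As a quick sanity check (a sketch, not executed here), we can pass a single cleaned sentence through the transformer; frequently co-occurring pairs are merged into single tokens joined by an underscore, like the young_man and young_woman tokens that appear in the similarity results further below:

In [ ]:
sample_sentence = corpus_clean[1]
print(sample_sentence)                      # original tokens
print(bigram_transformer[sample_sentence])  # same sentence with frequent adjacent pairs merged by "_"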
In [9]:
Model_100_10_1HS_samp1 = word2vec.Word2Vec(bigram_transformer[corpus_clean], workers=4, sg=1, hs=1, vector_size=100, window=10, sample=1e-3)

Explain Parameters:

workers=4: This parameter specifies the number of threads used during the training process. Set to 4 here, it means that the training process will use 4 threads in parallel to speed up the computation.

sg=1: This parameter is used to select the type of model for training. sg stands for "skip-gram," and setting it to 1 indicates the use of the skip-gram model. The skip-gram model is suitable for processing smaller corpora and can better capture the relationships of rare words.

hs=1: This parameter is used to activate the "hierarchical softmax" model. Set to 1, it means that hierarchical softmax is enabled. Hierarchical softmax is a technique used to accelerate the training process of word2vec, reducing the computational complexity of the model.

vector_size=100: This parameter specifies the dimension of the generated word vectors. Set to 100 here, it means that each word will be represented as a 100-dimensional vector.

window=10: This parameter specifies the maximum context window size used during training. Set to 10 here, it means that the context of each word includes up to 10 words before and after the target word.

sample=1e-3: This parameter is used to set the downsampling rate for training data. Set to 1e-3, it means that high-frequency words will be downsampled, appearing less frequently during the training process, reducing their impact on the model and allowing the model to focus more on low-frequency words with greater information content.
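
As a quick check (a sketch; it assumes the word "dream" made it into the model's vocabulary, which it should for this corpus), we can confirm the dimensionality of the trained vectors and the size of the learned vocabulary:

In [ ]:
print(Model_100_10_1HS_samp1.wv.vector_size)         # 100, matching vector_size
print(len(Model_100_10_1HS_samp1.wv['dream']))       # each word vector has 100 components
print(len(Model_100_10_1HS_samp1.wv.index_to_key))   # number of words (and bigrams) in the vocabulary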

The model is trained; we can now examine the relationships between some words within this model.

In [10]:
#Let's see if the model can correctly recognize semantics.
Model_100_10_1HS_samp1.wv.doesnt_match("man woman summer girl".split())
Out[10]:
'summer'
In [11]:
print(Model_100_10_1HS_samp1.wv.most_similar(positive=['girl', 'man'], negative=['boy'])) #boy:man as girl:_?___ WOMAN!
[('woman', 0.8032967448234558), ('person', 0.7134841680526733), ('lady', 0.6778110265731812), ('guy', 0.6606528759002686), ('young_man', 0.6053479313850403), ('stranger', 0.6048056483268738), ('lady_who', 0.6013890504837036), ('someone', 0.5913842916488647), ('young_woman', 0.5792549848556519), ('verbal', 0.5633144974708557)]

Through some simple examples, it can be seen that our model is capable of identifying semantically different words from "man woman summer girl" and can also calculate that "man - boy + girl ≈ woman".

In [12]:
# demonstrate the five words most similar to the word "terrifying" in the Word2Vec model, along with their similarity scores to "terrifying".
WORD = "terrifying"

similar_words = Model_100_10_1HS_samp1.wv.most_similar(WORD, topn=5)

print(f"similar to ", WORD)
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
similar to  terrifying
frightening: 0.733578622341156
scarey: 0.6859025359153748
nocturnal: 0.6689965128898621
nonetheless: 0.6657446026802063
very_realistic: 0.6455093622207642
In [13]:
WORD = "disturbing"
similar_words = Model_100_10_1HS_samp1.wv.most_similar(WORD, topn=5)
print(f"similar to ", WORD)
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
similar to  disturbing
nostalgic: 0.5985854268074036
embarrassing: 0.5805899500846863
ecstatic: 0.5769983530044556
deja_vu: 0.5763196349143982
creeped_out: 0.571252703666687
In [14]:
WORD = "sweet"
similar_words = Model_100_10_1HS_samp1.wv.most_similar(WORD, topn=5)
print(f"similar to ", WORD)
for word, similarity in similar_words:
    print(f"{word}: {similarity}")
similar to  sweet
tasty: 0.6262686252593994
frail: 0.6100146174430847
soft: 0.5955066084861755
honey: 0.5822334289550781
bashful: 0.5809429883956909
In [15]:
#Find words that are semantically similar to the concept represented by the word "terrifying" when contrasted with the concept represented by the word "sweet"
print(Model_100_10_1HS_samp1.wv.most_similar(positive=['terrifying'], negative=['sweet'], topn=10))
[('control', 0.4562285244464874), ('outcome', 0.43268337845802307), ('impending', 0.41403695940971375), ('explosion', 0.4125122129917145), ('several_hundred', 0.409423828125), ('fully_awake', 0.406212717294693), ('frightening', 0.39035752415657043), ('crab', 0.38695216178894043), ('source', 0.38543036580085754), ('aware', 0.38370537757873535)]

Exploring word relationships through word embeddings can lead to fascinating insights. These high-dimensional vector representations allow us to:

  1. Quantify semantic and syntactic properties of words in various dimensions
  2. Uncover latent patterns and associations in language
  3. Detect subtle nuances between related concepts
  4. Analyze how words shift meaning across different contexts
  5. Understand how complex ideas and emotions are encoded in language

Word embeddings enable us to perform arithmetic operations on words, revealing analogies and conceptual relationships. For instance, the example "man - boy + girl ≈ woman" demonstrates how these models capture gender relationships. Such capabilities not only enhance our understanding of linguistic structures but also power numerous natural language processing applications, from machine translation to sentiment analysis.

Furthermore, by examining the cosine similarity between word vectors, we can identify synonyms, antonyms, and words with similar usage patterns. This allows for a more nuanced exploration of language, going beyond simple dictionary definitions to understand the multifaceted nature of word meanings and their interconnections within the broader lexical landscape.
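
For example, we can query pairwise cosine similarities directly (a sketch, not executed here); we would expect terrifying and frightening to score much higher than terrifying and sweet:

In [ ]:
print(Model_100_10_1HS_samp1.wv.similarity('terrifying', 'frightening'))
print(Model_100_10_1HS_samp1.wv.similarity('terrifying', 'sweet'))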

Next, we can measure how closely a piece of text semantically aligns with a specific feature, such as 'terrifying,' by comparing the cosine similarity between the text and that feature. This approach allows us to estimate the degree of terror conveyed in a dream.

In [16]:
from gensim.utils import simple_preprocess
import numpy as np  # Import numpy

def measure_dreams(model, text, feature):
    # Preprocess the text
    words = simple_preprocess(text)
    
    # Get the vector for the feature you want to calculate
    feature_vector = model.wv[feature]  # Look up the embedding for the feature word
    
    # Calculate the average vector for the text
    text_vectors = [model.wv[word] for word in words if word in model.wv]
    if not text_vectors:
        return 0  # Return 0 if no words are in the vocabulary
    text_vector = np.mean(text_vectors, axis=0)
    
    # Calculate cosine similarity
    similarity = np.dot(feature_vector, text_vector) / (np.linalg.norm(feature_vector) * np.linalg.norm(text_vector))
    
    return similarity

# Example usage
text = " Nightmare in Cambodia. In the dream we are being overrun by sappers who have got past the Night Defensive Perimeter trips and claymores and now crawl forward. I wake up and see a boot tread close to my face. I slowly withdraw my .45 from its holster, pull the hammer back, then aim it at the boot. Just then the cloud-obscured moon comes out and I realize the boot is American and that it is Jerry Bieck's foot. In the pitch stillness I point the .45 straight up in the air. Pinching the hammer tightly I pull the trigger and settle the hammer back in place. I re-holster the pistol and go back to sleep. The next day, after a very difficult march, all the men are overjoyed to be out of Cambodia. I tell no one what almost happened."
feature_score = measure_dreams(Model_100_10_1HS_samp1, text, 'terrifying')  # The feature word is passed as a string, e.g. 'terrifying'
print(f"The score of the text is: {feature_score}")
The score of the text is: 0.42425134778022766
In [17]:
text = "I know my ex-boyfriend from college, Tracey was in the dream, although not sure I remember much about that. I was at the mall walking with someone and I ran into these two people. One of them was Teri this girl I went to college with. We were really good friends in college. I was like 'TERI!!' and we hugged, it was very nice to see her. I remember telling her that I had been to Virginia beach a few times already and would be coming back this year. She is from va beach There was much more to this dream, just cannot remember."
feature_score = measure_dreams(Model_100_10_1HS_samp1, text, 'terrifying')  # The feature word is passed as a string, e.g. 'terrifying'
print(f"The score of the text is: {feature_score}")
The score of the text is: 0.3600655198097229
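
The same function can be applied across the whole corpus, for example to attach a terror score to every dream in the dataframe (a sketch, not run here; the column name terror_score is illustrative):

In [ ]:
df['terror_score'] = [measure_dreams(Model_100_10_1HS_samp1, str(t), 'terrifying') for t in df['dreams_text']]
print(df['terror_score'].describe())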