Step 1: Data Preparation and Library Import¶

In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from scipy import stats

# Load word vector data
tokens_df = pd.read_csv(r'D:\Downloads\info_token.csv\info_token.csv', names=['token_id', 'word_vector'])

# Load answer data with structured column names
answer_columns = [
    'answer_id', 'question_id', 'anonymous', 'author_id', 'labeled_high_value',
    'recommended_by_editor', 'create_timestamp', 'contain_pictures', 'contain_videos',
    'thanks_count', 'likes_count', 'comments_count', 'collections_count',
    'dislikes_count', 'reports_count', 'helpless_count', 'token_ids', 'topic_ids'
]
answers_df = pd.read_csv(r'D:\Downloads\info_answer.csv\info_answer.csv', names=answer_columns)

We begin by importing essential data processing libraries and loading two primary data files:

  • info_token.csv: Contains token IDs and their corresponding word vectors, which are pre-trained on large-scale Zhihu content
  • info_answer.csv: Contains answer features and interaction metrics

Step 2: Word Vector Dictionary Construction¶

This step optimizes our data structure for efficient vector operations:

In [7]:
print("Step 2: Building token vectors dictionary...")
token_vectors = {}
for _, row in tokens_df.iterrows():
    token_id = int(row['token_id'])
    vector = np.array([float(x) for x in row['word_vector'].split()])
    token_vectors[token_id] = vector
Step 2: Building token vectors dictionary...

In this step, we transform the raw word vector data into an optimized dictionary structure. Specifically, we create a dictionary where token IDs serve as keys, mapped to their corresponding 64-dimensional vectors as values.

The original data represents each vector as a space-separated numeric string (e.g., "0.123 0.456 ..."), a format not suited to direct numerical computation. Pre-converting these strings into NumPy arrays makes subsequent vector operations (such as cosine distance calculations and mean computations) more efficient.
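
As a quick sanity check (not part of the original pipeline), we can inspect one entry of the resulting dictionary; the token ID shown is simply whichever one happens to come first:

# Inspect one entry of the dictionary built above (illustrative only)
example_id = next(iter(token_vectors))   # any token ID present in the dictionary
example_vec = token_vectors[example_id]  # constant-time lookup, returns a NumPy array
print(example_id, example_vec.shape)     # expected vector shape: (64,)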

Step 3: Answer Vector Computation¶

In this step, each answer is converted into a fixed-dimensional vector representation by averaging the vectors of its constituent tokens. Compressing the semantic content of the text into a single numerical vector bridges natural language and mathematical representation, enabling systematic, computational comparison of semantic content across answers.

In [8]:
print("Step 3: Computing answer vectors...")
answer_vectors = {}
for _, row in answers_df.iterrows():
    if pd.isna(row['token_ids']):
        continue
    
    try:
        tokens = [int(t) for t in str(row['token_ids']).split()]
        vectors = [token_vectors[t] for t in tokens if t in token_vectors]
        if vectors:
            answer_vectors[row['answer_id']] = np.mean(vectors, axis=0)
    except (ValueError, KeyError):
        continue
Step 3: Computing answer vectors...

Vector Averaging in Semantic Analysis¶

Vector averaging represents a fundamental mathematical operation in analyzing Zhihu answers. For an answer containing n words, where each word is represented as a 64-dimensional vector, the calculation follows the formula:

V_answer = (V₁ + V₂ + ... + Vₙ) / n

Let's illustrate this with a concrete example. Consider an answer containing the statement: "AI technology is developing rapidly"

Word Vector Representations:¶

  • V_AI = [0.2, 0.5, ..., 0.3] # 64-dimensional vector
  • V_technology = [0.3, 0.4, ..., 0.2] # 64-dimensional vector
  • V_developing = [0.1, 0.6, ..., 0.4] # 64-dimensional vector
  • V_rapidly = [0.4, 0.3, ..., 0.1] # 64-dimensional vector

Vector Averaging Computation:¶

V_answer = (V_AI + V_technology + V_developing + V_rapidly) / 4
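
A minimal NumPy sketch of this averaging, using hypothetical 4-dimensional toy vectors in place of the real 64-dimensional ones:

import numpy as np

# Hypothetical 4-dimensional stand-ins for the 64-dimensional word vectors
v_ai         = np.array([0.2, 0.5, 0.1, 0.3])
v_technology = np.array([0.3, 0.4, 0.2, 0.2])
v_developing = np.array([0.1, 0.6, 0.3, 0.4])
v_rapidly    = np.array([0.4, 0.3, 0.5, 0.1])

# Element-wise mean of the word vectors gives the answer vector
v_answer = np.mean([v_ai, v_technology, v_developing, v_rapidly], axis=0)
print(v_answer)  # element-wise average: [0.25, 0.45, 0.275, 0.25]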

Mathematical Properties and Semantic Implications¶

Geometric Interpretation in Vector Space¶

The vector averaging process carries significant geometric meaning:

  1. Point Representation

    • Each word vector represents a unique point in 64-dimensional space
    • These points capture specific semantic aspects of individual words
  2. Centroid Calculation

    • The final answer vector represents the geometric centroid of all word vectors
    • This centroid balances the semantic contributions of all words in the answer
  3. Semantic Position

    • The resulting center point represents the overall semantic position of the answer
    • This position enables meaningful comparisons between different answers in the semantic space

Step 4: Computing Answer Controversy Scores¶

Our approach to quantifying controversy rests on a key principle: the controversy level of an answer can be measured by its semantic divergence from the other answers to the same question. This divergence is computed with vector distance calculations.

Cosine distance measures the semantic difference between two answers via the angle between their vectors (see the short sketch after this list):

  • Small angles between vectors indicate similar viewpoints
  • Large angles suggest significant differences in opinion
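
A tiny illustration of this relationship, using hypothetical 2-dimensional vectors (SciPy's cosine distance ranges from 0 for identical directions to 2 for opposite directions):

from scipy.spatial.distance import cosine
import numpy as np

a = np.array([1.0, 0.0])
b = np.array([0.9, 0.1])    # nearly the same direction as a
c = np.array([-1.0, 0.2])   # nearly the opposite direction of a

print(cosine(a, b))  # ≈ 0.006, small angle -> similar viewpoints
print(cosine(a, c))  # ≈ 1.98, large angle -> divergent viewpoints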

The controversy score for each answer is computed as the mean cosine distance from all other answers within the same question. Higher scores indicate greater divergence from mainstream opinions, suggesting higher controversy.

Practical Example¶

Consider a question: "Will AI replace human jobs?"

Sample answers with their vector representations:

  1. Positive view: V1 = [0.2, 0.3, ..., 0.1]
  2. Negative view: V2 = [0.1, -0.2, ..., 0.3]
  3. Neutral view: V3 = [0.15, 0.1, ..., 0.2]

Calculating controversy score for Answer 1:

distances = [
    cosine(V1, V2),  # ≈ 0.8 (high divergence)
    cosine(V1, V3)   # ≈ 0.3 (moderate divergence)
]
controversy_score_1 = np.mean(distances)  # ≈ 0.55
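
The figures above are illustrative. A runnable version of the same calculation, with hypothetical 4-dimensional toy vectors standing in for the real 64-dimensional answer vectors:

from scipy.spatial.distance import cosine
import numpy as np

v1 = np.array([0.2, 0.3, 0.5, 0.1])    # positive view (toy values)
v2 = np.array([0.1, -0.2, 0.1, 0.3])   # negative view (toy values)
v3 = np.array([0.15, 0.1, 0.4, 0.2])   # neutral view (toy values)

distances = [cosine(v1, v2),   # ≈ 0.83, high divergence
             cosine(v1, v3)]   # ≈ 0.07, low divergence
controversy_score_1 = np.mean(distances)
print(controversy_score_1)     # ≈ 0.45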
In [9]:
print("Step 4: Computing answer controversy scores...")
answer_controversy_scores = {}
question_groups = {}

# Group answers by question
for aid, vector in answer_vectors.items():
    qid = answers_df[answers_df['answer_id'] == aid]['question_id'].iloc[0]
    if qid not in question_groups:
        question_groups[qid] = {}
    question_groups[qid][aid] = vector

# Calculate controversy scores
for qid, answers in question_groups.items():
    if len(answers) < 2:  # Skip questions with single answers
        continue
    
    for aid, vector in answers.items():
        differences = []
        for other_aid, other_vector in answers.items():
            if aid != other_aid:
                diff = cosine(vector, other_vector)
                differences.append(diff)
        
        if differences:
            controversy_score = np.mean(differences)
            answer_controversy_scores[aid] = controversy_score
Step 4: Computing answer controversy scores...

Step 5: Computing Question-Level Metrics¶

Beyond individual answer analysis, we need to evaluate controversy at the question level. This requires aggregating answer-level data and calculating comprehensive metrics for each question.

In [10]:
print("Step 5: Computing question-level metrics...")
question_metrics = {}

for qid in question_groups.keys():
    question_answers = answers_df[answers_df['question_id'] == qid]
    
    # Calculate total interactions
    total_interactions = question_answers.agg({
        'likes_count': 'sum',
        'dislikes_count': 'sum',
        'comments_count': 'sum',
        'collections_count': 'sum',
        'reports_count': 'sum',
        'helpless_count': 'sum',
        'thanks_count': 'sum'
    })
Step 5: Computing question-level metrics...

Engagement Metrics Categories:¶

1. Positive Engagement¶

  • Likes Count: Indicates level of approval and agreement
  • Thanks Count: Measures perceived usefulness and value
  • Collections Count: Reflects reference value and content quality

2. Negative Engagement¶

  • Dislikes Count: Measures level of disagreement or disapproval
  • Reports Count: Indicates potential content issues or violations
  • Helpless Flags Count: Reflects low practical utility or unhelpfulness

3. Neutral Engagement¶

  • Comments Count: Measures discussion intensity and engagement level

Each metric captures a distinct facet of how users interact with and evaluate content on the platform, giving complementary views of user response and content impact.
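
A small sketch (not part of the original pipeline) of how these categories could be aggregated directly from answers_df, using the column names defined in Step 1:

# Category-level engagement totals across all answers (illustrative sketch)
positive_cols = ['likes_count', 'thanks_count', 'collections_count']
negative_cols = ['dislikes_count', 'reports_count', 'helpless_count']
neutral_cols = ['comments_count']

category_totals = {
    'positive': answers_df[positive_cols].sum().sum(),
    'negative': answers_df[negative_cols].sum().sum(),
    'neutral': answers_df[neutral_cols].sum().sum(),
}
print(category_totals)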

In [ ]:
    # Calculating question-level controversy (this block continues the
    # for-loop over qid from the previous cell): collect the controversy
    # scores of all answers under the question, then take their mean as
    # the question's overall controversy level
    answer_scores = [answer_controversy_scores[aid]
                     for aid in question_answers['answer_id']
                     if aid in answer_controversy_scores]

    if answer_scores:
        question_metrics[qid] = {
            'avg_controversy': np.mean(answer_scores),
            'total_likes': total_interactions['likes_count'],
            'total_dislikes': total_interactions['dislikes_count'],
            'total_comments': total_interactions['comments_count'],
            'total_collections': total_interactions['collections_count'],
            'total_reports': total_interactions['reports_count'],
            'total_helpless': total_interactions['helpless_count'],
            'total_thanks': total_interactions['thanks_count'],
            'total_engagement': sum(total_interactions)
        }

Step 6: Correlation Analysis¶

In [11]:
print("\nStep 6: Analyzing correlations...")
metrics_df = pd.DataFrame.from_dict(question_metrics, orient='index')
# This transformation restructures the dictionary data into a tabular format where:
# Each row represents a question
# Each column represents a metric (controversy or interaction)
# We analyzed the relationship between controversy and eight different interaction indicators

# Calculate Pearson and Spearman correlation coefficients
engagement_columns = {
    'likes': 'total_likes',
    'dislikes': 'total_dislikes',
    'comments': 'total_comments',
    'collections': 'total_collections',
    'reports': 'total_reports',
    'helpless': 'total_helpless',
    'thanks': 'total_thanks',
    'total_engagement': 'total_engagement'
}
correlations = {'Pearson': {}, 'Spearman': {}}
for metric, column in engagement_columns.items():
    correlations['Pearson'][metric] = stats.pearsonr(
        metrics_df['avg_controversy'], metrics_df[column])[0]
    correlations['Spearman'][metric] = stats.spearmanr(
        metrics_df['avg_controversy'], metrics_df[column])[0]
# Output analysis results
print("\nCorrelation Analysis Results:")
print("\nPearson Correlations between Controversy and Engagement:")
for metric, corr in correlations['Pearson'].items():
    print(f"{metric}: {corr:.4f}")
print("\nSpearman Correlations between Controversy and Engagement:")
for metric, corr in correlations['Spearman'].items():
    print(f"{metric}: {corr:.4f}")
# Save results
metrics_df.to_csv('controversy_engagement_analysis.csv')
pd.DataFrame(correlations).to_csv('controversy_correlations.csv')
# Output basic statistics
print("\nBasic Statistics:")
print("\nControversy Score Statistics:")
print(metrics_df['avg_controversy'].describe())
print("\nEngagement Statistics:")
print(metrics_df['total_engagement'].describe())
Step 6: Analyzing correlations...

Correlation Analysis Results:

Pearson Correlations between Controversy and Engagement:
likes: 0.0234
dislikes: 0.0187
comments: 0.0188
collections: -0.0162
reports: 0.0055
helpless: 0.0144
thanks: -0.0037
total_engagement: 0.0104

Spearman Correlations between Controversy and Engagement:
likes: 0.1526
dislikes: 0.1665
comments: 0.1429
collections: 0.0938
reports: 0.0778
helpless: 0.1473
thanks: 0.1254
total_engagement: 0.1350

Basic Statistics:

Controversy Score Statistics:
count    54758.000000
mean         0.225330
std          0.162521
min          0.000000
25%          0.112797
50%          0.179288
75%          0.289855
max          1.464474
Name: avg_controversy, dtype: float64

Engagement Statistics:
count    5.475800e+04
mean     8.070194e+03
std      3.842108e+04
min      0.000000e+00
25%      1.700000e+01
50%      2.060000e+02
75%      2.149750e+03
max      1.524389e+06
Name: total_engagement, dtype: float64