-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_narration.py
180 lines (122 loc) · 7.96 KB
/
split_narration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Filename: split_narration.py
# Authors: #cf
# 2016-05-21
# TODO: We need to know how many lines (not: edits) are first / third person narration!
import re
import os
import glob
import numpy as np
import pandas as pd
from scipy import stats
# Input locations: the diff table and both narration texts live in one
# working directory.
WorkDir = "/media/christof/data/Dropbox/0-Analysen/2016/martians/narration/"
DiffTable, TextFirst, TextThird = (
    os.path.join(WorkDir, FileName)
    for FileName in (
        "DiffTable_narration.csv",  # tab-separated table of edits
        "DiffedFirst.txt",          # text in first-person narration
        "DiffedThird.txt",          # text in third-person narration
    )
)
def _print_first_third(label, first, third):
    """Print one report line that contrasts a first-person and a third-person value."""
    print(label + ": First", first, "; Third", third)


def split_narration(DiffTable, TextFirst, TextThird):
    """
    Compare edits made in first-person vs. third-person narration and print a report.

    Parameters:
        DiffTable: path to a tab-separated table of edits. Its first column is
            used as the row index; the columns "narration" (with the values
            "first" and "third"), "category" (with the values "copyedit" and
            "other"), "levenshtein" and "char-delta-abs" are read.
        TextFirst: path to the text file holding the first-person text.
        TextThird: path to the text file holding the third-person text.

    Returns:
        None. All results are written to standard output.

    Raises:
        KeyError: if one of the expected narration/category groups or one of
            the expected columns is missing from the table.
        OSError: if one of the three files cannot be opened.
    """
    print("get_lines...")

    # Read both text parts; keep the path parameters untouched.
    with open(TextFirst, "r") as FirstFile:
        first_text = FirstFile.read()
    with open(TextThird, "r") as ThirdFile:
        third_text = ThirdFile.read()

    print("\n== Basic data ==")
    first_line_count = len(re.split("\n", first_text))
    third_line_count = len(re.split("\n", third_text))
    _print_first_third("Length of text in lines", first_line_count, third_line_count)

    # NOTE(review): splitting on every single non-word character also yields
    # empty strings between adjacent separators, so these counts are an upper
    # bound on the number of actual word tokens.
    first_token_count = len(re.split(r"\W", first_text))
    third_token_count = len(re.split(r"\W", third_text))
    _print_first_third("Length of text in tokens", first_token_count, third_token_count)

    _print_first_third(
        "Average length of lines (in tokens)",
        first_token_count / first_line_count,
        third_token_count / third_line_count,
    )

    # Read the table of edits. pd.DataFrame.from_csv no longer exists in
    # current pandas; read_csv with index_col=0 keeps the first column as index.
    with open(DiffTable, "r") as InFile:
        diffs = pd.read_csv(InFile, sep="\t", index_col=0)

    by_narration = diffs.groupby("narration")
    edits_first = by_narration.get_group("first")
    edits_third = by_narration.get_group("third")

    # Sum only the two numeric columns that are reported, so that text columns
    # (e.g. "category") never take part in the aggregation.
    sums = by_narration[["levenshtein", "char-delta-abs"]].sum()

    # How many edits were there?
    first_edit_count = len(edits_first)
    third_edit_count = len(edits_third)
    _print_first_third("Number of edits (absolute sum)", first_edit_count, third_edit_count)

    # Cumulated Levenshtein distance and absolute character difference.
    lev_first = sums.loc["first", "levenshtein"]
    lev_third = sums.loc["third", "levenshtein"]
    _print_first_third("Levenshtein distances (absolute sum)", lev_first, lev_third)
    delta_first = sums.loc["first", "char-delta-abs"]
    delta_third = sums.loc["third", "char-delta-abs"]
    _print_first_third("Absolute Char Delta (absolute sum)", delta_first, delta_third)

    print("\n== Relative counts first vs. third ==")
    # Number of edits per line and per token of text.
    _print_first_third(
        "Number of edits (relative to lines)",
        first_edit_count / first_line_count,
        third_edit_count / third_line_count,
    )
    _print_first_third(
        "Number of edits (relative to tokens)",
        first_edit_count / first_token_count,
        third_edit_count / third_token_count,
    )

    # Both measures, normalised by edits, lines and tokens in turn. Each
    # perspective is always divided by its OWN denominator.
    denominators = (
        ("edits", first_edit_count, third_edit_count),
        ("lines", first_line_count, third_line_count),
        ("tokens", first_token_count, third_token_count),
    )
    for unit, first_denominator, third_denominator in denominators:
        _print_first_third(
            "Levenshtein distance (relative to " + unit + ")",
            lev_first / first_denominator,
            lev_third / third_denominator,
        )
        _print_first_third(
            "Absolute Char Delta (relative to " + unit + ")",
            delta_first / first_denominator,
            delta_third / third_denominator,
        )

    print("\n== Data on doubly grouped data: first/third and copyedit/significant edits ==")
    by_narration_and_category = diffs.groupby(["narration", "category"])
    first_copy = by_narration_and_category.get_group(("first", "copyedit"))
    third_copy = by_narration_and_category.get_group(("third", "copyedit"))
    # Edits of category "other" are reported as "significant" edits.
    first_sign = by_narration_and_category.get_group(("first", "other"))
    third_sign = by_narration_and_category.get_group(("third", "other"))
    print("Number of edits: First copyedits", len(first_copy), "; First significant", len(first_sign))
    print("Number of edits: Third copyedits", len(third_copy), "; Third significant", len(third_sign))
    _print_first_third(
        "Proportion of significant edits",
        len(first_sign) / first_edit_count,
        len(third_sign) / third_edit_count,
    )
    _print_first_third(
        "Proportion of copyedits",
        len(first_copy) / first_edit_count,
        len(third_copy) / third_edit_count,
    )

    print("\n== Significance tests ==")
    # See http://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.stats.ttest_ind.html
    # NOTE(review): only the Levenshtein test is restricted to the last 2200
    # edits of each group; the character-delta test below uses all edits. The
    # reason for the cut-off is not visible here -- confirm it is intended.
    lev_first_sample = edits_first.loc[:, "levenshtein"].iloc[-2200:]
    lev_third_sample = edits_third.loc[:, "levenshtein"].iloc[-2200:]
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    ttest = stats.ttest_ind(lev_first_sample, lev_third_sample, axis=0, equal_var=False)
    print("Welch's t-test for the Levenshtein distances first vs. third: statistics", ttest[0], "p-value", ttest[1])

    delta_first_sample = edits_first.loc[:, "char-delta-abs"]
    delta_third_sample = edits_third.loc[:, "char-delta-abs"]
    ttest = stats.ttest_ind(delta_first_sample, delta_third_sample, axis=0, equal_var=False)
    print("Welch's t-test for the absolute character differences first vs. third: statistics", ttest[0], "p-value", ttest[1])

    print("\nDone.")
# Script entry: run the comparison on the module-level input paths.
split_narration(
    DiffTable=DiffTable,
    TextFirst=TextFirst,
    TextThird=TextThird,
)