-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_detail_table.R
184 lines (172 loc) · 12.9 KB
/
build_detail_table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
library(data.table)
library(httr)
library(XML)
# All relative paths used by this script ('filings.rds', 'detail_table.rds')
# resolve against this directory.
setwd('~/R/EDGAR')
# Load the filings index only when it is not already in memory, so an
# interactive session that has it loaded does not pay for re-reading the file.
if (!exists('filings')) {
  filings <- readRDS('filings.rds')
}
# Build the URL of a filing's primary_doc.xml from its index file name.
#
# original_file_name: character vector of paths relative to the EDGAR
#   Archives root that end in ".txt" (presumably of the form
#   "edgar/data/<cik>/<accession-number>.txt" - confirm against the index).
# Returns: a character vector of the same length in which every hyphen has
#   been removed, the trailing ".txt" has been replaced by
#   "/primary_doc.xml", and the Archives base URL has been prepended.
getPrimaryDocUrl <- function(original_file_name) {
  baseurl <- 'https://www.sec.gov/Archives/'
  # The hyphen is a literal, not a pattern.
  original_file_name <- gsub('-', '', original_file_name, fixed = TRUE)
  # Only the literal ".txt" extension at the end of the path is rewritten; an
  # unescaped '.txt' pattern would also match e.g. "atxt" inside a directory.
  primaryDocUrl <- sub('\\.txt$', '/primary_doc.xml', original_file_name)
  paste0(baseurl, primaryDocUrl)
}
# Replace a missing value (NULL or a zero-length vector) with an empty string
# so it can be used as a data.table cell.
#
# inString: any R object, typically the unlisted result of an XPath query.
# Returns: '' when inString is NULL or has length 0; otherwise inString
#   unchanged (including vectors longer than one element).
checkForZeroStrings <- function(inString) {
  # Scalar, short-circuiting `||` is the correct operator inside `if`.
  if (is.null(inString) || length(inString) == 0) {
    return('')
  }
  inString
}
# Describe how one output column is read from a parsed filing.
#   xpath:     XPath expression selecting the source node(s).
#   recursive: value forwarded to xmlValue() as its `recursive` argument.
#   collapse:  separator used to join all matches into one string; NA marks a
#              field for which a single match is expected.
.filingField <- function(xpath, recursive = FALSE, collapse = NA_character_) {
  list(xpath = xpath, recursive = recursive, collapse = collapse)
}

# Output columns of getFilingDetail(), in output order (originalFileName is
# prepended separately).
.filingDetailFields <- list(
  testOrLive = .filingField("//testOrLive"),
  issuerName = .filingField("//primaryIssuer/entityName", recursive = TRUE),
  cik = .filingField("//primaryIssuer/cik", recursive = TRUE),
  issuerState = .filingField(
    "//primaryIssuer/issuerAddress/stateOrCountry", recursive = TRUE
  ),
  issuerZip = .filingField(
    "//primaryIssuer/issuerAddress/zipCode", recursive = TRUE
  ),
  issuerEntityType = .filingField(
    "//primaryIssuer/entityType", recursive = TRUE
  ),
  issuerEntityTypeOtherDesc = .filingField(
    "//primaryIssuer/entityTypeOtherDesc", recursive = TRUE
  ),
  industryGroup = .filingField("//offeringData/industryGroup"),
  industryGroupType = .filingField(
    "//offeringData/industryGroup/industryGroupType", recursive = TRUE
  ),
  investmentFundType = .filingField(
    "//offeringData/industryGroup/investmentFundInfo/investmentFundType",
    recursive = TRUE
  ),
  revenueRange = .filingField("//offeringData/issuerSize/revenueRange"),
  navRange = .filingField(
    "//offeringData/issuerSize/aggregateNetAssetValueRange"
  ),
  federalExemptions = .filingField(
    "//offeringData/federalExemptionsExclusions/item",
    recursive = TRUE, collapse = ' '
  ),
  isAmendment = .filingField(
    "//offeringData/typeOfFiling/newOrAmendment/isAmendment"
  ),
  previousAccessionNumber = .filingField(
    "//offeringData/typeOfFiling/newOrAmendment/previousAccessionNumber"
  ),
  dateOfFirstSale = .filingField(
    "//offeringData/typeOfFiling/dateOfFirstSale/value"
  ),
  durationOfOfferingMoreThanOneYear = .filingField(
    "//offeringData/durationOfOffering/moreThanOneYear"
  ),
  isEquityType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isEquityType"
  ),
  isDebtType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isDebtType"
  ),
  isOptionToAcquireType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isOptionToAcquireType"
  ),
  isSecurityToBeAcquiredType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isSecurityToBeAcquiredType"
  ),
  isOtherType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isOtherType"
  ),
  descriptionOfOtherType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/descriptionOfOtherType"
  ),
  isPooledInvestmentFundType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isPooledInvestmentFundType"
  ),
  isTenantInCommonType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isTenantInCommonType"
  ),
  isMineralPropertyType = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isMineralPropertyType"
  ),
  isBusinessCombinationTransaction = .filingField(
    "//offeringData/businessCombinationTransaction/isBusinessCombinationTransaction"
  ),
  minimumInvestmentAccepted = .filingField(
    "//offeringData/minimumInvestmentAccepted"
  ),
  recipientName = .filingField(
    "//offeringData/salesCompensationList/recipient/recipientName",
    collapse = '|'
  ),
  recipientCRDNumber = .filingField(
    "//offeringData/salesCompensationList/recipient/recipientCRDNumber",
    collapse = '|'
  ),
  associatedBDName = .filingField(
    "//offeringData/salesCompensationList/recipient/associatedBDName",
    collapse = '|'
  ),
  associatedBDCRDNumber = .filingField(
    "//offeringData/salesCompensationList/recipient/associatedBDCRDNumber",
    collapse = '|'
  ),
  salesCommissionDollarAmount = .filingField(
    "//offeringData/salesCommissionsFindersFees/salesCommission/dollarAmount"
  ),
  findersFeesDollarAmount = .filingField(
    "//offeringData/salesCommissionsFindersFees/findersFees/dollarAmount"
  ),
  totalOfferingAmount = .filingField(
    "//offeringData/offeringSalesAmounts/totalOfferingAmount"
  ),
  totalAmountSold = .filingField(
    "//offeringData/offeringSalesAmounts/totalAmountSold"
  ),
  typesOfSecuritiesOfferedOtherDescription = .filingField(
    "//offeringData/typesOfSecuritiesOffered/isOtherType/descriptionOfOtherType"
  )
)

# Read one field (as described by .filingField) from the document node and
# return it as a single string; '' when nothing matches.
.extractFilingField <- function(field, root) {
  values <- unlist(
    xpathApply(root, field$xpath, xmlValue, recursive = field$recursive)
  )
  if (!is.na(field$collapse)) {
    # Multi-valued field: always join, even for zero or one match.
    values <- paste(values, collapse = field$collapse)
  } else if (length(values) > 1) {
    # A single-valued field that unexpectedly matched several nodes would
    # otherwise be recycled into several rows; join so that each filing
    # still contributes exactly one row.
    values <- paste(values, collapse = '|')
  }
  checkForZeroStrings(values)
}

# Download a filing's primary_doc.xml and flatten it into one table row.
#
# form_d_file: index file name of the filing, as accepted by
#   getPrimaryDocUrl().
# Returns: a one-row data.table whose first column, originalFileName, holds
#   the primary_doc.xml URL that was fetched, followed by one column per entry
#   of .filingDetailFields. Fields absent from the document are ''.
#
# NOTE(review): the HTTP status of the response is not checked here; a failed
# request presumably surfaces as an XML parse error - confirm and consider
# checking the status explicitly.
getFilingDetail <- function(form_d_file) {
  form_d_file <- getPrimaryDocUrl(form_d_file)
  # asText = TRUE: the argument is the document itself, never a file name.
  zz <- xmlParse(rawToChar(GET(form_d_file)$content), asText = TRUE)
  ns <- getNodeSet(zz, "/")
  fields <- lapply(.filingDetailFields, .extractFilingField, root = ns[[1]])
  as.data.table(c(list(originalFileName = form_d_file), fields))
}
# Start from an empty table that has the same columns as the rows returned by
# getFilingDetail(): every column is character and there are no rows yet.
detail_table <- local({
  column_names <- c(
    'originalFileName',
    'testOrLive',
    'issuerName',
    'cik',
    'issuerState',
    'issuerZip',
    'issuerEntityType',
    'issuerEntityTypeOtherDesc',
    'industryGroup',
    'industryGroupType',
    'investmentFundType',
    'revenueRange',
    'navRange',
    'federalExemptions',
    'isAmendment',
    'previousAccessionNumber',
    'dateOfFirstSale',
    'durationOfOfferingMoreThanOneYear',
    'isEquityType',
    'isDebtType',
    'isOptionToAcquireType',
    'isSecurityToBeAcquiredType',
    'isOtherType',
    'descriptionOfOtherType',
    'isPooledInvestmentFundType',
    'isTenantInCommonType',
    'isMineralPropertyType',
    'isBusinessCombinationTransaction',
    'minimumInvestmentAccepted',
    'recipientName',
    'recipientCRDNumber',
    'associatedBDName',
    'associatedBDCRDNumber',
    'salesCommissionDollarAmount',
    'findersFeesDollarAmount',
    'totalOfferingAmount',
    'totalAmountSold',
    'typesOfSecuritiesOfferedOtherDescription'
  )
  # One zero-length character vector per column, passed as named arguments.
  empty_columns <- rep(list(character()), length(column_names))
  do.call(data.table, setNames(empty_columns, column_names))
})
# Keep only the Form D and D/A filings.
reg_d_filings <- filings[filings$V1 %in% c('D', 'D/A'), ]
# remove the filings table from memory
rm(filings)
# To try the loop on a smaller dataset than the entire one, iterate over e.g.
#   seq_len(min(100, nrow(reg_d_filings)))
# instead. The loop below works from the end of detail_table: rows that are
# already present in detail_table are skipped. seq_len()/setdiff() yield an
# empty set when nothing is pending, whereas (n + 1):n would count backwards.
# rbind() inside the loop is deliberate: detail_table always holds the rows
# fetched so far, so an interrupted run can be resumed.
for (row in setdiff(seq_len(nrow(reg_d_filings)),
                    seq_len(nrow(detail_table)))) {
  print(reg_d_filings$V4[row]) # the date of the file being processed
  detail_table <- rbind(detail_table, getFilingDetail(reg_d_filings$V5[row]))
}
# save the detail_table to an RDS file
saveRDS(detail_table, 'detail_table.rds')
# to read it in later:
# detail_table <- readRDS('detail_table.rds')
# Now merge the two tables together to have everything in one place, and save
# the combined table. The join key is the primary_doc.xml URL: it is derived
# from V5 on the filings side and stored as originalFileName on the detail
# side.
reg_d_filings$V6 <- getPrimaryDocUrl(reg_d_filings$V5)
detail_table <- merge(
  x = reg_d_filings,
  y = detail_table,
  by.x = "V6",
  by.y = "originalFileName",
  sort = FALSE
)
# Human-readable headers, in the column order produced by the merge.
names(detail_table) <- c(
  "Original Document URL",
  "Document Type",
  "Issuer Name",
  "CIK",
  "Date",
  "Doc URL",
  "Test Or Live",
  "issuerName",
  "cik",
  "Issuer State",
  "Issuer Zip",
  "Issuer Entity Type",
  "Issuer Entity Type Other Desc",
  "Industry Group",
  "Industry Group Type",
  "Investment Fund Type",
  "Revenue Range",
  "NAV Range",
  "Federal Exemptions",
  "Is Amendment",
  "Amended Doc Accession Number",
  "Date Of First Sale",
  "Duration Of Offering More Than One Year",
  "Is Equity Type",
  "Is Debt Type",
  "Is Option To Acquire Type",
  "Is Security To Be Acquired Type",
  "Is Other Type",
  "Description Of Other Type",
  "Is Pooled Investment Fund Type",
  "Is Tenant In Common Type",
  "Is Mineral Property Type",
  "Is Business Combination Transaction",
  "Minimum Investment Accepted",
  "Recipient Name",
  "Recipient CRD Number",
  "Associated BD Name",
  "Associated BD CRD Number",
  "Sales Commission Dollar Amount",
  "Finders Fees Dollar Amount",
  "Total Offering Amount",
  "Total Amount Sold",
  "Types Of Securities Offered Other Description"
)
saveRDS(detail_table, 'detail_table.rds')