Skip to content

Commit e1ca36b

Browse files
authored
Merge pull request #36 from mustberuss/master
Updating fieldsdf
2 parents bf3d452 + e4e13e4 commit e1ca36b

File tree

3 files changed

+525
-202
lines changed

3 files changed

+525
-202
lines changed

data-raw/fieldsdf.R

Lines changed: 140 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,71 +1,140 @@
1-
library(tidyverse)
2-
library(devtools)
3-
library(rapiclient)
4-
5-
load_all()
6-
7-
# Temp swagger API spec
8-
# TODO(any): Update with actual PatentsView version after its issues are sorted
9-
api <- get_api(url = "https://patentsview.historicip.com/swagger/openapi.json")

# Keep only the paths that do not contain a "{...}" URL parameter.
endpoint_paths <- Filter(function(path) !grepl("\\{", path), names(api$paths))

# Last path segment (ignoring an optional trailing slash) is the endpoint name.
endpoints <- gsub(
  pattern = ".*/(\\w+)(/$)?",
  replacement = "\\1",
  x = endpoint_paths
)

# For every path, read the $ref of the GET 200 application/json response schema
# and strip everything up to the last "/" plus the "SuccessResponse" suffix.
entities <- sapply(endpoint_paths, function(path) {
  ok_response <- api$paths[[path]]$get$responses$`200`
  schema_ref <- ok_response$content$`application/json`$schema$`$ref`
  gsub(".*/(\\w+)SuccessResponse", "\\1", schema_ref)
})
19-
20-
# Return the schema name referenced by an array property's items.
#
# list: one property definition taken from the spec.
# Returns the "$ref" of the "items" element with everything up to the last "/"
# removed, or NA when the property has no "items" element.
try_get_ref <- function(list) {
  has_items <- "items" %in% names(list)
  if (!has_items) {
    return(NA)
  }

  item_ref <- list[["items"]][["$ref"]]
  gsub(".*/", "", item_ref)
}
27-
28-
# Build one data frame describing the properties of the given schemas.
#
# schema_elements: names of entries in api$components$schemas.
# Returns the row-bound result with columns field, data_type, ref and
# schema_element: one row per property, where ref comes from try_get_ref().
extract_relevant_schema_info <- function(schema_elements) {
  describe_element <- function(element_name) {
    properties <- api$components$schemas[[element_name]]$properties

    property_rows <- lapply(properties, function(property) {
      data.frame(
        type = property$type,
        ref = try_get_ref(property)
      )
    })

    # The property names become row names when the rows are bound; move them
    # into a real column before renaming.
    described <- do.call(rbind, property_rows)
    described <- rownames_to_column(described)
    described <- setNames(described, c("field", "data_type", "ref"))
    mutate(described, schema_element = element_name)
  }

  do.call(rbind, lapply(schema_elements, describe_element))
}
44-
45-
nonnested_elements <- extract_relevant_schema_info(entities)

# Nested schemas are "YearlyPatents" plus every schema whose name ends in
# "Nested".
schema_element_names <- names(api$components$schemas)
nested_elements <- extract_relevant_schema_info(
  c("YearlyPatents", schema_element_names[grepl("Nested$", schema_element_names)])
)

# Map entity name -> pluralised endpoint name.
lookup <- setNames(sapply(endpoints, to_plural), entities)

fieldsdf <- local({
  # Attach the nested schema's fields to each property that references one.
  joined <- left_join(
    nonnested_elements,
    nested_elements,
    by = c("ref" = "schema_element")
  )

  # Rows without a ref are top-level fields grouped under their endpoint; rows
  # with a ref are nested fields named "<group>.<common_name>".
  derived <- mutate(
    joined,
    common_name = ifelse(is.na(ref), field.x, field.y),
    data_type = ifelse(is.na(ref), data_type.x, data_type.y),
    group = ifelse(is.na(ref), lookup[schema_element], field.x),
    endpoint = lookup[schema_element],
    field = ifelse(is.na(ref), common_name, paste0(group, ".", common_name))
  )

  # Any field whose common name ends in "_date" is typed as a date.
  dated <- mutate(
    derived,
    data_type = ifelse(grepl("_date$", common_name), "date", data_type)
  )

  select(dated, endpoint, field, data_type, group, common_name)
})

write.csv(fieldsdf, file = "data-raw/fieldsdf.csv", row.names = FALSE)

use_data(fieldsdf, overwrite = TRUE, internal = FALSE)
use_data(fieldsdf, overwrite = TRUE, internal = TRUE)
1+
library(tidyverse)
2+
library(devtools)
3+
library(rapiclient)
4+
5+
load_all()
6+
7+
# TODO(any): remove corrections when bugs are fixed
8+
9+
# Hard-coded data_type overrides, one row per (endpoint, field) pair. The table
# is parsed from the whitespace-separated text below and is later left-joined
# onto fieldsdf by endpoint and field, where a non-missing correction replaces
# the data_type derived from the OpenAPI spec.
# NOTE(review): these rows spell the types "int"/"bool", whereas
# data_type_intuit() emits "integer"/"boolean" -- confirm that the consumers of
# fieldsdf expect both spellings.
corrections <- read.table(
10+
text = "endpoint field data_type
11+
assignee assignee_type int
12+
patent assignees.assignee_type int
13+
patent/us_application_citation citation_document_number int
14+
publication assignees.assignee_type int
15+
publication rule_47_flag bool",
16+
header = TRUE,
17+
stringsAsFactors = FALSE
18+
)
19+
20+
# Download the OpenAPI spec that the rest of this script walks.
api <- get_api(url = "https://search.patentsview.org/static/openapi.json")

endpoint_paths <- names(api$paths)

# get rid of url parameter paths
endpoint_paths <- endpoint_paths[!grepl("\\{", endpoint_paths)]

# Strip the "/api/v1/" prefix and the trailing slash, keeping the parent
# portion of the nested patent/ and publication/ endpoints.
endpoints <- sub("/api/v1/((patent/|publication/)?\\w+)/$", "\\1", endpoint_paths)

# Name of the GET 200 application/json response schema for every path, as a
# character vector named by path. vapply() (rather than sapply()) guarantees
# exactly one string per path; a path without such a schema reference stops
# the script here instead of silently turning `entities` into a list and
# corrupting names(lookup) below.
entities <- vapply(
  endpoint_paths,
  function(y) {
    success_response <-
      api$paths[[y]]$get$responses$`200`$content$`application/json`$schema$`$ref`
    if (is.null(success_response)) {
      stop(
        "No 200 application/json schema $ref found for path: ", y,
        call. = FALSE
      )
    }
    gsub(".*/(\\w+SuccessResponse)", "\\1", success_response)
  },
  character(1)
)

# Map response schema name -> endpoint.
lookup <- endpoints
names(lookup) <- entities
38+
39+
# detect "type":"string", "format":"date" (which is normal)
40+
# Not sure if the other checks are standard but they're used in the patentsview object
41+
42+
# Decide which data type to record for one field definition from the spec.
#
# field_definition: a list with a "type" element and, optionally, "format"
#   and "example" elements.
# Returns a single string, chosen by the first rule that applies:
#   * "integer", "boolean" and "array" types are returned unchanged
#   * a "number" type is reported as "integer"
#   * a "date" format is reported as "date"
#   * a "string" type whose example is "double" is reported as "number"
#   * anything else falls back to the declared type
data_type_intuit <- function(field_definition) {
  declared <- field_definition$type
  present <- names(field_definition)

  # Absent optional elements default to "" so the comparisons below stay scalar.
  declared_format <- if ("format" %in% present) field_definition$format else ""
  declared_example <- if ("example" %in% present) field_definition$example else ""

  if (declared %in% c("integer", "boolean", "array")) {
    return(declared)
  }
  if (declared == "number") {
    return("integer")
  }
  if (declared_format == "date") {
    return("date")
  }
  if (declared == "string" && declared_example == "double") {
    return("number")
  }

  declared
}
60+
61+
62+
# recurse if type is array?
63+
64+
# Flatten the response schemas into one data frame of fields.
#
# schema_elements: names of response schemas in api$components$schemas; each
#   must also be a name in `lookup`.
# Returns a data frame with columns endpoint, field, data_type, group and
# common_name, sorted by endpoint and field. Array-typed fields are expanded
# into one row per nested property, named "<group>.<common_name>"; all other
# fields get an empty group.
extract_relevant_schema_info <- function(schema_elements) {
  per_schema <- lapply(schema_elements, function(schema_element) {
    # Properties of the items of the schema's first property.
    top_level <-
      api$components$schemas[[schema_element]]$properties[[1]]$items$properties

    rows <- lapply(names(top_level), function(field_name) {
      field_type <- data_type_intuit(top_level[[field_name]])

      if (field_type != "array") {
        return(data.frame(
          endpoint = lookup[[schema_element]],
          field = field_name,
          data_type = field_type,
          group = "",
          common_name = field_name
        ))
      }

      # only nested one deep - a nested property is not expanded any further
      nested <- top_level[[field_name]]$items$properties
      nested_rows <- lapply(names(nested), function(nested_name) {
        data.frame(
          endpoint = lookup[[schema_element]],
          field = paste0(field_name, ".", nested_name),
          data_type = data_type_intuit(nested[[nested_name]]),
          group = field_name,
          common_name = nested_name
        )
      })

      do.call(rbind, nested_rows)
    })

    do.call(rbind, rows)
  })

  combined <- do.call(rbind, per_schema)

  # sort so we can tell if the csv file changed
  arrange(combined, endpoint, field)
}
107+
108+
fieldsdf <- extract_relevant_schema_info(entities)

# TODO(any): remove hard coding corrections when possible

# Two sets of corrections are applied. The first is the hard coded type
# corrections that we reported as bugs: where `corrections` has a row for the
# (endpoint, field) pair, its data_type wins over the one read from the spec.
fieldsdf <- local({
  joined <- left_join(fieldsdf, corrections, by = c("endpoint", "field"))
  corrected <- mutate(joined, data_type = coalesce(data_type.y, data_type.x))
  trimmed <- select(corrected, -data_type.x, -data_type.y)
  relocate(trimmed, data_type, .after = field)
})

# The second set of corrections appends "_id" to the fields and common_names
# below. The API team may not consider this to be a bug: the OpenAPI object
# describes the API's return, not the requests we make (requests with the _id
# are returned without them).
# "patent","assignees.assignee","string","assignees","assignee"
# "patent","inventors.inventor","string","inventors","inventor"
# "publication","assignees.assignee","string","assignees","assignee"
# "publication","inventors.inventor","string","inventors","inventor"

add_id_to <- c("assignees.assignee", "inventors.inventor")

# The rows to change are picked from the original `field` values before either
# column is rewritten, so both columns are updated for the same rows.
fieldsdf <- local({
  needs_id <- fieldsdf$field %in% add_id_to
  mutate(
    fieldsdf,
    common_name = if_else(needs_id, paste0(common_name, "_id"), common_name),
    field = if_else(needs_id, paste0(field, "_id"), field)
  )
})

write.csv(fieldsdf, file = "data-raw/fieldsdf.csv", row.names = FALSE)

use_data(fieldsdf, overwrite = TRUE, internal = FALSE)
use_data(fieldsdf, overwrite = TRUE, internal = TRUE)

0 commit comments

Comments
 (0)