|
1 |
| -library(tidyverse) |
2 |
| -library(devtools) |
3 |
| -library(rapiclient) |
4 |
| - |
5 |
| -load_all() |
6 |
| - |
7 |
| -# Temp swagger API spec |
8 |
| -# TODO(any): Update with actual PatentsView version after its issues are sorted |
9 |
| -api <- get_api(url = "https://patentsview.historicip.com/swagger/openapi.json") |
10 |
| - |
11 |
| -endpoint_paths <- names(api$paths) |
12 |
| -endpoint_paths <- endpoint_paths[!grepl("\\{", endpoint_paths)] |
13 |
| -endpoints <- gsub(".*/(\\w+)(/$)?", "\\1", endpoint_paths) |
14 |
| -entities <- |
15 |
| - sapply(endpoint_paths, function(y) { |
16 |
| - success_response <- api$paths[y][1][[y]]$get$responses$`200`$content$`application/json`$schema$`$ref` |
17 |
| - gsub(".*/(\\w+)SuccessResponse", "\\1", success_response) |
18 |
| - }) |
19 |
| - |
20 |
| -try_get_ref <- function(list) { |
21 |
| - if ("items" %in% names(list)) { |
22 |
| - gsub(".*/", "", list[["items"]][["$ref"]]) |
23 |
| - } else { |
24 |
| - NA |
25 |
| - } |
26 |
| -} |
27 |
| - |
28 |
| -extract_relevant_schema_info <- function(schema_elements) { |
29 |
| - out_list <- lapply(schema_elements, function(schema_element) { |
30 |
| - lapply( |
31 |
| - api$components$schemas[[schema_element]]$properties, |
32 |
| - function(x) data.frame( |
33 |
| - type = x$type, |
34 |
| - ref = try_get_ref(x) |
35 |
| - ) |
36 |
| - ) %>% |
37 |
| - do.call(rbind, .) %>% |
38 |
| - rownames_to_column() %>% |
39 |
| - setNames(c("field", "data_type", "ref")) %>% |
40 |
| - mutate(schema_element = schema_element) |
41 |
| - }) |
42 |
| - do.call(rbind, out_list) |
43 |
| -} |
44 |
| - |
45 |
| -nonnested_elements <- extract_relevant_schema_info(entities) |
46 |
| - |
47 |
| -schema_element_names <- names(api$components$schemas) |
48 |
| -nested_elements <- schema_element_names[grepl("Nested$", schema_element_names)] |
49 |
| -nested_elements <- c("YearlyPatents", nested_elements) |
50 |
| -nested_elements <- extract_relevant_schema_info(nested_elements) |
51 |
| - |
52 |
| -lookup <- sapply(endpoints, to_plural) |
53 |
| -names(lookup) <- entities |
54 |
| - |
55 |
| -fieldsdf <- |
56 |
| - nonnested_elements %>% |
57 |
| - left_join(nested_elements, by = c("ref" = "schema_element")) %>% |
58 |
| - mutate( |
59 |
| - common_name = ifelse(is.na(ref), field.x, field.y), |
60 |
| - data_type = ifelse(is.na(ref), data_type.x, data_type.y), |
61 |
| - group = ifelse(is.na(ref), lookup[schema_element], field.x), |
62 |
| - endpoint = lookup[schema_element], |
63 |
| - field = ifelse(is.na(ref), common_name, paste0(group, ".", common_name)) |
64 |
| - ) %>% |
65 |
| - mutate(data_type = ifelse(grepl("_date$", common_name), "date", data_type)) %>% |
66 |
| - select(endpoint, field, data_type, group, common_name) |
67 |
| - |
68 |
| -write.csv(fieldsdf, "data-raw/fieldsdf.csv", row.names = FALSE) |
69 |
| - |
70 |
| -use_data(fieldsdf, internal = FALSE, overwrite = TRUE) |
71 |
| -use_data(fieldsdf, internal = TRUE, overwrite = TRUE) |
| 1 | +library(tidyverse) |
| 2 | +library(devtools) |
| 3 | +library(rapiclient) |
| 4 | + |
| 5 | +load_all() |
| 6 | + |
| 7 | +# TODO(any): remove corrections when bugs are fixed |
| 8 | + |
| 9 | +corrections <- read.table( |
| 10 | + text = "endpoint field data_type |
| 11 | + assignee assignee_type int |
| 12 | + patent assignees.assignee_type int |
| 13 | + patent/us_application_citation citation_document_number int |
| 14 | + publication assignees.assignee_type int |
| 15 | + publication rule_47_flag bool", |
| 16 | + header = TRUE, |
| 17 | + stringsAsFactors = FALSE |
| 18 | +) |
| 19 | + |
| 20 | +api <- get_api(url = "https://search.patentsview.org/static/openapi.json") |
| 21 | + |
| 22 | +endpoint_paths <- names(api$paths) |
| 23 | + |
| 24 | +# get rid of url parameter paths |
| 25 | +endpoint_paths <- endpoint_paths[!grepl("\\{", endpoint_paths)] |
| 26 | + |
| 27 | +# now we need to keep the parent portion of the nested patent/ and publication/ endpoints |
| 28 | +endpoints <- sub("/api/v1/((patent/|publication/)?\\w+)/$", "\\1", endpoint_paths) |
| 29 | + |
| 30 | +entities <- |
| 31 | + sapply(endpoint_paths, function(y) { |
| 32 | + success_response <- api$paths[y][1][[y]]$get$responses$`200`$content$`application/json`$schema$`$ref` |
| 33 | + gsub(".*/(\\w+SuccessResponse)", "\\1", success_response) |
| 34 | + }) |
| 35 | + |
| 36 | +lookup <- endpoints |
| 37 | +names(lookup) <- entities |
| 38 | + |
| 39 | +# detect "type":"string", "format":"date" (which is normal) |
| 40 | +# Not sure if the other checks are standard but they're used in the patentsview object |
| 41 | + |
| 42 | +data_type_intuit <- function(field_definition) { |
| 43 | + type <- field_definition$type |
| 44 | + format <- if ("format" %in% names(field_definition)) field_definition$format else "" |
| 45 | + example <- if ("example" %in% names(field_definition)) field_definition$example else "" |
| 46 | + as_is_types <- c("integer", "boolean", "array") |
| 47 | + |
| 48 | + if (type %in% as_is_types) { |
| 49 | + type |
| 50 | + } else if (type == "number") { |
| 51 | + "integer" |
| 52 | + } else if (format == "date") { |
| 53 | + "date" |
| 54 | + } else if (type == "string" && example == "double") { |
| 55 | + "number" |
| 56 | + } else { |
| 57 | + type |
| 58 | + } |
| 59 | +} |
| 60 | + |
| 61 | + |
| 62 | +# recurse if type is array? |
| 63 | + |
| 64 | +extract_relevant_schema_info <- function(schema_elements) { |
| 65 | + lapply(schema_elements, function(schema_element) { |
| 66 | + middle <- lapply( |
| 67 | + names(api$components$schemas[[schema_element]]$properties[[1]]$items$properties), |
| 68 | + function(x, y) { |
| 69 | + data_type <- data_type_intuit(y[[x]]) |
| 70 | + |
| 71 | + if (data_type == "array") { |
| 72 | + group <- x |
| 73 | + |
| 74 | + inner <- lapply( |
| 75 | + names(y[[x]]$items$properties), |
| 76 | + function(a, b) { |
| 77 | + # only nested one deep- wouldn't be an array here |
| 78 | + data.frame( |
| 79 | + endpoint = lookup[[schema_element]], |
| 80 | + field = paste0(group, ".", a), |
| 81 | + data_type = data_type_intuit(b[[a]]), |
| 82 | + group = group, |
| 83 | + common_name = a |
| 84 | + ) |
| 85 | + }, |
| 86 | + y[[x]]$items$properties |
| 87 | + ) |
| 88 | + |
| 89 | + do.call(rbind, inner) |
| 90 | + } else { |
| 91 | + data.frame( |
| 92 | + endpoint = lookup[[schema_element]], |
| 93 | + field = x, |
| 94 | + data_type = data_type, |
| 95 | + group = "", |
| 96 | + common_name = x |
| 97 | + ) |
| 98 | + } |
| 99 | + }, api$components$schemas[[schema_element]]$properties[[1]]$items$properties |
| 100 | + ) |
| 101 | + |
| 102 | + do.call(rbind, middle) |
| 103 | + }) %>% |
| 104 | + do.call(rbind, .) %>% |
| 105 | + arrange(endpoint, field) # sort so we can tell if the csv file changed |
| 106 | +} |
| 107 | + |
| 108 | +fieldsdf <- extract_relevant_schema_info(entities) |
| 109 | + |
| 110 | +# TODO(any): remove hard coding corrections when possible |
| 111 | + |
| 112 | +# We need to make two sets of corrections. First we apply the hard-coded data_type |
| 113 | +# corrections for fields whose types we reported as bugs |
| 114 | +fieldsdf <- fieldsdf %>% |
| 115 | + left_join(corrections, by = c("endpoint", "field")) %>% |
| 116 | + mutate(data_type = coalesce(data_type.y, data_type.x)) %>% |
| 117 | + select(-data_type.x, -data_type.y) %>% |
| 118 | + relocate(data_type, .after = field) |
| 119 | + |
| 120 | +# The second set of corrections is to append "_id" to fields and common_names below. |
| 121 | +# The API team may not consider this to be a bug. The OpenAPI object describes the |
| 122 | +# API's return, not the requests we make (fields requested with the _id suffix are returned without it) |
| 123 | +# "patent","assignees.assignee","string","assignees","assignee" |
| 124 | +# "patent","inventors.inventor","string","inventors","inventor" |
| 125 | +# "publication","assignees.assignee","string","assignees","assignee" |
| 126 | +# "publication","inventors.inventor","string","inventors","inventor" |
| 127 | + |
| 128 | +add_id_to <- c("assignees.assignee", "inventors.inventor") |
| 129 | + |
| 130 | +# change common_name first: mutate() evaluates its expressions in order, so if field were changed first the `field %in% add_id_to` test for common_name would no longer match |
| 131 | +fieldsdf <- fieldsdf %>% |
| 132 | + mutate( |
| 133 | + common_name = if_else(field %in% add_id_to, paste0(common_name, "_id"), common_name), |
| 134 | + field = if_else(field %in% add_id_to, paste0(field, "_id"), field) |
| 135 | + ) |
| 136 | + |
| 137 | +write.csv(fieldsdf, "data-raw/fieldsdf.csv", row.names = FALSE) |
| 138 | + |
| 139 | +use_data(fieldsdf, internal = FALSE, overwrite = TRUE) |
| 140 | +use_data(fieldsdf, internal = TRUE, overwrite = TRUE) |
0 commit comments