|
| 1 | +require "faraday" |
| 2 | +require "json" |
| 3 | +require "yaml" |
| 4 | +require "open-uri" |
| 5 | + |
| 6 | +module GitHub |
# Entry point for copying GitHub security advisories into this repository
# as rubysec advisory yaml files.
class GitHubAdvisorySync
  # Matches a complete CVE identifier and captures its year,
  # e.g. "CVE-2019-1234" captures "2019".
  CVE_ID_PATTERN = /\ACVE-(\d+)-\d+\z/.freeze

  # Sync makes sure there are rubysec advisories for all GitHub advisories.
  # It writes a set of yaml files, one for each GitHub Advisory that
  # is not already present in this repo.
  #
  # The min_year argument specifies the earliest CVE year to sync.
  # There are many old CVEs in the GitHub advisory dataset that are not in here.
  # It is more important to sync the newer ones, so this lets the caller
  # control how far back the sync should reach.
  #
  # Advisories without a CVE identifier (or with a malformed one) are treated
  # as year 0, so they are skipped for any positive min_year instead of
  # aborting the whole sync.
  #
  # Returns the list of file names that were written.
  def self.sync(min_year: 2018)
    gh_advisories = GraphQLAPIClient.new.retrieve_all_rubygem_publishable_advisories

    # Filter out advisories with a CVE year that is before min_year.
    # Without this the script would write many files for 2013, 2014 and other
    # early years. Older CVEs are less interesting, so the caller decides how
    # old they want to go; the script is really designed to keep data synced
    # going forward.
    recent_advisories = gh_advisories.select do |advisory|
      cve_year(advisory.cve_id) >= min_year
    end

    files_written = recent_advisories.flat_map(&:write_files)

    puts "\nSync completed"
    if files_written.empty?
      puts "Nothing to sync today! All CVEs after #{min_year} are already present"
    else
      puts "Wrote these files:\n#{files_written.to_yaml}"
    end

    files_written
  end

  # Extracts the year from a CVE identifier as an Integer.
  # Returns 0 when the identifier is nil or does not look like a CVE id.
  def self.cve_year(cve_id)
    cve_id.to_s[CVE_ID_PATTERN, 1].to_i
  end
  private_class_method :cve_year
end
| 45 | + |
# Client for the GitHub GraphQL API. It pages through security advisories
# and wraps each one in a GitHubAdvisory.
class GraphQLAPIClient
  GITHUB_API_URL = "https://api.github.com/graphql"

  # Raised when the GH_API_TOKEN environment variable is missing or blank.
  GitHubApiTokenMissingError = Class.new(StandardError)

  # An error class which gets raised when a GraphQL request fails
  # (non-200 response, or a response body carrying an "errors" entry).
  GitHubGraphQLAPIError = Class.new(StandardError)

  # Named GraphQL documents; github_graphql_query looks them up by constant name.
  module GraphQLQueries
    GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY = <<-GRAPHQL.freeze
      query($first: Int, $after: String) {
        securityAdvisories(first: $first, after: $after) {
          pageInfo {
            endCursor
            hasNextPage
            hasPreviousPage
            startCursor
          }
          nodes {
            identifiers {
              type
              value
            }
            summary
            description
            severity
            references {
              url
            }
            publishedAt
            withdrawnAt
            vulnerabilities(ecosystem:RUBYGEMS, first: 10) {
              nodes {
                package {
                  name
                  ecosystem
                }
                vulnerableVersionRange
                firstPatchedVersion {
                  identifier
                }
              }
            }
          }
        }
      }
    GRAPHQL
  end

  # Returns a lazily initialized connection to the GitHub API.
  #
  # The connection is memoized, so the adapter argument only has an effect
  # on the first call; later calls return the already-built connection.
  # Raises GitHubApiTokenMissingError (via github_api_token) when the
  # connection is first built without a usable token.
  def github_api(adapter = :net_http)
    @faraday_connection ||= begin
      puts "Initializing GitHub API connection to URL: #{GITHUB_API_URL}"
      Faraday.new do |conn_builder|
        conn_builder.adapter adapter
        conn_builder.headers = {
          "User-Agent" => "rubysec/ruby-advisory-db rubysec sync script",
          "Content-Type" => "application/json",
          "Authorization" => "token #{github_api_token}"
        }
      end
    end
  end

  # All interactions with the API go through this method to standardize
  # error checking and how queries and requests are formed.
  #
  # graphql_query_name is the name of a constant in GraphQLQueries;
  # graphql_variables is the hash of variables sent along with the query.
  #
  # Returns the parsed JSON response body.
  # Raises GitHubGraphQLAPIError when the response status is not 200 or the
  # body contains an "errors" entry.
  def github_graphql_query(graphql_query_name, graphql_variables = {})
    graphql_query_str = GraphQLQueries.const_get graphql_query_name
    graphql_body = JSON.generate query: graphql_query_str,
                                 variables: graphql_variables
    puts "Executing GraphQL request: #{graphql_query_name}. Request variables:\n#{graphql_variables.to_yaml}\n"
    faraday_response = github_api.post do |req|
      req.url GITHUB_API_URL
      req.body = graphql_body
    end
    puts "Got response code: #{faraday_response.status}"
    if faraday_response.status != 200
      raise(GitHubGraphQLAPIError, "GitHub GraphQL request to #{faraday_response.env.url} failed: #{faraday_response.body}")
    end
    body_obj = JSON.parse faraday_response.body
    if body_obj["errors"]
      raise(GitHubGraphQLAPIError, body_obj["errors"].map { |e| e["message"] }.join(", "))
    end
    body_obj
  end

  # Fetches up to max_pages pages of page_size advisories each and returns
  # them as an array of GitHubAdvisory objects.
  #
  # Paging stops early once the API reports there is no next page. If the
  # page limit is reached while more pages remain, a warning is printed to
  # stderr so a truncated result does not go unnoticed.
  def retrieve_all_github_advisories(max_pages = 10, page_size = 100)
    all_advisories = []
    variables = { "first" => page_size }
    more_pages_remain = false

    max_pages.times do |page_num|
      puts "Getting page #{page_num + 1} of GitHub Advisories"
      page = github_graphql_query(:GITHUB_ADVISORIES_WITH_RUBYGEM_VULNERABILITY, variables)
      advisories_connection = page["data"]["securityAdvisories"]
      all_advisories.concat(advisories_connection["nodes"])

      page_info = advisories_connection["pageInfo"]
      more_pages_remain = page_info["hasNextPage"] == true
      break unless more_pages_remain

      # Continue from where this page ended on the next request.
      variables["after"] = page_info["endCursor"]
    end

    if more_pages_remain
      warn "WARNING: stopped after max_pages (#{max_pages}) pages, but GitHub reports more advisories. Results are incomplete."
    end
    puts "Retrieved #{all_advisories.length} Advisories from GitHub API"

    all_advisories.map do |advisory_graphql_obj|
      GitHubAdvisory.new github_advisory_graphql_object: advisory_graphql_obj
    end
  end

  # Returns the advisories that can be published to rubysec:
  # withdrawn advisories are removed, and so are those with no
  # vulnerabilities for ruby.
  def retrieve_all_rubygem_publishable_advisories
    retrieve_all_github_advisories
      .reject { |advisory| advisory.withdrawn? }
      .select { |advisory| advisory.has_ruby_vulnerabilities? }
  end

  private

  # Reads the API token from the GH_API_TOKEN environment variable.
  # Surrounding whitespace is stripped. Raises GitHubApiTokenMissingError
  # when the variable is unset or blank, since a blank token can only
  # produce an authorization failure later on.
  def github_api_token
    token = ENV["GH_API_TOKEN"].to_s.strip
    if token.empty?
      raise GitHubApiTokenMissingError, "Unable to make API requests. Must define 'GH_API_TOKEN' environment variable."
    end
    token
  end
end
| 168 | + |
# Wraps one security advisory object as returned by the GitHub GraphQL API
# and knows how to write it out as draft rubysec advisory yaml files.
class GitHubAdvisory
  attr_reader :github_advisory_graphql_object

  # github_advisory_graphql_object is the parsed hash for a single advisory node.
  def initialize(github_advisory_graphql_object:)
    @github_advisory_graphql_object = github_advisory_graphql_object
  end

  # Extracts the CVE identifier (e.g. "CVE-2019-1234") from the GitHub
  # Advisory identifier list. Returns nil when the advisory has no CVE.
  def cve_id
    identifier_list = github_advisory_graphql_object["identifiers"]
    cve_id_obj = identifier_list.find { |id| id["type"] == "CVE" }
    cve_id_obj && cve_id_obj["value"]
  end

  # Returns the publication day as a Date (it serializes to yaml like
  # 2019-03-21), or nil when the advisory has no publishedAt value.
  def published_day
    published_at = github_advisory_graphql_object["publishedAt"]
    return nil unless published_at

    Date.parse(published_at)
  end

  # Unique names of the packages affected by this advisory.
  def package_names
    vulnerabilities.map { |v| v["package"]["name"] }.uniq
  end

  # The rubysec advisory file names this advisory maps to, one per package.
  def rubysec_filenames
    package_names.map { |package_name| rubysec_filename_for(package_name) }
  end

  # True when GitHub has withdrawn this advisory.
  def withdrawn?
    !github_advisory_graphql_object["withdrawnAt"].nil?
  end

  # URL of the first reference attached to the advisory.
  # Returns nil when the advisory carries no references.
  def external_reference
    first_reference = Array(github_advisory_graphql_object["references"]).first
    first_reference && first_reference["url"]
  end

  # The vulnerability nodes listed on this advisory.
  def vulnerabilities
    github_advisory_graphql_object["vulnerabilities"]["nodes"]
  end

  # True when at least one vulnerability is in the RUBYGEMS ecosystem.
  def has_ruby_vulnerabilities?
    vulnerabilities.any? do |vuln|
      vuln["package"]["ecosystem"] == "RUBYGEMS"
    end
  end

  # True when at least one of the expected rubysec files is missing on disk.
  def some_rubysec_files_do_not_exist?
    rubysec_filenames.any? { |filename| !File.exist?(filename) }
  end

  # Writes a draft rubysec advisory file for every affected package that
  # does not have one yet. File names are relative to the current directory.
  #
  # Returns the list of file names written; this is empty when the advisory
  # has no CVE identifier or when all of its files already exist.
  def write_files
    return [] unless cve_id
    return [] unless some_rubysec_files_do_not_exist?

    files_written = []
    vulnerabilities.each do |vulnerability|
      filename_to_write = rubysec_filename_for(vulnerability["package"]["name"])
      # Also skips repeated nodes for a package whose file was just written.
      next if File.exist?(filename_to_write)

      dir_to_write = File.dirname(filename_to_write)
      Dir.mkdir dir_to_write unless Dir.exist?(dir_to_write)
      File.open(filename_to_write, "w") do |file|
        # create an automatically generated advisory yaml file
        file.write advisory_data_for(vulnerability).to_yaml

        # The data we just wrote is incomplete,
        # and therefore should not be committed as is.
        # We can not directly translate from GitHub to rubysec advisory format.
        #
        # The patched_versions field is not exactly available.
        # - GitHub has a first_patched_version field,
        #   but rubysec advisory needs a ruby version spec
        #
        # The unaffected_versions field is similarly not directly available.
        # This optional field must be inferred from the vulnerableVersionRange.
        #
        # To help write those fields, we put all the github data below.
        #
        # The second block of yaml in a .yaml file is ignored (after the second "---" line).
        # This effectively makes this data a large comment.
        # Still it should be removed before the data goes into rubysec.
        file.write "\n\n# GitHub advisory data below - **Remove this data before committing**\n"
        file.write "# Use this data to write patched_versions (and potentially unaffected_versions) above\n"
        file.write github_advisory_graphql_object.to_yaml
      end
      puts "Wrote: #{filename_to_write}"
      files_written << filename_to_write
    end

    files_written
  end

  private

  # Path of the rubysec advisory file for this CVE and the given package.
  def rubysec_filename_for(package_name)
    File.join("gems", package_name, "#{cve_id}.yml")
  end

  # Builds the draft advisory hash for one vulnerability node.
  # Fields that can not be derived from GitHub data get placeholder text.
  def advisory_data_for(vulnerability)
    {
      "gem" => vulnerability["package"]["name"],
      # the "cve" field holds the identifier without its "CVE-" prefix
      "cve" => cve_id.sub(/\ACVE-/, ""),
      "date" => published_day,
      "url" => external_reference,
      "title" => github_advisory_graphql_object["summary"],
      "description" => github_advisory_graphql_object["description"],
      "cvss_v3" => "<FILL IN IF AVAILABLE>",
      "patched_versions" => ["<FILL IN SEE BELOW>"],
      "unaffected_versions" => ["<OPTIONAL: FILL IN SEE BELOW>"]
    }
  end
end
| 281 | +end |
0 commit comments