Extract Schema.org Data Script (Python)

Maybe this is helpful for somebody...

Description

This script fetches a webpage, extracts its Schema.org (JSON-LD) data, counts the schema types and fields it finds, and saves everything to a file.

Usage
  1. Run the script: Execute the script in a Python environment.
  2. Input URL: Enter the URL of the webpage (without the 'https://' prefix) when prompted.
  3. Output: The extracted data is saved to schema_data.txt. A sample session is shown after this list.
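A typical session looks like this (the domain entered here is just a placeholder):

      $ python extract_schema_data.py
      Please enter the URL without 'https://': example.com
      Schema.org data successfully saved to schema_data.txt
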
Features
  • Extracts JSON-LD data from webpages.
  • Identifies and counts schema types and fields.
  • Saves formatted data along with metadata to a file.
Requirements
  • Python libraries: requests, beautifulsoup4.
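If the libraries are not installed yet, they can be added with pip:

      pip install requests beautifulsoup4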

      # extract_schema_data.py
      # Author: Christopher Hneke
      # Date: 07.07.2024
      # Description: This script extracts Schema.org data from a given URL and saves it to a file.
    
      import requests
      from bs4 import BeautifulSoup
      import json
      import os
      from collections import defaultdict
    
      # Function to extract Schema.org data from a given URL
      def extract_schema_data(url):
          # A timeout keeps the script from hanging on unresponsive servers
          response = requests.get(url, timeout=10)
          soup = BeautifulSoup(response.content, 'html.parser')
    
          schema_data = []
          schema_types = set()
          field_count = defaultdict(int)
    
          # Recursive helper function to extract types and field frequencies from JSON data
          def extract_types_and_fields(data):
              if isinstance(data, dict):
                  if '@type' in data:
                      if isinstance(data['@type'], list):
                          schema_types.update(data['@type'])
                      else:
                          schema_types.add(data['@type'])
                  for key, value in data.items():
                      field_count[key] += 1
                      extract_types_and_fields(value)
              elif isinstance(data, list):
                  for item in data:
                      extract_types_and_fields(item)
    
          # Look for all <script> tags with type="application/ld+json"
          for script in soup.find_all('script', type='application/ld+json'):
              # Skip empty tags so json.loads() is never handed None
              if not script.string:
                  continue
              try:
                  json_data = json.loads(script.string)
                  schema_data.append(json_data)
                  extract_types_and_fields(json_data)
              except json.JSONDecodeError as e:
                  print(f"Error decoding JSON: {e}")
    
          return schema_data, schema_types, field_count
    
      # Function to format Schema.org data for readable output
      def format_schema_data(schema_data):
          formatted_data = ""
          for data in schema_data:
              formatted_data += json.dumps(data, indent=4) + "\n\n"
          return formatted_data
    
      # Function to get the meta title of the page
      def get_meta_title(url):
          # A timeout keeps the script from hanging on unresponsive servers
          response = requests.get(url, timeout=10)
          soup = BeautifulSoup(response.content, 'html.parser')
          title_tag = soup.find('title')
          # Guard against a missing or empty <title> tag
          return title_tag.string.strip() if title_tag and title_tag.string else 'No title found'
    
      # Function to save extracted data to a file
      def save_to_file(url, title, schema_types, formatted_data, field_count, filename='schema_data.txt'):
          try:
              with open(filename, 'w', encoding='utf-8') as file:
                  file.write(f"URL: {url}\n")
                  file.write(f"TITLE: {title}\n")
                  file.write(f"SCHEMA TYPES: {', '.join(schema_types)}\n\n")
                  file.write("Field Frequencies:\n")
                  for field, count in field_count.items():
                      file.write(f"{field}: {count}\n")
                  file.write("\nSchema Data:\n")
                  file.write(formatted_data)
              print(f"Schema.org data successfully saved to {filename}")
          except Exception as e:
              print(f"Error saving to file: {e}")
    
      # Main function to orchestrate the extraction and saving process
      def main():
          url_input = input("Please enter the URL without 'https://': ").strip()
          # Tolerate a pasted scheme so the prefix is not duplicated (Python 3.9+)
          url = f"https://{url_input.removeprefix('https://').removeprefix('http://')}"
    
          schema_data, schema_types, field_count = extract_schema_data(url)
          if not schema_data:
              print("No Schema.org data found.")
              return
    
          meta_title = get_meta_title(url)
          formatted_data = format_schema_data(schema_data)
          save_to_file(url, meta_title, schema_types, formatted_data, field_count)
    
      if __name__ == "__main__":
          main()
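
For orientation, the head of a generated schema_data.txt might look like this (URL, title, and counts are hypothetical, imagining a page with a single Article object whose author is a Person):

      URL: https://example.com
      TITLE: Example Article Title
      SCHEMA TYPES: Article, Person

      Field Frequencies:
      @context: 1
      @type: 2
      headline: 1
      author: 1
      name: 1

      Schema Data:
      {
          "@context": "https://schema.org",
          "@type": "Article",
          ...
      }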