Build a LinkedIn Scraper Using Selenium and OpenAI's GPT-4o-mini

2024-08-12 | Fateh Ali Aamir

I wanted to build a LinkedIn scraper from scratch. I tried various publicly available solutions, but none of them worked for me. I found that any traditional approach that scrapes page elements and extracts the relevant data breaks easily whenever the markup changes. So I decided to bring GPT into the mix. I realized that if I could grab *all* of the text on the page and simply ask an LLM to extract the information for me, I wouldn't need any of that tedious element-by-element extraction. After all, reasoning is one of its strongest applications. So, after learning the basics of Selenium and studying a few existing solutions, I built a scraper of my own. Let's dive in.

actions.py

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def login(driver, email=None, password=None, cookie=None, timeout=30):
    try:
        driver.get("https://www.linkedin.com/login")

        # Wait for the login form to appear before interacting with it
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "username"))
        )

        email_elem = driver.find_element(By.ID, "username")
        email_elem.send_keys(email)

        password_elem = driver.find_element(By.ID, "password")
        password_elem.send_keys(password)
        password_elem.submit()

        # The global nav bar only renders after a successful login
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "global-nav__content"))
        )

    except Exception as e:
        print(f"Failed to log in: {e}")

We use the driver.get() function to open the LinkedIn login page, then WebDriverWait() to wait for a specific element to appear, which tells us the page has fully loaded. Once it has, we use driver.find_element() to grab the username and password fields by their id. With those elements in hand, we use send_keys() to type the values into the text boxes, and on the password element we call submit() to log in. If the global navigation bar then appears, we know we are in.
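For reference, here is a minimal sketch of driving login() on its own. It assumes chromedriver is on your PATH, and the credentials are placeholders:

from selenium import webdriver
from actions import login

driver = webdriver.Chrome()
login(driver, email="you@example.com", password="your-password")  # placeholder credentials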

entity.py

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from .objects import Scraper

class Entity(Scraper):

    __TOP_CARD = "top-card-background-hero-image"
    __WAIT_FOR_ELEMENT_TIMEOUT = 5

    def __init__(
        self,
        linkedin_url=None,
        driver=None,
        get=True,
        close_on_complete=True,
    ):
        self.driver = driver
        self.linkedin_url = linkedin_url
        # With get=True (the default), navigate to the profile as soon as the object is created
        if get and linkedin_url:
            driver.get(linkedin_url)

We create a class called Entity with two constants that we will use later. The constructor takes linkedin_url, driver, get, and close_on_complete. It stores the driver and the URL that were passed in, and when get is true it navigates straight to the profile page.
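As a quick illustration (the profile URL is a placeholder), you can either let the constructor load the page for you or pass get=False when the driver is already where it needs to be:

entity = Entity(
    linkedin_url="https://www.linkedin.com/in/some-profile/",  # placeholder URL
    driver=driver,
    get=False,  # the driver has already navigated to the page
)

The scrape() method itself looks like this: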

    def scrape(self, close_on_complete=True):
        driver = self.driver

        try:
            # Wait for the page to load
            WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            self.focus()
            self.wait(5)

            # Scroll to the bottom of the page to load all dynamic content
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            self.wait(3)  # wait for additional content to load

            # Get all the text on the page
            page_text = driver.find_element(By.TAG_NAME, "body").text

            print("Scraped Text:")
            print(page_text)

            return page_text

        except Exception as e:
            print(f"Failed to scrape the page: {e}")
            page_text = ""

        finally:
            if close_on_complete:
                driver.quit()

        return page_text

The scrape() function first checks that the body of the page is present. It then uses driver.execute_script() to scroll to the bottom of the page so that all dynamic content loads. After that, it extracts all of the text from the page with driver.find_element(By.TAG_NAME, "body").text, and we print and return the scraped text.
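The focus() and wait() calls come from the Scraper base class in objects.py. For anyone following along without the joeyism/linkedin_scraper package installed, here is a rough sketch of what those helpers do; the actual implementations in the repo may differ in detail:

import time

class Scraper:
    driver = None

    def wait(self, duration):
        # Block for a fixed number of seconds so dynamic content can settle
        time.sleep(int(duration))

    def focus(self):
        # Raise and immediately dismiss an alert to pull the browser window into focus
        self.driver.execute_script('alert("Focus window")')
        self.driver.switch_to.alert.accept()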

linkedin_scraper.py

import os
from linkedin_scraper import actions
from entity import Entity  # the Entity class defined above
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrape_linkedin(linkedin_url, email, password):
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
    chrome_options.add_argument("start-maximized")  # Start maximized
    chrome_options.add_argument("enable-automation")  # Enable automation controls
    chrome_options.add_argument("--disable-infobars")  # Disable infobars
    chrome_options.add_argument("--disable-extensions")  # Disable extensions
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
    
    # Initialize the Chrome driver with the options
    service = Service('/usr/bin/chromedriver')  # Update with the correct path to your chromedriver
    driver = webdriver.Chrome(service=service, options=chrome_options)
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # read the key from the environment, not hard-coded
    if email and password:
        try:
            # Log in to LinkedIn
            actions.login(driver, email, password)
                        
            # Wait for the LinkedIn homepage to load or for login to complete
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".global-nav__me-photo"))
            )
            
            # Navigate to the profile page
            driver.get(linkedin_url)
            
            # Wait for the profile page to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".pv-top-card__photo-wrapper.ml0"))
            )
            
            # Create an Entity object for the LinkedIn profile
            entity = Entity(linkedin_url=linkedin_url, driver=driver, get=False)  # get=False: the driver is already on the profile page
            
            # Scrape the LinkedIn profile data
            linkedin_data = entity.scrape(close_on_complete=True)  # Close browser after scraping
            
            prompt = """Extract and summarize the LinkedIn profile data into the following format:
                linkedin_data = {
                    "name": person.name,
                    "linkedin_url": person.linkedin_url,
                    "about": person.about,
                    "experiences": [str(exp) for exp in person.experiences],
                    "educations": [str(edu) for edu in person.educations],
                    "interests": [str(interest) for interest in person.interests],
                    "accomplishments": [str(accomplishment) for accomplishment in person.accomplishments],
                    "company": person.company,
                    "job_title": person.job_title
                }
            
            """
        
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0.1,
                messages=[
                    {"role": "system", "content": f'{prompt}'},
                    {"role": "user", "content": f'parse the following data: {linkedin_data}'}
                ]
            )

            print(response.choices[0].message.content)
            return response.choices[0].message.content
            
        except Exception as e:
            print(f"Failed to scrape {linkedin_url}: {e}")
            linkedin_data = {}
        
        finally:
            driver.quit()
    else:
        return {}

In our main scrape_linkedin() function, we start by putting the necessary setup in place: chrome_options, the service, the driver, and the OpenAI client. We then log in by calling login() with the driver, email, and password. Next, we use WebDriverWait() to confirm that we have landed on the home page, checking for the profile photo in the navigation bar. Once that is confirmed, we use driver.get() to navigate to the profile page we want to scrape.

Once we reach that page, we use WebDriverWait() again to check for the profile photo. If it is present, we initialize an Entity object and run its scrape() function to pull all of the text off the page. With that data in hand, we send it to OpenAI through the Chat Completions API so the model can extract the fields we need from that wall of text. Finally, we print and return the result.
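Here is a sketch of how you might invoke the whole pipeline. The profile URL is a placeholder, and LINKEDIN_EMAIL / LINKEDIN_PASSWORD are hypothetical environment variable names:

import os

profile = scrape_linkedin(
    "https://www.linkedin.com/in/some-profile/",  # placeholder profile URL
    os.environ["LINKEDIN_EMAIL"],     # hypothetical env var holding your login email
    os.environ["LINKEDIN_PASSWORD"],  # hypothetical env var holding your password
)
print(profile)

If you need the model's output to be machine-readable, you could also pass response_format={"type": "json_object"} to client.chat.completions.create() and ask for JSON in the prompt, so the reply parses cleanly with json.loads().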

This entire solution was inspired by the following GitHub repo: https://github.com/joeyism/linkedin_scraper. Be sure to check it out.

Source: https://fatehaliaamir.medium.com/build-a-linkedin-scraper-using-selenium-and-openais-gpt-4o-mini-d0f5a61e9902
