{"id":7232,"date":"2024-08-12T14:48:00","date_gmt":"2024-08-12T06:48:00","guid":{"rendered":"https:\/\/aict.nkust.edu.tw\/digitrans\/?p=7232"},"modified":"2024-12-13T21:11:25","modified_gmt":"2024-12-13T13:11:25","slug":"%e4%bd%bf%e7%94%a8-selenium-%e5%92%8c-openai-%e7%9a%84-gpt-4o-mini-%e5%bb%ba%e7%ab%8b-linkedin-scraper","status":"publish","type":"post","link":"https:\/\/aict.nkust.edu.tw\/digitrans\/?p=7232","title":{"rendered":"\u4f7f\u7528 Selenium \u548c OpenAI \u7684 GPT 4o-Mini \u5efa\u7acb LinkedIn Scraper"},"content":{"rendered":"\n<p>2024-08-12 | Fateh Ali Aamir<\/p>\n\n\n\n<p id=\"bbc2\">\u6211\u60f3\u5f9e\u982d\u958b\u59cb\u5efa\u7acb\u4e00\u500b LinkedIn \u6293\u53d6\u5668\u3002\u6211\u5617\u8a66\u67e5\u770b\u5404\u7a2e\u516c\u958b\u53ef\u7528\u7684\u89e3\u6c7a\u65b9\u6848\uff0c\u4f46\u90fd\u4e0d\u9069\u5408\u6211\u3002\u6211\u767c\u73fe\uff0c\u4efb\u4f55\u50b3\u7d71\u900f\u904e\u6293\u53d6\u7db2\u9801\u5143\u7d20\u4e26\u63d0\u53d6\u76f8\u95dc\u8cc7\u6599\u7684\u65b9\u6cd5\u90fd\u5f88\u5bb9\u6613\u56e0\u70ba\u7a0b\u5f0f\u78bc\u7684\u8b8a\u52d5\u800c\u5931\u6548\u3002\u6240\u4ee5\u6211\u6c7a\u5b9a\u5c07 GPT \u52a0\u5165\u5176\u4e2d\u3002\u6211\u610f\u8b58\u5230\uff0c\u5982\u679c\u6211\u80fd\u7372\u53d6\u9801\u9762\u4e0a\u7684\u300c\u6240\u6709\u300d\u6587\u5b57\uff0c\u7136\u5f8c\u76f4\u63a5\u8acb LLM \u5e6b\u6211\u63d0\u53d6\u8cc7\u8a0a\uff0c\u5c31\u4e0d\u9700\u8981\u505a\u90a3\u4e9b\u7e41\u7463\u7684\u5143\u7d20\u63d0\u53d6\u3002\u7562\u7adf\uff0c\u63a8\u7406\u662f\u5b83\u6700\u5f37\u5927\u7684\u61c9\u7528\u4e4b\u4e00\u3002\u56e0\u6b64\uff0c\u5728\u5b78\u7fd2\u4e86 Selenium \u7684\u57fa\u672c\u77e5\u8b58\u4e26\u7814\u7a76\u4e86\u4e00\u4e9b\u73fe\u6709\u7684\u89e3\u6c7a\u65b9\u6848\u5f8c\uff0c\u6211\u5efa\u7acb\u4e86\u4e00\u500b\u5c6c\u65bc\u81ea\u5df1\u7684\u6293\u53d6\u5668\u3002\u63a5\u4e0b\u4f86\u6211\u5011\u4f86\u6df1\u5165\u4e86\u89e3\u4e00\u4e0b\u3002<\/p>\n\n\n\n<p id=\"7a2d\"><strong>actions.py<\/strong><\/p>\n\n\n\n<pre 
class=\"wp-block-preformatted\">import getpass\nfrom . import constants as c\nfrom selenium.webdriver.support.wait import WebDriverWait\nfrom selenium.webdriver.common.by import By\nfrom selenium.webdriver.support import expected_conditions as EC\n\ndef login(driver, email=None, password=None, cookie = None, timeout=30):\n    try:   \n        driver.get(\"https:\/\/www.linkedin.com\/login\")\n        element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((\"id\", \"username\")))\n    \n        email_elem = driver.find_element(\"id\",\"username\")\n        email_elem.send_keys(email)\n    \n        password_elem = driver.find_element(\"id\",\"password\")\n        password_elem.send_keys(password)\n        password_elem.submit()\n       \n        element = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((\"class name\", \"global-nav__content\"))\n        \n    except Exception as e:\n        print(f\"Failed to log in: {e}\")<\/pre>\n\n\n\n<p id=\"74df\">\u6211\u5011\u4f7f\u7528<em>driver.get()<\/em>\u51fd\u6578\u4f86\u53d6\u5f97 LinkedIn 
\u767b\u5165\u7db2\u9801\u3002\u6211\u5011\u4f7f\u7528<em>WebDriverWait()<\/em>\u51fd\u6578\u4f86\u6aa2\u67e5\u7db2\u9801\u4e0a\u7684\u7279\u5b9a\u5143\u7d20\u3002\u6211\u5011\u4f7f\u7528\u6b64\u51fd\u6578\u4f86\u7b49\u5f85\u7db2\u9801\u5b8c\u5168\u8f09\u5165\u3002\u9801\u9762\u8f09\u5165\u5f8c\uff0c\u6211\u5011\u4f7f\u7528<em>driver.find_element()<\/em>\u4f86\u53d6\u5f97\u4f7f\u7528\u8005\u540d\u7a31<em>id<\/em>\u548c\u5bc6\u78bc<em>id<\/em>\u3002\u7372\u5f97\u9019\u4e9b\u5143\u7d20\u5f8c\uff0c\u6211\u5011\u4f7f\u7528<em>send_keys()<\/em>\u51fd\u6578\u5c07\u503c\u653e\u5165\u6587\u5b57\u65b9\u584a\u4e2d\u3002\u5c0d\u65bc\u5bc6\u78bc\u5143\u7d20\uff0c\u6211\u5011\u4f7f\u7528<em>submit()<\/em>\u51fd\u6578\u767b\u5165\u3002\u6700\u5f8c\uff0c\u6211\u5011\u7b49\u5f85<em>global-nav__content<\/em>\u5143\u7d20\u8f09\u5165\u3002\u5982\u679c\u6211\u5011\u5f97\u5230\u4e86\u9019\u500b\u5143\u7d20\uff0c\u5c31\u610f\u5473\u8457\u6211\u5011\u5df2\u7d93\u767b\u5165\u4e86\u3002<\/p>\n\n\n\n<p id=\"9936\"><strong>entity.py<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">import requests\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom selenium.webdriver.support.ui import WebDriverWait\nfrom selenium.webdriver.support import expected_conditions as EC\nfrom selenium.common.exceptions import NoSuchElementException\nfrom .objects import Experience, Education, Scraper, Interest, Accomplishment, Contact\nimport os\nfrom linkedin_scraper import selectors\n\nclass Entity(Scraper):\n\n    __TOP_CARD = \"top-card-background-hero-image\"\n    __WAIT_FOR_ELEMENT_TIMEOUT = 5\n\n    def __init__(\n        self,\n        linkedin_url=None,\n        driver=None,\n        get=True,\n        close_on_complete=True,\n    ):\n        self.driver = driver<\/pre>\n\n\n\n<p
id=\"ea2a\">\u6211\u5011\u5275\u5efa\u4e86\u4e00\u500b\u540d\u70ba<em>Entity<\/em>\u7684\u985e\uff0c\u5176\u4e2d\u6709\u5169\u500b\u7a0d\u5f8c\u5c07\u4f7f\u7528\u7684\u5e38\u6578\u3002\u5efa\u69cb\u5b50\u4f7f\u7528<em>linkedin_url<\/em>\u3001<em>driver<\/em>\u3001<em>get<\/em>\u548c<em>close_on_complete<\/em>\u9032\u884c\u521d\u59cb\u5316\u3002\u7136\u5f8c\uff0c\u5b83\u6839\u64da\u547c\u53eb\u5efa\u69cb\u51fd\u6578\u6642\u50b3\u5165\u7684\u5167\u5bb9\u8a2d\u5b9a\u9a45\u52d5\u7a0b\u5f0f\u3002<\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">def scrape(self, close_on_complete=True):    <br>    driver = self.driver<br><br>    try:<br>        # Wait for the page to load<br>        WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(<br>            EC.presence_of_element_located(<br>                (<br>                    \"tag name\", \"body\"<br>                )<br>            )<br>        )<br>        self.focus()<br>        self.wait(5)<br>        <br>        # Scroll to the bottom of the page to load all dynamic content<br>        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")<br>        self.wait(3)  # wait for additional content to load<br>        <br>        # Get all the text on the page<br>        page_text = driver.find_element(\"tag name\", \"body\").text<br>        <br>        print(\"Scraped Text:\")<br>        print(page_text)<br>        <br>        return page_text<br>        <br>    except Exception as e:<br>        print(f\"Failed to scrape the page: {e}\")<br>        page_text = \"\"<br><br>    finally:<br>        if close_on_complete:<br>            driver.quit()<br><br>    return page_text<\/pre>\n\n\n\n<p 
id=\"e629\"><em>scrape()<\/em>\u51fd\u6578\u5c07\u6aa2\u67e5\u9801\u9762\u4e0a\u662f\u5426\u5b58\u5728<em>body<\/em>\u5143\u7d20\u3002\u7136\u5f8c\u5b83\u5c07\u4f7f\u7528<em>driver.execute_script()<\/em>\u51fd\u6578\u6efe\u52d5\u5230\u9801\u9762\u5e95\u90e8\u4ee5\u8f09\u5165\u6240\u6709\u5167\u5bb9\u3002\u4e4b\u5f8c\uff0c\u5b83\u5c07\u4f7f\u7528<em>driver.find_element(\"tag name\", \"body\").text<\/em>\u5f9e\u7db2\u9801\u4e2d\u63d0\u53d6\u6240\u6709\u6587\u672c\uff0c\u7136\u5f8c\u6211\u5011\u5c07\u5217\u5370\u4e26\u8fd4\u56de\u6293\u53d6\u7684\u6587\u672c\u3002<\/p>\n\n\n\n<p id=\"e52f\"><strong>linkedin_scraper.py<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-preformatted\">from linkedin_scraper import Person, actions\nfrom openai import OpenAI \nfrom selenium import webdriver\nfrom selenium.webdriver.chrome.service import Service\nfrom selenium.webdriver.chrome.options import Options\nfrom selenium.webdriver.common.by import By\nfrom selenium.webdriver.support.ui import WebDriverWait\nfrom selenium.webdriver.support import expected_conditions as EC\n\ndef scrape_linkedin(linkedin_url, email, password):\n    # Set up Chrome options\n    chrome_options = Options()\n    chrome_options.add_argument(\"--headless\")  # Run in headless mode\n    chrome_options.add_argument(\"--disable-gpu\")  # Disable GPU acceleration\n    chrome_options.add_argument(\"--no-sandbox\")  # Bypass OS security model\n    chrome_options.add_argument(\"--disable-dev-shm-usage\")  # Overcome limited resource problems\n    chrome_options.add_argument(\"start-maximized\")  # Start maximized\n    chrome_options.add_argument(\"enable-automation\")  # Enable automation controls\n    chrome_options.add_argument(\"--disable-infobars\")  # Disable infobars\n    chrome_options.add_argument(\"--disable-extensions\")  # Disable extensions\n    chrome_options.add_argument(\"user-agent=Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/58.0.3029.110
Safari\/537.36\")\n    \n    # Initialize the Chrome driver with the options\n    service = Service('\/usr\/bin\/chromedriver')  # Update with the correct path to your chromedriver\n    driver = webdriver.Chrome(service=service, options=chrome_options)\n    client = OpenAI(api_key=\"OPENAI_API_KEY\")\n    if email and password:\n        try:    \n                \n            # Log in to LinkedIn\n            actions.login(driver, email, password)\n                        \n            # Wait for the LinkedIn homepage to load or for login to complete\n            WebDriverWait(driver, 20).until(\n                EC.presence_of_element_located((By.CSS_SELECTOR, \".global-nav__me-photo\"))\n            )\n            \n            # Navigate to the profile page\n            driver.get(linkedin_url)\n            \n            # Wait for the profile page to load\n            WebDriverWait(driver, 20).until(\n                EC.presence_of_element_located((By.CSS_SELECTOR, \".pv-top-card__photo-wrapper.ml0\"))\n            )\n            \n            # Create an Entity object for the LinkedIn profile\n            entity = Entity(linkedin_url=linkedin_url, driver=driver, scrape=False)\n            \n            # Scrape the LinkedIn profile data\n            linkedin_data = entity.scrape(close_on_complete=True)  # Close browser after scraping\n            \n            prompt = \"\"\"Extract and summarize the LinkedIn profile data into the following format:\n                linkedin_data = {\n                    \"name\": person.name,\n                    \"linkedin_url\": person.linkedin_url,\n                    \"about\": person.about,\n                    \"experiences\": [str(exp) for exp in person.experiences],\n                    \"educations\": [str(edu) for edu in person.educations],\n                    \"interests\": [str(interest) for interest in person.interests],\n                    \"accomplishments\": [str(accomplishment) for accomplishment in 
person.accomplishments],\n                    \"company\": person.company,\n                    \"job_title\": person.job_title\n                }\n            \n            \"\"\"\n        \n            response = client.chat.completions.create(\n                model=\"gpt-4o-mini\",\n                temperature=0.1,\n                messages=[\n                    {\"role\": \"system\", \"content\": f'{prompt}'},\n                    {\"role\": \"user\", \"content\": f'parse the following data: {linkedin_data}'}\n                ]\n            )\n\n            print(response.choices[0].message.content)\n            return response.choices[0].message.content\n            \n        except Exception as e:\n            print(f\"Failed to scrape {linkedin_url}: {e}\")\n            linkedin_data = {}\n        \n        finally:\n            driver.quit()\n    else:\n        return {}<\/pre>\n\n\n\n<p id=\"9e2e\">\u5728\u6211\u5011\u7684\u4e3b\u8981 <code>scrape_linkedin()<\/code> \u51fd\u5f0f\u4e2d\uff0c\u6211\u5011\u9996\u5148\u6703\u5efa\u7acb\u4e00\u4e9b\u5fc5\u8981\u7684\u8a2d\u5b9a\u3002\u9019\u4e9b\u8a2d\u5b9a\u5305\u62ec <code>chrome_options<\/code>\u3001<code>service<\/code>\u3001<code>driver<\/code> \u548c OpenAI \u5ba2\u6236\u7aef\u3002\u63a5\u8457\uff0c\u6211\u5011\u6703\u4f7f\u7528 <code>login()<\/code> \u51fd\u5f0f\u8207 <code>driver<\/code>\u3001\u96fb\u5b50\u90f5\u4ef6\u548c\u5bc6\u78bc\u9032\u884c\u767b\u5165\u3002\u7136\u5f8c\u6211\u5011\u4f7f\u7528 <code>WebDriverWait()<\/code> \u6aa2\u67e5\u662f\u5426\u5df2\u5230\u9054\u9996\u9801\uff0c\u6211\u5011\u6703\u6aa2\u67e5\u5c0e\u89bd\u5217\u4e2d\u7684\u500b\u4eba\u8cc7\u6599\u7167\u7247\u3002\u78ba\u8a8d\u5f8c\uff0c\u6211\u5011\u6703\u4f7f\u7528 <code>driver.get()<\/code> \u51fd\u5f0f\u524d\u5f80\u6211\u5011\u60f3\u8981\u6293\u53d6\u7684\u500b\u4eba\u8cc7\u6599\u9801\u9762\u3002<\/p>\n\n\n\n<p>\u4e00\u65e6\u5230\u9054\u8a72\u9801\u9762\uff0c\u6211\u5011\u6703\u518d\u6b21\u4f7f\u7528
<code>WebDriverWait()<\/code> \u6aa2\u67e5\u500b\u4eba\u8cc7\u6599\u7167\u7247\u3002\u5982\u679c\u78ba\u8a8d\u5b58\u5728\uff0c\u6211\u5011\u6703\u521d\u59cb\u5316\u4e00\u500b <code>Entity<\/code> \u7269\u4ef6\uff0c\u7136\u5f8c\u57f7\u884c <code>scrape()<\/code> \u51fd\u5f0f\u4ee5\u5f9e\u7db2\u9801\u4e2d\u64f7\u53d6\u6240\u6709\u6587\u5b57\u8cc7\u6599\u3002\u4e00\u65e6\u53d6\u5f97\u9019\u4e9b\u8cc7\u6599\uff0c\u6211\u5011\u53ef\u4ee5\u4f7f\u7528 OpenAI \u7684 Chat Completions API \u5c07\u8cc7\u6599\u50b3\u9001\u81f3 OpenAI\uff0c\u8b93\u5176\u5f9e\u9019\u5927\u91cf\u6587\u5b57\u4e2d\u64f7\u53d6\u6240\u9700\u8cc7\u6599\u3002\u6700\u5f8c\uff0c\u6211\u5011\u53ef\u4ee5\u6253\u5370\u4e26\u8fd4\u56de\u9019\u4e9b\u8cc7\u6599\u3002<\/p>\n\n\n\n<p id=\"30bb\">\u6574\u500b\u89e3\u6c7a\u65b9\u6848\u662f\u53c3\u8003\u4ee5\u4e0b\u7684 Github repo \u6240\u555f\u767c\u7684\uff1ahttps:\/\/github.com\/joeyism\/linkedin_scraper\u3002\u8acb\u52d9\u5fc5\u67e5\u770b\u3002<\/p>\n\n\n\n<p>\u8cc7\u6599\u4f86\u6e90: <a href=\"https:\/\/fatehaliaamir.medium.com\/build-a-linkedin-scraper-using-selenium-and-openais-gpt-4o-mini-d0f5a61e9902\">https:\/\/fatehaliaamir.medium.com\/build-a-linkedin-scraper-using-selenium-and-openais-gpt-4o-mini-d0f5a61e9902<\/a><\/p>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>2024-08-12 | Fateh Ali Aamir \u6211\u60f3\u5f9e\u982d\u958b\u59cb\u5efa\u7acb\u4e00\u500b LinkedIn 
\u6293\u53d6\u5668\u3002\u6211\u5617\u8a66\u67e5\u770b\u5404\u7a2e\u516c\u958b\u53ef\u7528\u7684\u89e3\u6c7a\u65b9\u6848\uff0c\u4f46\u90fd\u4e0d\u9069\u5408\u6211\u3002\u6211\u767c\u73fe\uff0c\u4efb\u4f55\u50b3\u7d71\u900f\u904e\u6293\u53d6\u7db2&hellip;<\/p>\n","protected":false},"author":4,"featured_media":7233,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_jetpack_newsletter_access":"","_jetpack_dont_email_post_to_subs":false,"_jetpack_newsletter_tier_id":0,"_jetpack_memberships_contains_paywalled_content":false,"_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_post_was_ever_published":false},"categories":[579,4],"tags":[40],"class_list":["post-7232","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-579","category-industry-news","tag-40"],"gutentor_comment":0,"jetpack_featured_media_url":"https:\/\/i0.wp.com\/aict.nkust.edu.tw\/digitrans\/wp-content\/uploads\/2024\/10\/%E8%9E%A2%E5%B9%95%E6%93%B7%E5%8F%96%E7%95%AB%E9%9D%A2-2024-10-18-145056.png?fit=674%2C479&ssl=1","jetpack-related-posts":[],"jetpack_sharing_enabled":true,"_links":{"self":[{"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=\/wp\/v2\/posts\/7232","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=\/wp\/v2\/users\/4"}],"replies":[{"embeddable":true,"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=7232"}],"version-history":[{"count":5,"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=\/wp\/v2\/posts\/7232\/revisions"}],"predecessor-version":[{"id":7251,"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.
php?rest_route=\/wp\/v2\/posts\/7232\/revisions\/7251"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=\/wp\/v2\/media\/7233"}],"wp:attachment":[{"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=7232"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=7232"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/aict.nkust.edu.tw\/digitrans\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=7232"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}