Debug School

rakesh kumar

How to display all the header tags from a website and build a DataFrame using the BeautifulSoup library for web scraping

Step 1: Install the required libraries

pip install requests
pip install beautifulsoup4
pip install pandas

Step 2: Import the libraries

from bs4 import BeautifulSoup
import requests

Step 3: Send an HTTP GET request to the URL and check the response (status code 200)

page  = requests.get('https://www.wikipedia.org')
page

output

<Response [200]>
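
Before parsing, it can help to fail fast on a bad response. A minimal sketch using the standard requests attributes status_code and raise_for_status (the timeout value is just an example):

page = requests.get('https://www.wikipedia.org', timeout=10)
print(page.status_code)   # 200 means the request succeeded
page.raise_for_status()   # raises requests.HTTPError for 4xx/5xx responses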

Step 4: Check the page content

soup = BeautifulSoup(page.content, 'html.parser')
soup

output

(screenshot of the full parsed HTML of the page)
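
Printing the whole soup object is noisy; as a quick sanity check you can inspect just a few pieces of the parsed tree (standard BeautifulSoup attributes):

print(soup.title)             # the page's <title> element
print(soup.prettify()[:500])  # first 500 characters of the formatted HTML
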
Step 5: Get the header tag information

header_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
header_tags

output

(screenshot of the list of heading elements returned by find_all)
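
The same elements can also be collected with a CSS selector via soup.select, which should return the same heading elements here:

header_tags = soup.select('h1, h2, h3, h4, h5, h6')
print(len(header_tags))   # how many heading elements were found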

Step 6: Append all header tag names and store them in a list

name=[]
for tag in header_tags:
    name.append(tag.name)
name

output
['h1', 'h2', 'h2', 'h2', 'h2', 'h2']

Step 7: Append all header text content and store it in a list

text=[]
for tag in header_tags:
    text.append(tag.text.strip())
text

output
['Wikipedia\n\nThe Free Encyclopedia',
'1\xa0000\xa0000+\n\n\narticles',
'100\xa0000+\n\n\narticles',
'10\xa0000+\n\n\narticles',
'1\xa0000+\n\n\narticles',
'100+\n\n\narticles']
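
The two loops above can also be written as list comprehensions, which do exactly the same thing in one line each:

name = [tag.name for tag in header_tags]
text = [tag.text.strip() for tag in header_tags]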

Step 8: Make a DataFrame


import pandas as pd
df = pd.DataFrame({'tag': name, 'text': text})
df

output
(screenshot of the DataFrame with the tag and text columns)
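
If you want to keep the result, the DataFrame can be written to disk with to_csv; headers.csv below is just an example filename:

df.to_csv('headers.csv', index=False)   # save the tag/text table without the index column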

Step 9: Make a DataFrame with only the header tag names

import pandas as pd
df = pd.DataFrame({'tag': name})
df

(screenshot of the DataFrame with the tag column only)
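
A quick way to see how many headings of each level the page has is value_counts on the tag column; with the list collected above this would show h2 five times and h1 once:

print(df['tag'].value_counts())   # counts per heading level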

Alternative methods

import requests
import pandas as pd
from bs4 import BeautifulSoup

# URL of the web page to scrape
url = 'https://en.wikipedia.org/wiki/Main_Page'

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the web page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all header tags (h1, h2, h3, etc.)
    header_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

    # Iterate through the header tags and collect their name and text
    # (DataFrame.append was removed in pandas 2.0, so build the rows first)
    rows = []
    for tag in header_tags:
        rows.append({'Tag': tag.name, 'Text': tag.get_text()})

    # Create a DataFrame to store the header tags
    header_df = pd.DataFrame(rows, columns=['Tag', 'Text'])

    # Display the DataFrame with header tags
    print(header_df)

else:
    print('Failed to retrieve the web page.')
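
Some sites reject requests that use the default requests User-Agent. As a sketch, you can pass your own headers to requests.get (the User-Agent string below is only an example):

headers = {'User-Agent': 'Mozilla/5.0 (compatible; header-scraper/1.0)'}
response = requests.get(url, headers=headers, timeout=10)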

==================================================

import requests
import pandas as pd
from bs4 import BeautifulSoup

# URL of the web page to scrape
url = 'https://en.wikipedia.org/wiki/Main_Page'

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the web page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all header tags (h1, h2, h3, h4, h5, h6)
    header_tags = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

    # Create a list to store the names of header tags
    header_tag_names = []

    # Iterate through the header tags and extract their names
    for tag in header_tags:
        header_tag_names.append(tag.name)

    # Display the list of header tag names
    print(header_tag_names)

else:
    print('Failed to retrieve the web page.')
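
If you only need a count per heading level rather than the full list, collections.Counter from the standard library does it in one call:

from collections import Counter
print(Counter(header_tag_names))   # maps each tag name to how often it occurs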

=================================

import requests
from bs4 import BeautifulSoup

# scraping a GeeksforGeeks article
url_link = 'https://www.geeksforgeeks.org/how-to-scrape-all-pdf-files-in-a-website/'
request = requests.get(url_link)

soup = BeautifulSoup(request.text, 'lxml')

# creating a list of all common heading tags
heading_tags = ["h1", "h2", "h3"]
for tags in soup.find_all(heading_tags):
    print(tags.name + ' -> ' + tags.text.strip())

Output

h2 -> Related Articles
h2 -> Python3
h2 -> Python3
h2 -> Python3
h2 -> Python3
h2 -> Python3
h2 -> Python3
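
Note that this last variant parses with 'lxml', which is a separate package (pip install lxml). If you prefer not to install it, the built-in parser is a drop-in replacement here:

soup = BeautifulSoup(request.text, 'html.parser')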
