이디야커피는 스타벅스 매장이 위치하는 곳에 매장을 위치시키는가?¶

In [4]:

import mysql.connector
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import googlemaps
import time
import warnings
from bs4 import BeautifulSoup 
warnings.simplefilter(action='ignore')
import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns

DB 생성¶

-- Create the schema that holds the crawled store data, then make it the default schema.
CREATE DATABASE coffee;

-- MySQL selects a schema with "USE <name>"; "use database <name>" is a syntax error.
USE coffee;

Table 생성¶

-- Lookup table of coffee brands; COFFEE_STORE.brand references COFFEE_BRAND.id.
CREATE TABLE COFFEE_BRAND
    (
        id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
        name varchar(10)
    );

    -- The crawler cells hard-code these ids (1 = starbucks, 2 = ediya) in their INSERT statements.
    -- Each statement is terminated so the snippet can be run as one script.
    INSERT INTO COFFEE_BRAND (id, name) VALUES (1, 'starbucks');
    INSERT INTO COFFEE_BRAND (id, name) VALUES (2, 'ediya');

-- One row per crawled store; brand points at COFFEE_BRAND.id.
CREATE TABLE COFFEE_STORE (
    id       INT             NOT NULL AUTO_INCREMENT PRIMARY KEY,
    brand    INT,
    name     VARCHAR(32)     NOT NULL,
    gu_name  VARCHAR(5)      NOT NULL,
    address  VARCHAR(128)    NOT NULL,
    lat      DECIMAL(16,14)  NOT NULL,
    lng      DECIMAL(17,14)  NOT NULL,
    FOREIGN KEY (brand) REFERENCES COFFEE_BRAND (id)
);

데이터 수집: 스타벅스¶

서울 전체 지역 스타벅스 매장 정보

In [ ]:

# Explicit-wait helpers from selenium (already a dependency of this notebook).
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

options = webdriver.ChromeOptions()

prefs = {
    'download.default_directory': '/home/haneol/dev_ws/EDA/data/',
    # Chrome's preference is spelled "prompt_for_download"; the previous
    # 'download.propt_for_download' key did not match it.
    'download.prompt_for_download': False,
}

options.add_experimental_option('prefs', prefs)

url = 'https://www.starbucks.co.kr/store/store_map.do'
driver = webdriver.Chrome(
    service=Service('../driver/chromedriver-linux64/chromedriver'),
    options=options
)

try:
    driver.get(url)

    # The page fills its panels asynchronously, so wait for each element
    # instead of clicking immediately after the previous action.
    wait = WebDriverWait(driver, 10)

    # Region search
    xpath_one ='//*[@id="container"]/div/form/fieldset/div/section/article[1]/article/header[2]'
    some_tag_one = wait.until(EC.element_to_be_clickable((By.XPATH, xpath_one)))
    some_tag_one.click()

    # Seoul
    xpath_seoul ='//*[@id="container"]/div/form/fieldset/div/section/article[1]/article/article[2]/div[1]/div[2]/ul/li[1]/a'
    some_tag_seoul = wait.until(EC.element_to_be_clickable((By.XPATH, xpath_seoul)))
    some_tag_seoul.click()

    # Seoul: all districts
    xpath_seoul_all ='//*[@id="mCSB_2_container"]/ul/li[1]/a'
    some_tag_seoul_all = wait.until(EC.element_to_be_clickable((By.XPATH, xpath_seoul_all)))
    some_tag_seoul_all.click()

    # Do not snapshot the page before the store list has been rendered.
    # NOTE(review): if result items can already exist before this search,
    # this wait passes early — confirm against the live page.
    wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'li.quickResultLstCon'))
    )

    star_req = driver.page_source
finally:
    # quit() ends the whole browser session (close() only closes the window),
    # and the finally clause releases it even when a step above fails.
    driver.quit()

# BeautifulSoup: one <li class="quickResultLstCon"> per store
star_soup = BeautifulSoup(star_req, "html.parser")

test_soup = star_soup.find_all("li", {"class":"quickResultLstCon"})

크롤링 데이터 DB에 저장¶

In [ ]:

# Connect to the AWS RDS MySQL instance
conn = mysql.connector.connect(
    host = '******************',
    port = 3306,
    user = '****',
    password = '****',
    database = 'coffee'
)
cursor = conn.cursor(buffered=True)

# brand is fixed to 1, the id inserted for 'starbucks' in COFFEE_BRAND.
sql = "INSERT INTO COFFEE_STORE (brand, name, gu_name, address, lat, lng) VALUES (1 ,%s,%s,%s,%s,%s)"

# Number of trailing characters cut from the <p> text to obtain the address.
# Presumably a phone number printed after the address — TODO confirm.
TRAILING_CHARS = 9

try:
    # Iterate over every scraped element. The previous range(1, len(...))
    # started at index 1, so the first store was never inserted.
    # NOTE(review): tqdm_notebook is not imported in any visible cell;
    # it needs e.g. `from tqdm import tqdm_notebook` to be run beforehand.
    for store in tqdm_notebook(test_soup):
        name = store["data-name"].strip()
        lat = store["data-lat"].strip()
        lng = store["data-long"].strip()
        address = store.find("p").get_text()[:-TRAILING_CHARS]
        # The second whitespace-separated token of the address is used as the district.
        gu_name = address.split()[1]

        cursor.execute(sql, (name, gu_name, address, lat, lng))

    # Single commit: the load is all-or-nothing, so a failed run leaves no
    # partial rows that a re-run would duplicate.
    conn.commit()
except Exception:
    conn.rollback()
    raise
finally:
    cursor.close()
    conn.close()

데이터 수집: 이디야¶

서울 전체 지역 이디야 매장 정보

In [ ]:

# Open the Ediya store-finder page
options = webdriver.ChromeOptions()

prefs = {
    'download.default_directory': '/home/haneol/dev_ws/EDA/data/',
    # Chrome's preference is spelled "prompt_for_download"; the previous
    # 'download.propt_for_download' key did not match it.
    'download.prompt_for_download': False,
}

options.add_experimental_option('prefs', prefs)

url = 'https://www.ediya.com/contents/find_store.html#c'
driver = webdriver.Chrome(
    service=Service('../driver/chromedriver-linux64/chromedriver'),
    options=options,
)
driver.get(url)

# Select the "address" search entry
xpath_address = '//*[@id="contentWrap"]/div[3]/div/div[1]/ul/li[2]/a'
some_tag_address = driver.find_element(By.XPATH, xpath_address)
some_tag_address.click()

# "서울 + 구"로 검색
import os

conn = mysql.connector.connect(
    host = '********************',
    port = 3306,
    user = '***',
    password = '****',
    database = 'coffee'
)
cursor = conn.cursor(buffered=True)

try:
    # Build one "서울 <gu>" search keyword per district already stored in the DB.
    cursor.execute("select distinct(gu_name) from COFFEE_STORE")
    result = cursor.fetchall()
    gu_list = [str("서울 ") + row[0] for row in result]

    # The API key is read from the environment instead of being committed with
    # the notebook; a missing variable raises KeyError. The key that used to be
    # hard-coded here has been published and should be revoked.
    gmaps_key = os.environ["GOOGLE_MAPS_API_KEY"]
    gmaps = googlemaps.Client(key=gmaps_key)

    # brand is fixed to 2, the id inserted for 'ediya' in COFFEE_BRAND.
    sql = "INSERT INTO COFFEE_STORE (brand, name, gu_name, address, lat, lng) VALUES (2 ,%s,%s,%s,%s,%s)"

    # NOTE(review): tqdm_notebook is not imported in any visible cell.
    for num in tqdm_notebook(gu_list):  # TODO: 광명시 (Gwangmyeong-si) still needs to be removed
        some_tag = driver.find_element(By.XPATH, '//*[@id="keyword"]')
        some_tag.send_keys(num)
        time.sleep(1)
        driver.find_element(By.XPATH, '//*[@id="keyword_div"]/form/button').click()
        time.sleep(1)
        edi_req = driver.page_source
        edi_soup = BeautifulSoup(edi_req, "html.parser")

        # Parse the result list once per page instead of once per field.
        for item in edi_soup.find_all("li", {"class":"item"}):
            name = item.find("dt").text  # store name
            e = item.get_text()          # full item text, stored as the address
            tokens = e.split()
            gu_name = tokens[2]          # district

            # Geocoding query: tokens 1-4, or tokens 1-3 plus the first token
            # without its last character when the text has only four tokens.
            # NOTE(review): the tokens are concatenated without spaces, as before.
            if len(tokens) > 4:
                addess = tokens[1] + tokens[2] + tokens[3] + tokens[4]
            else:
                addess = tokens[1] + tokens[2] + tokens[3] + tokens[0][0:-1]

            # One API call per store; previously lat and lng each triggered one.
            geocoded = gmaps.geocode(addess)
            if not geocoded:
                # An empty result used to raise IndexError and abort the crawl.
                print(f"geocoding returned no result, skipped: {name} ({addess})")
                continue
            location = geocoded[0].get("geometry")["location"]
            lat = location["lat"]
            lng = location["lng"]

            cursor.execute(sql,(name,gu_name,e,lat,lng))
            conn.commit()

        time.sleep(1)
        some_tag.clear()
finally:
    # Release the browser and the DB connection even when a step above fails.
    driver.quit()
    cursor.close()
    conn.close()

CSV 파일로 저장¶

In [ ]:

# Dump the whole COFFEE_STORE table into a CSV file.
conn = mysql.connector.connect(
    host = '*****************',
    port = 3306,
    user = '****',
    password = '***',
    database = 'coffee'
)

try:
    cursor = conn.cursor(buffered=True)

    sql =  "select * from COFFEE_STORE"

    cursor.execute(sql)
    result = cursor.fetchall()
    # cursor.description holds one entry per result column; item 0 is the column name.
    field_names = [column[0] for column in cursor.description]
    num_fields = len(field_names)
finally:
    # Close the connection even if the query fails.
    conn.close()

# Passing the names to the constructor also works for an empty table;
# assigning df.columns afterwards raises a length mismatch when there are no rows.
df = pd.DataFrame(result, columns=field_names)

df.to_csv('../data/coffee_output.csv', index = False, encoding = "utf-8")

EDIYA & STARBUCKS 위치 정보 EDA¶

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns

In [29]:

# Load the exported store table and split it by brand id
# (1 and 2 are the ids the crawler cells wrote for Starbucks and Ediya).
coffee = pd.read_csv('../data/coffee_output.csv')
coffee.tail()  # result is discarded: this is not the last expression of the cell
brand_id = coffee['brand']
Starbucks = coffee.loc[brand_id == 1]
Ediya = coffee.loc[brand_id == 2]

가정1 : 두 브랜드는 유사한 지역 별 매장 분포를 보일 것¶

서로 다른 경향
- STARBUCKS : 지역에 따른 편차가 큼
- EDIYA : 상대적으로 고르게 분포

In [50]:

# Store count per district, one panel per brand, on a shared y axis.
fig, ax = plt.subplots(ncols=2, figsize = (25,6), sharey=True)
sns.set_palette("muted")

brand_panels = (
    (ax[0], Starbucks, 'STARBUCKS'),
    (ax[1], Ediya, 'EDIYA'),
)
for axis, stores, title in brand_panels:
    per_gu = stores['gu_name'].value_counts().reset_index()
    sns.barplot(x='gu_name', y='count', data=per_gu, ax=axis)
    axis.set_xlabel('구', fontsize=15)
    axis.set_title(title, fontsize=20)
    axis.grid()

# Only the left panel carries the y-axis label.
ax[0].set_ylabel('개수', fontsize=15)
ax[1].set_ylabel('')
plt.show()

No description has been provided for this image

매장 수에 따른 상위 5개 지역¶

상위 분포 지역이 서로 다른 양상을 보인다

In [132]:

# Five districts with the most stores, one panel per brand, on a shared y axis.
fig, ax = plt.subplots(ncols=2, figsize = (15,6), sharey=True)
sns.set_palette("deep")

brand_panels = (
    (ax[0], Starbucks, 'STARBUCKS'),
    (ax[1], Ediya, 'EDIYA'),
)
for axis, stores, title in brand_panels:
    # value_counts() sorts descending, so the first five rows are the top five.
    top_five = stores['gu_name'].value_counts().reset_index()[:5]
    sns.barplot(x='gu_name', y='count', data=top_five, ax=axis)
    axis.set_xlabel('지역', fontsize=15)
    axis.set_title(title, fontsize=20)
    axis.grid()

# Only the left panel carries the y-axis label.
ax[0].set_ylabel('개수', fontsize=15)
ax[1].set_ylabel('')
plt.show()

가정2 : 이디야 매장 근방에 스타벅스가 위치할 것¶

실제로 EDIYA 주변에는 스타벅스가 있는가?¶

성인 남성 기준 걷는 속도 5km/h
83m : 걸어서 1분 거리
167m : 걸어서 2분 거리
250m : 걸어서 3분 거리

haversine : 위경도(Latitude, Longitude)로 주어진 서로 다른 두 위치 사이의 거리를 계산
close_to_SB : 특정 거리 (dist_thr) 반경에 스타벅스가 있는 이디야 지점 수를 '구' 별로 반환

In [111]:

from haversine import haversine

# Distinct districts ('gu') that have at least one Ediya store
gu_list = Ediya['gu_name'].unique()

# Per district, count the Ediya stores that have a Starbucks within dist_thr.
def close_to_SB(Starbucks, Ediya, gu_list, dist_thr):
    """Count, per district, the Ediya stores with a Starbucks within ``dist_thr``.

    Only stores of the same district (``gu_name``) are compared with each other.

    Args:
        Starbucks: DataFrame with ``gu_name``, ``lat`` and ``lng`` columns.
        Ediya: DataFrame with ``gu_name``, ``lat`` and ``lng`` columns.
        gu_list: Iterable of district names to evaluate.
        dist_thr: Distance threshold in meters (inclusive).

    Returns:
        One ``[ratio, ed_len, count, gu]`` list per entry of ``gu_list``:
        ``ed_len`` is the number of Ediya stores in the district, ``count``
        the number of them with at least one Starbucks within ``dist_thr``,
        and ``ratio`` is ``count / ed_len`` (``0.0`` when the district has
        no Ediya store).
    """
    close_to_SB_list = []
    for gu in gu_list:
        ed = Ediya[Ediya['gu_name'] == gu]
        sb = Starbucks[Starbucks['gu_name'] == gu]
        ed_len = len(ed)

        # Materialize the Starbucks coordinates once per district.
        sb_locs = list(zip(sb['lat'], sb['lng']))

        # any() stops at the first Starbucks in range, so each Ediya store
        # is counted at most once.
        count = sum(
            1
            for ed_loc in zip(ed['lat'], ed['lng'])
            if any(
                haversine(ed_loc, sb_loc, unit='m') <= dist_thr
                for sb_loc in sb_locs
            )
        )

        # A district without Ediya stores used to raise ZeroDivisionError.
        ratio = count / ed_len if ed_len else 0.0
        close_to_SB_list.append([ratio, ed_len, count, gu])

    return close_to_SB_list


# Thresholds in meters for a 1, 2 and 3 minute walk at 5 km/h
close_to_SB_83 = close_to_SB(Starbucks, Ediya, gu_list, dist_thr=83)  # 1 minute
close_to_SB_167 = close_to_SB(Starbucks, Ediya, gu_list, dist_thr=167)  # 2 minutes
close_to_SB_250 = close_to_SB(Starbucks, Ediya, gu_list, dist_thr=250)  # 3 minutes

# Sort descending; rows start with the ratio, so the highest ratio comes first
close_to_SB_83.sort(reverse=True)
close_to_SB_167.sort(reverse=True)
close_to_SB_250.sort(reverse=True)

# Convert to DataFrames. Each row is [ratio, ed_len, count, gu], so the second
# field is the number of Ediya stores in the district and the third the number
# of them near a Starbucks; the two labels were previously swapped.
result_columns = ['ratio', 'num_store', 'count', 'gu_name']
p_83 = pd.DataFrame(close_to_SB_83, columns=result_columns)
p_167 = pd.DataFrame(close_to_SB_167, columns=result_columns)
p_250 = pd.DataFrame(close_to_SB_250, columns=result_columns)

In [141]:

# Ten districts with the highest ratio, one panel per walking distance.
fig, ax = plt.subplots(ncols=3, figsize = (25,6), sharey=True)
sns.set_palette("deep")

distance_panels = zip(
    ax,
    (p_83, p_167, p_250),
    ('1분 거리', '2분 거리', '3분 거리'),
)
for axis, ratios, title in distance_panels:
    # The frames are sorted by ratio descending, so the head is the top ten.
    sns.barplot(x='gu_name', y='ratio', data=ratios[:10], ax=axis)
    axis.set_xlabel('지역', fontsize=15)
    axis.set_title(title, fontsize=15)
    axis.grid()

# Only the left panel carries the y-axis label.
ax[0].set_ylabel('스타벅스 근처 지점 수 / 전체 지점 수', fontsize=15)
ax[1].set_ylabel('')
ax[2].set_ylabel('')

plt.show()

전체 매장 중, 1분 거리에 스타벅스가 있는 이디야 매장 비율 : 최대 31%
전체 매장 중, 2분 거리에 스타벅스가 있는 이디야 매장 비율 : 최대 62%
전체 매장 중, 3분 거리에 스타벅스가 있는 이디야 매장 비율 : 최대 79%
1분 거리에 위치한 매장은 전체 매장의 1/3 미만
2분 거리 이상부터는 높은 비율을 보이나, 해당 지역은 스타벅스 매장이 많이 분포하는 상위 지역임

In [142]:

# Ratio for every district, one panel per walking distance.
fig, ax = plt.subplots(ncols=3, figsize = (30,6), sharey=True)
sns.set_palette("deep")

distance_panels = zip(
    ax,
    (p_83, p_167, p_250),
    ('1분 거리', '2분 거리', '3분 거리'),
)
for axis, ratios, title in distance_panels:
    sns.barplot(x='gu_name', y='ratio', data=ratios, ax=axis)
    axis.set_xlabel('지역', fontsize=15)
    axis.set_title(title, fontsize=15)
    axis.grid()

# Only the left panel carries the y-axis label.
ax[0].set_ylabel('스타벅스 근처 지점 수 / 전체 지점 수', fontsize=15)
ax[1].set_ylabel('')
ax[2].set_ylabel('')

plt.show()

비율이 전지역에 걸쳐서 고르게 나타나지 않음

In [131]:

# Ten districts with the lowest ratio, one panel per walking distance.
fig, ax = plt.subplots(ncols=3, figsize = (25,6), sharey=True)
sns.set_palette("deep")

distance_panels = zip(
    ax,
    (p_83, p_167, p_250),
    ('1분 거리', '2분 거리', '3분 거리'),
)
for axis, ratios, title in distance_panels:
    # [:-11:-1] takes the last ten rows in reverse order (lowest ratio first).
    sns.barplot(x='gu_name', y='ratio', data=ratios[:-11:-1], ax=axis)
    axis.set_xlabel('지역', fontsize=15)
    axis.set_title(title, fontsize=15)
    axis.grid()

# Only the left panel carries the y-axis label.
ax[0].set_ylabel('스타벅스 근처 지점 수 / 전체 지점 수', fontsize=15)
ax[1].set_ylabel('')
ax[2].set_ylabel('')

plt.show()

근처에 스타벅스가 있는 이디야 매장 비율 하위 10개 지역을 추려 보았을 때, 스타벅스 매장 수 하위 지역이 다수 분포함

지도 시각화¶

지도 상에서 위의 분석 결과를 확인해본다
folium.plugins : MarkerCluster() & FeatureGroupSubGroup()
- Ediya만 선택하여 Cluster 확인 가능

In [136]:

import folium
from folium.plugins import MarkerCluster

center = [37.541, 126.986]
tiles = ['cartodbpositron', 'Stamen Toner', 'OpenStreetMap']

# visualization ---- 2.
# Base map; only the first tile set in `tiles` is used.
m = folium.Map(location=list(center), zoom_start=12, tiles=tiles[0])

# One shared marker cluster, hidden from the layer control, with a
# toggleable sub-group per brand.
mcg = MarkerCluster(control=False)
SB = folium.plugins.FeatureGroupSubGroup(mcg, 'Starbucks')
ED = folium.plugins.FeatureGroupSubGroup(mcg, 'Ediya')
for layer in (mcg, SB, ED):
    layer.add_to(m)

# Green markers for Starbucks, blue markers for Ediya.
brand_layers = (
    (Starbucks, SB, "green"),
    (Ediya, ED, "blue"),
)
for stores, layer, color in brand_layers:
    for lat, lng in zip(stores['lat'], stores['lng']):
        folium.Marker([lat, lng], icon=folium.Icon(color=color)).add_to(layer)

folium.LayerControl(collapsed=False).add_to(m)

m

Out[136]:

Make this Notebook Trusted to load map: File -> Trust Notebook

이디야커피는 스타벅스 매장이 위치하는 곳에 매장을 위치시키는가?¶

결론:¶

가정1 : 두 브랜드는 유사한 지역 별 매장 분포를 보일 것
- 스타벅스와 이디야는 서로 다른 매장 분포를 보임

가정2 : 이디야 매장 근방에 스타벅스가 위치할 것
- 근거리에 스타벅스가 위치하는 이디야 매장의 비율은 높지 않으며,
- 전 지역에 걸쳐서 유사한 비율을 보이지 않음
- 중구, 강남구, 종로구 등 스타벅스와 이디야가 가까이 위치하는 비율이 높은 지역은 처음부터 스타벅스 밀집 지역임

[Python] [Data Analysis] Selenium을 활용한 주유소 가격 웹크롤링 및 데이터 분석 (1)	2023.08.16
[Python] [Data Analysis] Selenium을 활용한 웹크롤링 (0)	2023.08.16

Paul's Grit

Paul's Grit

[Python] EDA Toy Project : 스타벅스 & 이디야 위치 정보 EDA 본문

[Python] EDA Toy Project : 스타벅스 & 이디야 위치 정보 EDA

이디야커피는 스타벅스 매장이 위치하는 곳에 매장을 위치시키는가?¶

DB 생성¶

Table 생성¶

데이터 수집: 스타벅스¶

크롤링 데이터 DB에 저장¶

데이터 수집: 이디야¶

CSV 파일로 저장¶

EDIYA & STARBUCKS 위치 정보 EDA¶

가정1 : 두 브랜드는 유사한 지역 별 매장 분포를 보일 것¶

매장 수에 따른 상위 5개 지역¶

가정2 : 이디야 매장 근방에 스타벅스가 위치할 것¶

실제로 EDIYA 주변에는 스타벅스가 있는가?¶

지도 시각화¶

이디야커피는 스타벅스 매장이 위치하는 곳에 매장을 위치시키는가?¶

결론:¶

'Data Analysis > Web Crawling' 카테고리의 다른 글

티스토리툴바

« 2025/06 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30