Background 背景
The Hong Kong Film Awards, also known as the Hong Kong Film Awards Presentation, is an annual awards ceremony that honors outstanding achievements in the Hong Kong film industry. Established in 1982, it is considered one of the most prestigious film awards in the Chinese-speaking world.
The awards cover various categories, including Best Film, Best Director, Best Actor, Best Actress, Best Supporting Actor, Best Supporting Actress, Best New Director, and many others. The winners are selected by a jury consisting of industry professionals, film critics, and scholars.
Here is a table showcasing the winners of the Hong Kong Film Awards' Best Film, Best Director, and Best Screenplay awards:
香港電影金像獎(Hong Kong Film Awards)是一個每年舉辦的頒獎典禮,旨在表彰香港電影業的傑出成就。該獎項成立於1982年,被視為華語電影界最具聲望的獎項之一。
香港電影金像獎涵蓋多個類別,包括最佳電影、最佳導演、最佳男主角、最佳女主角、最佳男配角、最佳女配角、最佳新晉導演等等。得獎者由由業內專業人士、影評人和學者組成的評審團選出。
以下是香港電影金像獎每屆的最佳電影, 最佳導演及最佳劇本的簡潔風表格展示:
Source: https://www.hkfaa.com/winnerlist.html
Python 代碼
The script below uses Python's BeautifulSoup library to scrape a ranking table from the HTML pages provided by HKFA, which is a static website. The table is well-organized, with a fixed order for rows represented by <tr> tags and columns represented by <td> tags. However, it is important to exercise caution when dealing with different sessions, such as session 1 and sessions 3 to 7, as their pages may be laid out differently.
HKFASession.py
1 if i > 1:
2 try:
3 best_film = rows[th+1].find_all("td")[2]
4 best_director = rows[th+2].find_all("td")[2]
5 bestScreenPlay = rows[th+6].find_all("td")[2]
6
7 best_actor = rows[th+3].find_all("td")[2]
8 best_actress = rows[th+4].find_all("td")[2]
9
10 tidy_best_director = re.sub(r"\n.*", "", best_director.text.strip())
11 tidy_best_screenplay = re.sub(r"\n.*", "", bestScreenPlay.text.strip())
12
13 tidy_best_actor = re.sub(r"\n.*", "", best_actor.text.strip())
14 tidy_best_actress = re.sub(r"\n.*", "", best_actress.text.strip())
15
16 if i > 3:
17 bestScreenPlay = rows[th+8].find_all("td")[2]
18 tidy_best_screenplay = re.sub(r"\n.*", "", bestScreenPlay.text.strip())
19
20 if i > 7:
21 bestScreenPlay = rows[th+3].find_all("td")[2]
22 best_actor = rows[th+4].find_all("td")[2]
23 best_actress = rows[th+5].find_all("td")[2]
24
25 tidy_best_director = re.sub(r"\n.*", "", best_director.text.strip())
26 tidy_best_screenplay = re.sub(r"\n.*", "", bestScreenPlay.text.strip())
27 tidy_best_actor = re.sub(r"\n.*", "", best_actor.text.strip())
28 tidy_best_actress = re.sub(r"\n.*", "", best_actress.text.strip())
29 ...
Using Python for data scraping offers significant efficiency advantages compared to manual copy and paste. It saves time and effort by automating the extraction process. However, please be aware that the extracted data may not always be correct due to the possibility of rows or columns being disorganized. In such cases, it might be beneficial to improve the search method to accurately identify the target data.
Nevertheless, scraping the data using Python and BeautifulSoup is still faster than the manual copy and paste approach, providing a time-saving benefit. It is advised to review the extracted data for accuracy, ensuring that it aligns with the expected format and content.
Source Code: https://github.com/jaredycw/datajson/blob/main/movie/chinese/hkfa/HKFA.py
HKFA.py
1import requests
2from bs4 import BeautifulSoup
3import json
4import re
5
def looping_list(target, target_list, target_name, relative_list, sessions_title):
    """Store each value of ``target_list`` under ``target_name`` in a session record.

    For every value, the first record in ``relative_list`` whose ``"session"``
    field equals ``sessions_title[0]`` is looked up. If one exists, the value is
    written into it (overwriting any previous value for that field); otherwise
    a new one-field record is appended.

    Args:
        target: Ignored. Kept only so existing positional callers keep working.
        target_list: Values to store.
        target_name: Field name to store the values under (converted to ``str``).
        relative_list: List of record dicts; modified in place.
        sessions_title: Sequence whose first element identifies the record to
            update. When it is empty, no record can match and every value is
            appended as a new record.
    """
    field = str(target_name)
    has_title = len(sessions_title) > 0

    for value in target_list:
        existing = None
        if has_title:
            session_key = sessions_title[0]
            # The lookup is repeated per value on purpose: writing the
            # "session" field itself can change which record matches next.
            # Records without a "session" key are skipped instead of raising.
            existing = next(
                (
                    entry
                    for entry in relative_list
                    if "session" in entry and entry["session"] == session_key
                ),
                None,
            )

        if existing is not None:
            existing[field] = value
        else:
            relative_list.append({field: value})
13
# Fields written for every successfully parsed session, in output order.
_HKFA_RECORD_FIELDS = (
    "sessionNumber",
    "year",
    "bestFilm",
    "bestDirector",
    "bestScreenPlay",
    "bestActor",
    "bestActress",
)


def _hkfa_row_offsets(session_number):
    """Return row offsets (film, director, screenplay, actor, actress) for a session."""
    if 2 <= session_number <= 3:
        return (1, 2, 6, 3, 4)
    if 4 <= session_number <= 7:
        return (1, 2, 8, 3, 4)
    # Session 1 and sessions 8 onward list the five awards on consecutive rows.
    return (1, 2, 3, 4, 5)


def _hkfa_first_line(cell):
    """Return the stripped text of a table cell with everything after a line break removed."""
    return re.sub(r"\n.*", "", cell.text.strip())


def scrape_hkfa_awards(start_year, end_year, timeout=30):
    """Scrape the main Hong Kong Film Awards winners for a range of sessions.

    Despite their names, ``start_year`` and ``end_year`` are award session
    numbers: each one is formatted into the page URL and the calendar year is
    derived from it as ``1982 + session - 1``.

    Args:
        start_year: First session number to fetch (inclusive).
        end_year: Last session number to fetch (inclusive).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts, one per session page that contained a table. Each
        dict has a ``"session"`` title and, when the winner rows could be
        read, ``sessionNumber``, ``year``, ``bestFilm``, ``bestDirector``,
        ``bestScreenPlay``, ``bestActor`` and ``bestActress``.

    Raises:
        requests.RequestException: If a page cannot be fetched (including a
            timeout).
    """
    base_url = "https://www.hkfaa.com/winnerlist{:02d}.html"
    movie_list = []
    th = 2  # base row index that the per-award offsets are added to

    for i in range(start_year, end_year + 1):
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(base_url.format(i), timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find(id="table")
        if not table:
            continue

        rows = table.find_all("tr")
        if not rows:
            continue

        year = 1982 + i - 1  # Calculate the year based on the index
        award_sessions = [
            re.sub(r"\n.+", "", cell.text.strip())
            for cell in rows[0].find_all("td")
        ]

        winners = None
        if i >= 1:
            is_first_session = i == 1
            # The first session keeps the winner in the second cell of each
            # row; every later session keeps it in the third.
            column = 1 if is_first_session else 2
            try:
                # Only the rows this session's layout needs are read, so a
                # missing row from another layout cannot discard the session.
                cells = [
                    rows[th + offset].find_all("td")[column]
                    for offset in _hkfa_row_offsets(i)
                ]
            except (IndexError, KeyError):
                if is_first_session:
                    print("Error in first session")
                else:
                    print("Error after first session")
            else:
                if is_first_session:
                    # First-session cells are stored stripped but untrimmed.
                    winners = [cell.text.strip() for cell in cells]
                else:
                    winners = [_hkfa_first_line(cell) for cell in cells]

        # The session title is always recorded, even when parsing failed.
        looping_list(None, award_sessions, "session", movie_list, award_sessions)

        if winners is not None:
            values = [i, year] + winners
            for field_name, value in zip(_HKFA_RECORD_FIELDS, values):
                looping_list(None, [value], field_name, movie_list, award_sessions)

    return movie_list
122
# Scrape sessions 1 through 41 and save the records as indented UTF-8 JSON.
movie_list = scrape_hkfa_awards(1, 41)

with open("data.json", mode="w", encoding="utf-8") as output_file:
    # ensure_ascii=False keeps the Chinese titles readable in the output file.
    json.dump(
        movie_list,
        fp=output_file,
        indent=4,
        ensure_ascii=False,
    )