0%

Python爬取QQ群人员信息

爬取QQ群中的人员信息

Python3读取、写入、追加写入Excel文件

'chromedriver' executable needs to be in PATH 解决办法

python文件绝对路径写法(windows)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# coding=UTF-8
import openpyxl
import time
from selenium import webdriver

class qqGroupSpider():
    '''
    Scraper for the member table of the QQ group member-management page.

    The spider works on an already-created Selenium ``driver``: it opens the
    member page of one group, and its helpers locate the member ``tbody``
    elements, turn every member row into a ``|``-separated record and append
    the records to a text file.
    '''

    # Locator strategy accepted by Selenium's generic ``find_element`` /
    # ``find_elements``; it is the value of ``By.XPATH``. The generic calls
    # are used because the ``find_element(s)_by_xpath`` shortcuts are not
    # available in current Selenium releases.
    _XPATH = "xpath"

    def __init__(self, driver, qq, passwd, qqgroup):
        '''
        Open the member-management page of ``qqgroup`` in ``driver``.

        :param driver: Selenium WebDriver used to load the page.
        :param qq: account number; only used by the disabled login below.
        :param passwd: account password; only used by the disabled login below.
        :param qqgroup: group id substituted into the member-page URL.
        '''
        url = "https://qun.qq.com/member.html#gid={}".format(qqgroup)
        self.driver = driver
        # driver.delete_all_cookies()
        driver.get(url)
        time.sleep(1)
        # Automated password login, currently disabled:
        # driver.switch_to.frame("login_frame")  # enter the login iframe
        # time.sleep(1)
        # change = driver.find_element_by_id("switcher_plogin")
        # change.click()
        # driver.find_element_by_id('u').clear()  # select the user-name box
        # driver.find_element_by_id('u').send_keys(qq)
        # driver.find_element_by_id('p').clear()
        # driver.find_element_by_id('p').send_keys(passwd)
        # driver.find_element_by_class_name("login_button").click()
        time.sleep(1)

    def scroll_foot(self, driver):
        '''
        Scroll the page down by setting ``scrollTop`` to a very large value.

        :param driver: WebDriver whose current page is scrolled.
        :return: whatever ``driver.execute_script`` returns for the script.
        '''
        js = "var q=document.documentElement.scrollTop=100000"
        return driver.execute_script(js)

    def getTbodyList(self, driver):
        '''
        Return every member ``tbody`` element currently present in the page.

        :param driver: WebDriver (or element) to search in.
        :return: list of matching ``tbody`` elements.
        '''
        print("getTbodyList()函数运行过")
        # NOTE(review): "group-memeber" is kept exactly as in the original
        # locator; presumably it mirrors the page's own class name.
        return driver.find_elements(
            self._XPATH,
            '//div[@class="group-memeber"]//tbody[contains(@class,"list")]',
        )

    def parseTbody(self, html):
        '''
        Parse the member rows contained in a sequence of ``tbody`` elements.

        :param html: iterable of ``tbody`` elements; each may hold several
            member rows.
        :return: list with one ``|``-separated record string per member row,
            in document order.
        '''
        print("parseTbody()函数运行过")
        rows = []
        for tbody in html:
            rows.extend(
                tbody.find_elements(self._XPATH, 'tr[contains(@class,"mb mb")]')
            )

        print("memberLists长度为:{}".format(len(rows)))
        return [self.parseMember(row) for row in rows]

    def parseMember(self, mb):
        '''
        Build the record of a single member from its table row.

        The fields are joined with ``|`` in the order: row number, QQ number,
        nickname, group card, sex, QQ age, join time, last-active time.

        :param mb: ``tr`` element of one member.
        :return: the ``|``-separated record string.
        :raises IndexError: if the row has fewer than nine ``td`` cells.
        '''
        print("parseMember()函数运行过")

        td = mb.find_elements(self._XPATH, 'td')
        print("td长度为:{}".format(len(td)))

        qId = td[1].text.strip()
        # Nickname and group card keep their text inside a child <span>.
        nickName = td[2].find_element(self._XPATH, 'span').text.strip()
        card = td[3].find_element(self._XPATH, 'span').text.strip()
        qq = td[4].text.strip()
        sex = td[5].text.strip()
        qqAge = td[6].text.strip()
        joinTime = td[7].text.strip()
        lastTime = td[8].text.strip()

        # The QQ number is emitted second, right after the row number.
        a = "|".join((qId, qq, nickName, card, sex, qqAge, joinTime, lastTime))
        print(a)
        return a

    def parseAndWrite(self, tbody, path="1607.csv"):
        '''
        Parse ``tbody`` elements and append the member records to a file.

        :param tbody: iterable of ``tbody`` elements to parse.
        :param path: file to append to, one record per line, UTF-8 encoded;
            defaults to the historical ``1607.csv``.
        :return: None
        '''
        print("parseAndWrite()函数运行过")

        memberList = self.parseTbody(tbody)

        with open(path, 'a', encoding="utf-8") as f:
            f.writelines(str(each) + "\n" for each in memberList)

def main():
    '''
    Scrape the members of one QQ group and append them to the spider's file.

    Opens Chrome on the group's member page, reads the advertised member
    count, then repeatedly scrolls down and writes the newly found ``tbody``
    blocks until the number of loaded rows reaches that count or the page
    stops delivering new rows. The browser is always closed at the end.
    '''
    qq = "##"
    passwd = "##"
    qqgroup = "##"
    # Location of the chromedriver executable.
    chrome_driver = '##'
    # Give up after this many consecutive scrolls that load no new row, so a
    # count that is never reached cannot keep the loop running forever.
    max_idle_scrolls = 30

    # NOTE(review): recent Selenium releases expect a ``Service`` object
    # instead of ``executable_path`` - confirm against the installed version.
    driver = webdriver.Chrome(executable_path=chrome_driver)
    try:
        spider = qqGroupSpider(driver, qq, passwd, qqgroup)
        # Fixed pause before reading the page; presumably leaves time for a
        # manual login, since the automated one is disabled - TODO confirm.
        time.sleep(10)
        # Member count advertised by the page.
        qqNum = int(driver.find_element("xpath", '//*[@id="groupMemberNum"]').text.strip())
        print("QQ群人数为:"+str(qqNum))
        curren_qq_num = 0
        prelen = 0
        idle_scrolls = 0

        # ``<`` rather than ``!=``: stop as well when more rows are loaded
        # than the advertised count.
        while curren_qq_num < qqNum:
            loaded = len(driver.find_elements(
                "xpath", '//*[@id="groupMember"]//td[contains(@class,"td-no")]'))
            idle_scrolls = 0 if loaded > curren_qq_num else idle_scrolls + 1
            curren_qq_num = loaded
            # Keep scrolling down until the bottom is reached.
            spider.scroll_foot(driver)
            # Rest one second after every scroll.
            time.sleep(1)

            tlist = spider.getTbodyList(driver)

            # Only the tbody blocks that appeared since the last pass.
            spider.parseAndWrite(tlist[prelen:])

            prelen = len(tlist)  # remember how many tbody blocks are written

            if idle_scrolls >= max_idle_scrolls:
                print("No new members loaded after {} scrolls; stopping at {} of {}".format(
                    idle_scrolls, curren_qq_num, qqNum))
                break
    finally:
        # Close the browser even when scraping fails part-way.
        driver.quit()

if __name__ == '__main__':
    # Start the scrape only when the file is run as a script, not on import.
    main()





# def write_excel_xlsx(path, sheet_name, value):
# index = len(value)
# workbook = openpyxl.Workbook()
# sheet = workbook.active
# sheet.title = sheet_name
# for i in range(0, index):
# for j in range(0, len(value[i])):
# sheet.cell(row=i+1, column=j+1, value=str(value[i][j]))
# workbook.save(path)
# print("xlsx格式表格写入数据成功!")


# def read_excel_xlsx(path, sheet_name):
# workbook = openpyxl.load_workbook(path)
# # sheet = wb.get_sheet_by_name(sheet_name)这种方式已经弃用,不建议使用
# sheet = workbook[sheet_name]
# for row in sheet.rows:
# for cell in row:
# print(cell.value, "\t", end="")
# print()


# book_name_xlsx = 'xlsx格式测试工作簿.xlsx'

# sheet_name_xlsx = 'xlsx格式测试表'

# value3 = [["姓名", "性别", "年龄", "城市", "职业"],
# ["111", "女", "66", "石家庄", "运维工程师"],
# ["222", "男", "55", "南京", "饭店老板"],
# ["333", "女", "27", "苏州", "保安"],]


# write_excel_xlsx(book_name_xlsx, sheet_name_xlsx, value3)
# read_excel_xlsx(book_name_xlsx, sheet_name_xlsx)