1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
| # coding=UTF-8 import openpyxl import time from selenium import webdriver
class qqGroupSpider():
    """Scraper for the member table of a QQ group management page.

    The class drives an already-created Selenium WebDriver: it opens the
    member page of one group, scrolls so that more rows are loaded, turns
    every member row into a ``|``-separated text record and appends the
    records to a local file.
    """

    # Locator strategy handed to Selenium's generic ``find_element(s)``.
    # The literal equals ``selenium.webdriver.common.by.By.XPATH``; using the
    # generic call keeps the class working on Selenium 3 and on Selenium 4,
    # where the ``find_element(s)_by_xpath`` helpers were removed.
    _XPATH = "xpath"

    def __init__(self, driver, qq, passwd, qqgroup):
        """Open the member-management page of ``qqgroup`` in ``driver``.

        Automated credential entry is disabled (see the commented-out steps
        below), so ``qq`` and ``passwd`` are accepted but currently unused.

        :param driver: Selenium WebDriver used for all page interaction.
        :param qq: QQ account number (unused while auto-login is disabled).
        :param passwd: account password (unused while auto-login is disabled).
        :param qqgroup: group number substituted into the page URL.
        """
        url = "https://qun.qq.com/member.html#gid={}".format(qqgroup)
        self.driver = driver
        # driver.delete_all_cookies()
        driver.get(url)
        time.sleep(1)
        # Disabled automated login, kept for reference:
        # driver.switch_to.frame("login_frame")  # enter the login iframe
        # time.sleep(1)
        # change = driver.find_element_by_id("switcher_plogin")
        # change.click()
        # driver.find_element_by_id('u').clear()  # select the user-name box
        # driver.find_element_by_id('u').send_keys(qq)
        # driver.find_element_by_id('p').clear()
        # driver.find_element_by_id('p').send_keys(passwd)
        # driver.find_element_by_class_name("login_button").click()
        time.sleep(1)

    def scroll_foot(self, driver):
        """Scroll the page towards the bottom.

        :param driver: Selenium WebDriver showing the member page.
        :return: whatever ``driver.execute_script`` returns for the script.
        """
        # A very large scrollTop makes the browser clamp to the page bottom.
        js = "var q=document.documentElement.scrollTop=100000"
        return driver.execute_script(js)

    def getTbodyList(self, driver):
        """Return every member ``tbody`` element currently in the page.

        :param driver: Selenium WebDriver showing the member page.
        :return: list of matching elements (empty when none match).
        """
        print("getTbodyList()函数运行过")
        # NOTE(review): "group-memeber" is kept exactly as in the original
        # locator; presumably it mirrors the page's own class name - confirm
        # before "fixing" the spelling.
        return driver.find_elements(
            self._XPATH,
            '//div[@class="group-memeber"]//tbody[contains(@class,"list")]',
        )

    def parseTbody(self, html):
        """Parse the member rows contained in a sequence of ``tbody`` elements.

        One ``tbody`` may hold several members; all rows are collected first
        and then parsed one by one.

        :param html: iterable of ``tbody`` elements (may be empty).
        :return: list with one record string per member row.
        """
        print("parseTbody()函数运行过")
        rows = []
        for tbody in html:
            rows.extend(
                tbody.find_elements(self._XPATH, 'tr[contains(@class,"mb mb")]')
            )
        print("memberLists长度为:{}".format(len(rows)))
        return [self.parseMember(row) for row in rows]

    def parseMember(self, mb):
        """Convert one member row into a ``|``-separated record.

        Cells 1-8 of the row are read (cell 0 is ignored). The output order
        is ``qId|qq|nickName|card|sex|qqAge|joinTime|lastTime``; the field
        meanings are taken from the variable names only.

        :param mb: ``tr`` element of a single member.
        :return: the record string.
        :raises IndexError: when the row has fewer than nine ``td`` cells.
        """
        print("parseMember()函数运行过")
        cells = mb.find_elements(self._XPATH, 'td')
        print("td长度为:{}".format(len(cells)))

        qId = cells[1].text.strip()
        # Nickname and group card are wrapped in a <span> inside their cell.
        nickName = cells[2].find_element(self._XPATH, 'span').text.strip()
        card = cells[3].find_element(self._XPATH, 'span').text.strip()
        qq = cells[4].text.strip()
        sex = cells[5].text.strip()
        qqAge = cells[6].text.strip()
        joinTime = cells[7].text.strip()
        lastTime = cells[8].text.strip()

        record = "|".join(
            (qId, qq, nickName, card, sex, qqAge, joinTime, lastTime)
        )
        print(record)
        return record

    def parseAndWrite(self, tbody, filename="1607.csv"):
        """Parse ``tbody`` elements and append their records to a text file.

        :param tbody: iterable of ``tbody`` elements to parse.
        :param filename: output path; records are appended one per line,
            UTF-8 encoded. Defaults to the previously hard-coded
            ``"1607.csv"``.
        :return: None
        """
        print("parseAndWrite()函数运行过")
        records = self.parseTbody(tbody)
        # 'a+' appends, so repeated calls accumulate records in the same file.
        with open(filename, 'a+', encoding="utf-8") as f:
            f.writelines(str(record) + "\n" for record in records)
def main():
    """Scrape every member of one QQ group and append the records to a file.

    Opens Chrome on the group's member page, reads the member count shown by
    the page, then repeatedly scrolls down and hands the newly appeared
    ``tbody`` elements to ``qqGroupSpider.parseAndWrite`` until the number of
    loaded rows equals that count. The browser is always closed on exit,
    including when an error interrupts the scrape.
    """
    # Placeholders: fill in account, password and group number before running.
    qq = "##"
    passwd = "##"
    qqgroup = "##"
    # Location of the chromedriver executable.
    chrome_driver = '##'
    # Consecutive scroll rounds without any new row after which the loop gives
    # up; without this bound a count mismatch would spin forever.
    max_stalled_rounds = 30
    row_xpath = '//*[@id="groupMember"]//td[contains(@class,"td-no")]'

    # NOTE(review): ``executable_path`` is kept as in the original; newer
    # Selenium 4 releases expect a Service object instead - confirm the
    # installed Selenium version before changing it.
    driver = webdriver.Chrome(executable_path=chrome_driver)
    try:
        spider = qqGroupSpider(driver, qq, passwd, qqgroup)
        # Presumably leaves time for a manual login and for the page to
        # render before the member count is read.
        time.sleep(10)
        # Member count displayed by the page.
        qqNum = int(
            driver.find_element("xpath", '//*[@id="groupMemberNum"]').text.strip()
        )
        print("QQ群人数为:"+str(qqNum))
        curren_qq_num = 0
        prelen = 0
        stalled_rounds = 0

        while curren_qq_num != qqNum:
            loaded = len(driver.find_elements("xpath", row_xpath))
            stalled_rounds = stalled_rounds + 1 if loaded == curren_qq_num else 0
            curren_qq_num = loaded
            if stalled_rounds >= max_stalled_rounds:
                print(
                    "No new member rows after {} scrolls; stopping at {} of {}."
                    .format(max_stalled_rounds, curren_qq_num, qqNum)
                )
                break
            # Keep scrolling to the bottom so the page loads further rows.
            spider.scroll_foot(driver)
            # Pause one second after every scroll.
            time.sleep(1)
            tlist = spider.getTbodyList(driver)
            # Only the tbody elements that appeared since the last round are
            # parsed, so already written members are not written again.
            spider.parseAndWrite(tlist[prelen:])
            prelen = len(tlist)  # remember how many tbody elements were handled
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# Unused openpyxl helpers, kept for reference (write / read an .xlsx workbook).
# def write_excel_xlsx(path, sheet_name, value):
#     index = len(value)
#     workbook = openpyxl.Workbook()
#     sheet = workbook.active
#     sheet.title = sheet_name
#     for i in range(0, index):
#         for j in range(0, len(value[i])):
#             sheet.cell(row=i+1, column=j+1, value=str(value[i][j]))
#     workbook.save(path)
#     print("xlsx格式表格写入数据成功!")
# def read_excel_xlsx(path, sheet_name):
#     workbook = openpyxl.load_workbook(path)
#     # sheet = wb.get_sheet_by_name(sheet_name)  -- this approach is deprecated and not recommended
#     sheet = workbook[sheet_name]
#     for row in sheet.rows:
#         for cell in row:
#             print(cell.value, "\t", end="")
#         print()
# book_name_xlsx = 'xlsx格式测试工作簿.xlsx'
# sheet_name_xlsx = 'xlsx格式测试表'
# value3 = [["姓名", "性别", "年龄", "城市", "职业"],
#           ["111", "女", "66", "石家庄", "运维工程师"],
#           ["222", "男", "55", "南京", "饭店老板"],
#           ["333", "女", "27", "苏州", "保安"],]
# write_excel_xlsx(book_name_xlsx, sheet_name_xlsx, value3)
# read_excel_xlsx(book_name_xlsx, sheet_name_xlsx)
|