為了練習最近自學的幾項東西,因此我決定來做一隻台股爬蟲,試試看AI智能交易。
主要包含兩部份:
- 資料蒐集的部份以Crawler來抓,並用MySQL建立並管理資料庫。
- 以機器學習、深度學習來做模型,試圖預測明日股價相對於大盤的表現(Relative Strength)。一是看漲或跌的機率,二是漲跌價格。目前計畫用機器學習來選indicators,並建立模型,再利用深度學習也建幾個模型來比對模型表現(Performance)。
選擇資料庫跟寫爬蟲¶
資料庫選MySQL學而不是python SQLite3,主因是MySQL畢竟還是大宗,且網路上免費的教學多,配上Oracle免費的GUI介面MySQL Workbench,學起比較舒服。
爬蟲大概花了兩天學加寫,結果除BUG大概花了三倍時間。原因如下:
- 各大資料提供的網站都有一些防爬蟲的措施,抓的量一大多會被鎖。大部分可以用fake-useragent來做虛擬的User Agent,避過同一台裝置短時間內發送大量請求的審查。
Indicator挑選¶
我目前選擇抓取的資料為:Price, Change, Ratio(Comparing to MktAvg), MA(6:12), Volume, Income(per month), PBR。結合技術分析跟基本面比較即時的資料-營收與PBR。挑這幾個做Indicators,是先使用隨機森林(Random Forest)做出來,P-value稍微可看的幾個,這部份我只是很草率的先做過,因為免費且優質資料的蒐集實屬不易,只能且戰且走。目前還想加入三大法人的每日成交量,並監看每日融資餘額、融券餘額,不過還沒找到合適的網站來抓,構想先放著。等到資料量大,再用其他演算法或統計來挑看看。
程式碼¶
這次特意使用物件導向(object-oriented)來做。很久以前自學Java時,書本一翻開就是教物件導向,但研究所都是用R,太久沒用實在忘的差不多了,這次剛好複習一下。(code經過幾天的又修又補,有幾個部份看起來扎眼,但也懶的改了。)
In [ ]:
SleepTime = 4 #先設定每次上證交所網站的時間間隔
import sys
MyStockList=[]
with open('/MyStockList.info','r') as file:
for line in file:
MyStockList.append(line.strip('\n').split(',')[0])
mystock=[]
for stock in MyStockList:
mystock.append(int(stock))
print("股票清單:\n",mystock)
In [ ]:
class twstock_crawler():
def __init__(self, stock_list=[],
web = "https://goodinfo.tw/StockInfo/StockDetail.asp?STOCK_ID=%i"):
self.stocklist=stock_list
self.url = dict()
for stock in stock_list:
self.url.update({stock:web%stock})
import pandas as pd
import numpy as np
import requests #傳送request
import bs4 as bs #beautifulsoup4 解HTML用
from fake_useragent import UserAgent# 虛擬UA
ua= UserAgent()
result=dict()
for stock in self.stocklist:
PBR_Url = self.url[stock]
header=ua.random() #亂數抽一個user agent
headerForReq={"User-Agent":header} #r建立dict: request header
response_goodinfo = requests.post(PBR_Url, headers=headerForReq)
result.update({stock:
bs.BeautifulSoup(response_goodinfo.content, "html.parser")})
#提供 request header並抓HTML檔
self.html=result
def crawler(self, label=str("td"), *args,**kwargs):
import pandas as pd
import numpy as np
import requests #傳送request
import bs4 as bs #beautifulsoup4 解HTML用
import twstock
import time
#twstock.__update_codes()
df = pd.DataFrame({"Stock":self.stocklist,
"Price":np.zeros(len(self.stocklist)),
"Change(%)":np.zeros(len(self.stocklist)),
"Ratio_to_MktAvg":np.zeros(len(self.stocklist)),
"MA(6:12)":np.zeros(len(self.stocklist)),
"Volume":np.zeros(len(self.stocklist)),
"Income(M)":np.zeros(len(self.stocklist)),
"PBR":np.zeros(len(self.stocklist))})
df.set_index("Stock", inplace= True)
#PBR
try:
for n in range(0,len(self.stocklist)):
data = self.html[self.stocklist[n]].find_all(label) #從抓取的HTML檔中,找出有<td>標籤, 建立名為data的list
try:
if data[102].string != "N/A": #arg 102 或103是PBR在data中的位置(此公司沒有PER資料時會多一個<td>標籤)
PBR = float(data[102].string)
df.iloc[n,6]=PBR
elif data[102].string == "N/A":
PBR = float(data[103].string)
df.iloc[n,6]=PBR
else:
print('Stock %i' % self.stocklist[n],"got ERROR")
except ValueError:
if data[100].string != "N/A": #arg 100是PBR在data中的位置 (此公司沒有警示資料時會少兩個<td>標籤)
PBR = float(data[100].string)
df.iloc[n,6]=PBR
elif data[100].string == "N/A":
PBR = float(data[100].string)
df.iloc[n,6]=PBR
else:
print('Stock %i' % self.stocklist[n],"got ERROR")
time.sleep(1)
except ValueError:
print("PBR LOOP went wrong")
#Income
for n in range(0,len(self.stocklist)):
data = self.html[self.stocklist[n]].find_all(label, style="color:") #從抓取的HTML檔中,找出有<td>標籤, 建立名為data的list
income = float(data[0].string.replace(',',''))
df.iloc[n,5]=income
#Price/Change/Ratio/MA/Volume
##crawler to TAIEX(0000)加權指數
taiexurl = "https://finance.yahoo.com/world-indices/"
response = requests.get(taiexurl)
import io
f = io.StringIO(response.text)
dfs = pd.read_html(f)
taiex_chg = float(pd.read_html(f)[0].set_index("Name").loc['TSEC weighted index', '% Change'].replace('%',''))
##
for n in range(0,len(self.stocklist)):
stock=self.stocklist[n]
datatoday=twstock.Stock(str(stock))
price = datatoday.price[-1] #col 0
change= datatoday.change[-1] #col 1
ratio = change/taiex_chg #col 2
MA2060=datatoday.ma_bias_ratio(6,12)[-1]#col 3
volume=int(datatoday.capacity[-1]) #col 4
time.sleep(SleepTime)#證交所會擋IP, 睡一下。測試limit 3 query in 5 sec
df.iloc[n,0]=price
df.iloc[n,1]=change
df.iloc[n,2]=round(ratio,3)
df.iloc[n,3]=round(MA2060,2)
df.iloc[n,4]=round(volume)
print('Stock %i' % self.stocklist[n],"ok,第",n+1,"/",len(self.stocklist),"筆")
self.data = df
self.price=df.loc[:,'Price']
self.change=df.loc[:,'Change(%)']
self.ratio=df.loc[:,'Ratio_to_MktAvg']
self.ma=df.loc[:,'MA(6:12)']
self.volume=df.loc[:,'Volume']
self.income = df.loc[:,'Income(M)']
self.pbr = df.loc[:,'PBR']
print("crawler done!")
def savetodb(self):
import pandas as pd
import numpy as np
import mysql.connector
import twstock
HugoDB = mysql.connector.connect(
host = "127.0.0.1", # Input the database server IP
user = "root", #Input the username
password = "******", #input the username
database = "MyStockRepository", #input the name of the database
)
cursor=HugoDB.cursor()
Stocktb=[]
#建立表格於database(如果已建立表個會直接略過)
for stock in range(0,len(self.stocklist)):
tb="CREATE TABLE IF NOT EXISTS TW%i(Date DATE, Price FLOAT(7,2) NOT NULL DEFAULT '0', Change_ FLOAT(4,2) NOT NULL DEFAULT '0', RatioMktAvg FLOAT(4,2) NOT NULL DEFAULT '0',MA6to12 FLOAT(4,2) NOT NULL DEFAULT '0', Volume INT NOT NULL DEFAULT '0',IncomeMonth FLOAT(8,3) NOT NULL DEFAULT '0', PBR FLOAT(7,3) NOT NULL DEFAULT '0',PRIMARY KEY)ENGINE=MyISAM DEFAULT CHARSET=big5;"%self.stocklist[stock]
Stocktb.append("TW%i"%self.stocklist[stock])
CreateTableCmd = tb
cursor.execute(CreateTableCmd)
self.stocktb=Stocktb
#建立資料清單
inputdata = []
for n in range(0,len(self.stocklist)):
stock = self.stocklist[n]
price=str(self.price.loc[stock])
change=str(self.change.loc[stock])
ratio=str(self.ratio.loc[stock])
ma2060=str(self.ma.loc[stock])
volume=str(self.volume.loc[stock])
income=str(self.income.loc[stock])
pbr=str(self.pbr.loc[stock])
inputdata.append(
(price,change,ratio,ma2060,volume,income,pbr)
); #全部轉成string格式, 防止放入cursor.excutemany()時出錯
self.inputdata=inputdata
#將當日資料存入MySQL
try:
for n in range(0,len(self.stocklist)):
#為了方便處理資料,這邊用的超土法煉鋼
sqlcmd="INSERT INTO "+Stocktb[n]+" VALUES (DATE(NOW()),%s," %self.price[self.stocklist[n]]
sqlcmd=sqlcmd+"%s,"%self.change[self.stocklist[n]]
sqlcmd=sqlcmd+"%s,"%self.ratio[self.stocklist[n]]
sqlcmd=sqlcmd+"%s,"%self.ma[self.stocklist[n]]
sqlcmd=sqlcmd+"%s,"%self.volume[self.stocklist[n]]
sqlcmd=sqlcmd+"%s,"%self.income[self.stocklist[n]]
sqlcmd=sqlcmd+"%s)"%self.pbr[self.stocklist[n]]
cursor.execute(sqlcmd)
HugoDB.commit()
print("已存入本日PBR")
except:
print("已有相同日期之資料存入。登入資料庫確認")
In [ ]:
exe = twstock_crawler(mystock)
exe.crawler()
exe.savetodb();
執行¶
寫了一個簡單的shell腳本放在桌面上,滑鼠點兩下執行,然後在MySQL Workbench上查看結果