本文為大家分享了python實現大音頻文件語音識別功能的具體代碼,供大家參考,具體內容如下
實現思路:先用ffmpeg將其他非wav格式的音頻轉換為wav格式,並轉換音頻的聲道(百度支持的聲道數為1)和採樣率(值為8000);格式轉換完成後,再用ffmpeg將音頻切成百度支持的時長(30秒和60秒2種,本程序用的是30秒)。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
|
# coding: utf-8
"""Speech recognition for long audio files through the Baidu REST API.

Workflow implemented by this script (Python 2, it relies on ``urllib2``):

1. ``ffmpeg_convert`` converts every pending master recording to a
   mono, 8000 Hz WAV file with ffmpeg.
2. ``ffmpeg_cut`` slices the converted file into 30-second sections and
   registers each section in the ``ocenter_section`` table.
3. ``go_recognize`` sends pending sections to Baidu and appends the
   recognized text to the master row in ``ocenter_recognition``.

Database access goes through the project-local ``inc.rtysdb`` helper and
the base directory comes from ``inc.db_config.SYS_PATH``.
"""
import base64
import json
import os
import subprocess
import time
import urllib2
import uuid

import requests

from inc import db_config
from inc import rtysdb

# Executable used for every conversion/cut; a bare name is resolved via PATH.
FFMPEG_BIN = "ffmpeg"
# Sample rate used both for the ffmpeg conversion and the recognition request.
SAMPLE_RATE = 8000
# Length of one section; the cut offsets and the "-t" duration derive from it.
SEGMENT_SECONDS = 30


class BaiduRest:
    """Minimal client for the Baidu speech synthesis/recognition REST API."""

    def __init__(self, cu_id, api_key, api_secert):
        """Store the endpoints and fetch an access token immediately.

        cu_id is sent as ``cuid`` with every request; api_key/api_secert
        are the OAuth client credentials used to obtain the token.
        """
        self.token_url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=%s&client_secret=%s"
        self.getvoice_url = "http://tsn.baidu.com/text2audio?tex=%s&lan=zh&cuid=%s&ctp=1&tok=%s"
        self.upvoice_url = 'http://vop.baidu.com/server_api'
        self.cu_id = cu_id
        self.get_token(api_key, api_secert)

    def get_token(self, api_key, api_secert):
        """Request an access token and keep it in ``self.token_str``.

        Returns True. Raises KeyError when the response carries no
        ``access_token`` field.
        """
        token_url = self.token_url % (api_key, api_secert)
        response = urllib2.urlopen(token_url)
        try:
            token_data = json.loads(response.read())
        finally:
            # Always release the HTTP connection, even if decoding fails.
            response.close()
        self.token_str = token_data['access_token']
        return True

    # Speech synthesis
    def text2audio(self, text, filename):
        """Synthesize *text* and write the returned audio bytes to *filename*.

        Returns True once the file has been written.
        """
        get_url = self.getvoice_url % (urllib2.quote(text), self.cu_id, self.token_str)
        response = urllib2.urlopen(get_url)
        try:
            voice_data = response.read()
        finally:
            response.close()
        with open(filename, 'wb+') as voice_fp:
            voice_fp.write(voice_data)
        return True

    # Speech recognition
    def audio2text(self, filename):
        """Upload the WAV file *filename* and return the recognized text.

        Returns the first entry of the ``result`` list when the service
        answers with ``err_msg == 'success.'``, otherwise False.
        """
        with open(filename, 'rb') as wav_fp:
            voice_data = wav_fp.read()
        data = {
            'format': 'wav',
            'rate': SAMPLE_RATE,
            'channel': 1,
            'cuid': self.cu_id,
            'token': self.token_str,
            # 'len' is the size of the raw audio, not of the base64 text.
            'len': len(voice_data),
            # b64encode never emits newlines, so decoding is all that is needed.
            'speech': base64.b64encode(voice_data).decode('ascii'),
        }
        result = requests.post(self.upvoice_url, json=data,
                               headers={'Content-Type': 'application/json'})
        data_result = result.json()
        if data_result.get('err_msg') == 'success.':
            return data_result['result'][0]
        return False


def test_voice(voice_file):
    """Recognize *voice_file* with a freshly authenticated BaiduRest client.

    Returns the recognized text, or False when recognition failed.
    """
    # NOTE(security): credentials are embedded in the source. They can be
    # overridden through the environment; the defaults keep the old behavior.
    api_key = os.environ.get("BAIDU_API_KEY", "vossGHIgEETS6IMRxBDeahv8")
    api_secert = os.environ.get("BAIDU_API_SECRET", "3c1fe6a6312f41fa21fa2c394dad5510")
    bdr = BaiduRest("0-57-7B-9F-1F-A1", api_key, api_secert)
    return bdr.audio2text(voice_file)


def get_master_audio(check_status='cut_status'):
    """Return master recordings that still need work, or False if none.

    check_status selects the filter: 'cut_status' -> rows with status=0,
    'finished_status' -> rows with finished_status=0. Any other value
    returns False without querying.
    """
    if check_status == 'cut_status':
        sql = "SELECT id,url, time_long,sharps FROM ocenter_recognition WHERE status=0"
    elif check_status == 'finished_status':
        sql = "SELECT id,url, time_long,sharps FROM ocenter_recognition WHERE finished_status=0"
    else:
        return False
    data = rtysdb.select_data(sql, 'more')
    return data if data else False


def go_recognize(master_id):
    """Recognize up to 10 pending sections of master recording *master_id*.

    Each recognized section is stored in ``ocenter_section`` and its text
    is appended to the master row. Sections whose file is missing are
    skipped. After the batch, the master row is flagged as finished.
    Returns False when there is no pending section, otherwise None.
    """
    section_path = db_config.SYS_PATH
    sql = "SELECT id,rid,url,status FROM ocenter_section WHERE rid=%d AND status=0 order by id asc limit 10" % (master_id)
    record = rtysdb.select_data(sql, 'more')
    if not record:
        return False
    for rec in record:
        voice_file = section_path + '/' + rec[2]
        if not os.path.exists(voice_file):
            continue
        result = test_voice(voice_file)
        print(result)
        if result:
            # NOTE(security): SQL is assembled by string formatting, so a
            # quote in the recognized text breaks the statement. Switch to
            # parameter binding if rtysdb offers it -- TODO confirm.
            sql = "update ocenter_section set content='%s', status='%d' where id=%d" % (result, 1, rec[0])
            rtysdb.do_exec_sql(sql)
            parent_content = rtysdb.select_data("SELECT id,content FROM ocenter_recognition WHERE id=%d" % (rec[1]))
            if parent_content:
                # Presumably content may still be NULL for a new master row;
                # treat that as empty text instead of failing on None + str.
                new_content = (parent_content[1] or '') + result
                update_content_sql = "update ocenter_recognition set content='%s' where id=%d" % (new_content, rec[1])
                rtysdb.do_exec_sql(update_content_sql)
        else:
            # Recognition failed: mark the section as handled without content.
            rtysdb.do_exec_sql("update ocenter_section set status='%d' where id=%d" % (1, rec[0]))
        # Pause between requests to the recognition service.
        time.sleep(5)
    # The loop has no break, so this always runs once the batch is done.
    rtysdb.do_exec_sql("UPDATE ocenter_recognition SET finished_status=1 WHERE id=%d" % (master_id))


def _run_ffmpeg(args):
    """Run ffmpeg with the argument list *args* and wait for it to exit.

    Returns the exit code, or -1 when the executable cannot be started.
    """
    try:
        return subprocess.call([FFMPEG_BIN] + list(args))
    except OSError:
        # Best effort, as before: callers check the output file themselves.
        return -1


def _format_offset(seconds):
    """Format a number of seconds as an ``HH:MM:SS`` ffmpeg time string."""
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return "%02d:%02d:%02d" % (hours, minutes, secs)


# Convert audio files that Baidu speech recognition cannot handle as-is
def ffmpeg_convert():
    """Convert every pending master recording to mono 8000 Hz WAV and cut it.

    For each converted file the sections are created with ``ffmpeg_cut``
    and the master row is updated with status=1 and the converted name.
    """
    section_path = db_config.SYS_PATH
    used_audio = get_master_audio('cut_status')
    if not used_audio:
        return
    for audio in used_audio:
        audio_path = section_path + '/' + audio[1]
        convert_name = "Uploads/Convert/convert_" + str(uuid.uuid1()) + ".wav"
        convert_path = section_path + "/" + convert_name
        # Waits for ffmpeg, so the existence check sees the finished file.
        _run_ffmpeg(["-i", audio_path, "-ar", str(SAMPLE_RATE), "-ac", "1",
                     "-f", "wav", convert_path])
        if os.path.exists(convert_path):
            ffmpeg_cut(convert_name, audio[3], audio[0])
            sql = "UPDATE ocenter_recognition SET status=1,convert_name='%s' where id=%d" % (convert_name, audio[0])
            rtysdb.do_exec_sql(sql)


# Cut a large audio file into small sections
def ffmpeg_cut(convert_name, sharps, master_id):
    """Split the converted file into *sharps* consecutive 30-second sections.

    convert_name is relative to ``db_config.SYS_PATH``. Every section is
    written under ``Uploads/Section/`` and inserted into ``ocenter_section``
    with rid=*master_id* and status=0. Nothing happens when *sharps* is
    empty or not positive.
    """
    section_path = db_config.SYS_PATH
    if not sharps or sharps <= 0:
        return
    cut_name = section_path + '/' + convert_name
    duration = _format_offset(SEGMENT_SECONDS)
    for i in range(sharps):
        # Offset computed arithmetically; no dependency on the local timezone.
        start_time = _format_offset(i * SEGMENT_SECONDS)
        db_store_name = "Uploads/Section/" + str(uuid.uuid1()) + '-' + str(i + 1) + ".wav"
        section_name = section_path + "/" + db_store_name
        _run_ffmpeg(["-i", cut_name, "-vn", "-acodec", "copy",
                     "-ss", start_time, "-t", duration, section_name])
        data = {
            'rid': master_id,
            'url': db_store_name,
            'create_time': int(time.time()),
            'status': 0,
        }
        rtysdb.insert_one('ocenter_section', data)


if __name__ == "__main__":
    ffmpeg_convert()
    audio = get_master_audio('finished_status')
    if audio:
        for ad in audio:
            go_recognize(ad[0])
以上就是本文的全部內容,希望對大家的學習有所幫助,也希望大家多多支持服務器之家。
原文鏈接:https://blog.csdn.net/septwolves2015/article/details/78554524