forked from X-PLUG/MobileAgent
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
阳渠
committed
Jan 31, 2024
1 parent
244a247
commit b0bbed5
Showing
81 changed files
with
9,718 additions
and
6 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import base64 | ||
import requests | ||
|
||
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode and the base64 bytes are decoded as
    UTF-8 so the result can be embedded in a JSON payload.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
||
|
||
def inference_chat(chat, API_TOKEN):
    """Send a conversation to the OpenAI chat-completions endpoint.

    Args:
        chat: Iterable of ``(role, content)`` pairs; each pair becomes one
            entry of the request's ``messages`` list.
        API_TOKEN: Token placed in the ``Authorization: Bearer`` header.

    Returns:
        The ``content`` field of the first choice's message.

    The request is retried indefinitely until a reply with the expected
    shape is received; each failure is reported on stdout.
    """
    api_url = 'https://api.openai.com/v1/chat/completions'
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_TOKEN}"
    }

    data = {
        "model": 'gpt-4-vision-preview',
        "messages": [{"role": role, "content": content} for role, content in chat],
        "max_tokens": 2048,
    }

    while True:
        # Reset on every attempt so the handler never prints a stale or
        # unbound response when the POST itself fails.
        response = None
        try:
            response = requests.post(api_url, headers=headers, json=data)
            return response.json()['choices'][0]['message']['content']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as error:
            # ValueError covers a non-JSON body; KeyError/IndexError/TypeError
            # cover a JSON body without the expected ``choices`` structure
            # (e.g. an API error object). KeyboardInterrupt is not caught.
            print("Network Error:")
            print(error)
            if response is not None:
                print(response.text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import copy | ||
from MobileAgent.api import encode_image | ||
|
||
|
||
def init_chat(instruction):
    """Build the opening two-turn chat history for ``instruction``.

    The history is a list of ``[role, content]`` pairs: a user turn that
    states the assistant's job and the instruction, followed by a canned
    assistant acknowledgement.
    """
    opening_prompt = (
        "You are a helpful phone operating assistant. You need to help me operate the phone to complete my instruction.\n"
        f"My instruction is: {instruction}"
    )
    return [
        ["user", [{"type": "text", "text": opening_prompt}]],
        ["assistant", [{"type": "text", "text": "Sure. How can I help you?"}]],
    ]
|
||
|
||
def add_response(role, prompt, chat_history, image=None):
    """Return a deep copy of ``chat_history`` with one extra turn appended.

    The new turn is ``[role, content]`` where ``content`` holds a text part
    with ``prompt`` and, when ``image`` is truthy, an ``image_url`` part
    carrying the file encoded by ``encode_image`` as a JPEG data URL.
    The input history is left untouched.
    """
    updated_history = copy.deepcopy(chat_history)

    content = [{"type": "text", "text": prompt}]
    if image:
        base64_image = encode_image(image)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
        })

    updated_history.append([role, content])
    return updated_history
|
||
|
||
def add_multiimage_response(role, prompt, chat_history, images):
    """Return a deep copy of ``chat_history`` with a text-plus-images turn.

    The appended turn is ``[role, content]``: one text part holding
    ``prompt`` followed by one ``image_url`` part per entry of ``images``,
    each encoded by ``encode_image`` as a JPEG data URL, in the given order.
    """
    updated_history = copy.deepcopy(chat_history)

    content = [{"type": "text", "text": prompt}]
    for image_path in images:
        encoded = encode_image(image_path)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        })

    updated_history.append([role, content])
    return updated_history
|
||
|
||
def print_status(chat_history):
    """Print a text rendering of ``chat_history`` between two star rulers.

    For every ``[role, content]`` entry the role is printed, then the text
    of the first content part followed by one ``<image>`` marker for each
    additional part.
    """
    ruler = "*" * 100
    print(ruler)
    for entry in chat_history:
        parts = entry[1]
        image_markers = "<image>" * (len(parts) - 1)
        print("role:", entry[0])
        print(parts[0]["text"] + image_markers)
    print(ruler)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import time | ||
import subprocess | ||
from PIL import Image | ||
|
||
|
||
def get_size(adb_path):
    """Return ``(width, height)`` parsed from ``adb shell wm size``.

    Only the last line of the command's stdout is used; its final
    space-separated token is expected to look like ``<width>x<height>``.
    """
    completed = subprocess.run(
        adb_path + " shell wm size",
        capture_output=True,
        text=True,
        shell=True,
    )
    last_line = completed.stdout.strip().split('\n')[-1]
    dimensions = last_line.split(' ')[-1]
    width, height = (int(part) for part in dimensions.split('x'))
    return width, height
|
||
|
||
def get_screenshot(adb_path):
    """Capture the device screen and store a half-size JPEG copy locally.

    Steps: remove any previous ``/sdcard/screenshot.png`` on the device,
    take a new screen capture there, pull it into ``./screenshot``, then
    save a copy scaled to 50% in each dimension as
    ``./screenshot/screenshot.jpg``. Short sleeps separate the steps.
    """
    import os

    # Without the directory, ``adb pull`` would create a *file* named
    # ``screenshot`` and the open below would fail.
    os.makedirs("./screenshot", exist_ok=True)

    command = adb_path + " shell rm /sdcard/screenshot.png"
    subprocess.run(command, capture_output=True, text=True, shell=True)
    time.sleep(0.5)
    command = adb_path + " shell screencap -p /sdcard/screenshot.png"
    subprocess.run(command, capture_output=True, text=True, shell=True)
    time.sleep(0.5)
    command = adb_path + " pull /sdcard/screenshot.png ./screenshot"
    subprocess.run(command, capture_output=True, text=True, shell=True)

    image_path = "./screenshot/screenshot.png"
    save_path = "./screenshot/screenshot.jpg"
    # Context manager releases the underlying file handle promptly.
    with Image.open(image_path) as image:
        original_width, original_height = image.size
        new_size = (int(original_width * 0.5), int(original_height * 0.5))
        # JPEG has no alpha channel, hence the RGB conversion before saving.
        image.resize(new_size).convert("RGB").save(save_path, "JPEG")
    time.sleep(1)
|
||
|
||
def tap(adb_path, x, y, px, py):
    """Send an ``input tap`` to the device, then wait one second.

    The tap position is ``int(x * px)``, ``int(y * py)``.
    NOTE(review): presumably ``x``/``y`` are fractions of the screen and
    ``px``/``py`` the screen size in pixels - confirm against the caller.
    """
    tap_x, tap_y = int(x * px), int(y * py)
    subprocess.run(
        adb_path + f" shell input tap {tap_x} {tap_y}",
        capture_output=True,
        text=True,
        shell=True,
    )
    time.sleep(1)
|
||
|
||
def type(adb_path, text):
    """Type ``text`` on the device one character at a time via adb.

    Newlines (real ones and the two-character sequence backslash-n) are
    sent as the Enter key event (66). Spaces use the ``%s`` escape of
    ``input text``. ASCII letters, digits and ``_`` are sent bare, a fixed
    set of punctuation is sent quoted, and every other character is sent
    through the ``ADB_INPUT_TEXT`` broadcast. Sleeps one second at the end.

    Previously newlines were rewritten to ``_`` before dispatch, which made
    every literal underscore in ``text`` press Enter instead of being typed.
    """
    # Normalise the escaped form so only one newline representation remains.
    text = text.replace("\\n", "\n")
    for char in text:
        if char == "\n":
            adb_args = " shell input keyevent 66"
        elif char == ' ':
            adb_args = " shell input text %s"
        elif char == '_' or 'a' <= char <= 'z' or 'A' <= char <= 'Z' or char.isdigit():
            adb_args = f" shell input text {char}"
        elif char in '-.,!?@\'°/:;()':
            adb_args = f" shell input text \"{char}\""
        else:
            # NOTE(review): presumably relies on an ADB keyboard IME on the
            # device that listens for this broadcast - confirm setup.
            adb_args = f" shell am broadcast -a ADB_INPUT_TEXT --es msg \"{char}\""
        subprocess.run(adb_path + adb_args, capture_output=True, text=True, shell=True)
    time.sleep(1)
|
||
|
||
def slide(adb_path, action, x, y):
    """Send a 500 ms vertical swipe chosen by ``action``, then wait a second.

    The swipe starts at ``(x/2, y/2)``. If ``action`` contains ``"down"``
    it ends at ``y/4``; otherwise, if it contains ``"up"``, it ends at
    ``3*y/4``. Any other ``action`` sends nothing.
    """
    if "down" in action:
        end_y_factor = (1, 4)
    elif "up" in action:
        end_y_factor = (3, 4)
    else:
        end_y_factor = None

    if end_y_factor is not None:
        numerator, denominator = end_y_factor
        start_x, start_y = int(x / 2), int(y / 2)
        end_y = int(y / 4) if numerator == 1 else int(3 * y / 4)
        subprocess.run(
            adb_path + f" shell input swipe {start_x} {start_y} {start_x} {end_y} 500",
            capture_output=True,
            text=True,
            shell=True,
        )
    time.sleep(1)
|
||
|
||
def back(adb_path):
    """Send key event 4 to the device through adb, then wait one second."""
    subprocess.run(
        adb_path + " shell input keyevent 4",
        capture_output=True,
        text=True,
        shell=True,
    )
    time.sleep(1)
|
||
|
||
def back_to_desktop(adb_path):
    """Start the HOME-category MAIN intent via adb, then wait one second."""
    home_intent = " shell am start -a android.intent.action.MAIN -c android.intent.category.HOME"
    subprocess.run(
        adb_path + home_intent,
        capture_output=True,
        text=True,
        shell=True,
    )
    time.sleep(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
import math | ||
import cv2 | ||
import numpy as np | ||
from PIL import Image, ImageDraw, ImageFont | ||
import clip | ||
import torch | ||
|
||
|
||
def crop_image(img, position):
    """Warp the quadrilateral ``position`` of ``img`` into an upright image.

    ``position`` must offer ``.tolist()`` yielding four ``[x, y]`` points.
    The points are ordered (two smallest-x points first, each pair ordered
    by y), the output size is taken from the distances between opposite
    edge midpoints, and a perspective transform maps the quadrilateral
    onto that rectangle.
    """
    def midpoint_gap(ax, ay, bx, by):
        return math.sqrt((ax - bx) ** 2 + (ay - by) ** 2)

    pts = position.tolist()

    # Exchange sort by x; kept as-is so ties are ordered exactly as before.
    for i in range(4):
        for j in range(i + 1, 4):
            if pts[i][0] > pts[j][0]:
                pts[i], pts[j] = pts[j], pts[i]

    # Within the low-x pair and the high-x pair, put the smaller y first.
    if pts[0][1] > pts[1][1]:
        pts[0], pts[1] = pts[1], pts[0]
    if pts[2][1] > pts[3][1]:
        pts[2], pts[3] = pts[3], pts[2]

    x1, y1 = pts[0][0], pts[0][1]  # low x, low y
    x4, y4 = pts[1][0], pts[1][1]  # low x, high y
    x2, y2 = pts[2][0], pts[2][1]  # high x, low y
    x3, y3 = pts[3][0], pts[3][1]  # high x, high y

    corners = np.array(
        [[x1, y1], [x2, y2], [x4, y4], [x3, y3]],
        np.float32,
    )

    img_width = midpoint_gap((x1 + x4) / 2, (y1 + y4) / 2, (x2 + x3) / 2, (y2 + y3) / 2)
    img_height = midpoint_gap((x1 + x2) / 2, (y1 + y2) / 2, (x4 + x3) / 2, (y4 + y3) / 2)

    corners_trans = np.array(
        [
            [0, 0],
            [img_width - 1, 0],
            [0, img_height - 1],
            [img_width - 1, img_height - 1],
        ],
        np.float32,
    )

    transform = cv2.getPerspectiveTransform(corners, corners_trans)
    return cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))
|
||
|
||
def calculate_size(box):
    """Return the area of ``box`` given as ``[x0, y0, x1, y1]``."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
|
||
|
||
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two ``[x0, y0, x1, y1]`` boxes.

    Returns ``0.0`` when the union area is not positive (for example when
    both boxes are degenerate), instead of raising ``ZeroDivisionError``.
    """
    xA = max(box1[0], box2[0])
    yA = max(box1[1], box2[1])
    xB = min(box1[2], box2[2])
    yB = min(box1[3], box2[3])

    # Clamp to zero so disjoint boxes contribute no intersection.
    interArea = max(0, xB - xA) * max(0, yB - yA)
    box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    unionArea = box1Area + box2Area - interArea

    if unionArea <= 0:
        return 0.0
    return interArea / unionArea
|
||
|
||
def crop(image, box, i, text_data=None):
    """Crop ``box`` out of the image file ``image`` and save it as ``./temp/{i}.jpg``.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``.
        box: Crop rectangle passed straight to ``Image.crop``.
        i: Index used for the output file name and, when ``text_data`` is
            given, drawn as a label.
        text_data: Optional ``[x0, y0, x1, y1]`` rectangle. When truthy it
            is outlined in red and labelled with ``i`` before cropping.
    """
    # Context manager releases the file handle even if drawing/saving fails.
    with Image.open(image) as source:
        if text_data:
            left, top, right, bottom = text_data[0], text_data[1], text_data[2], text_data[3]
            draw = ImageDraw.Draw(source)
            draw.rectangle(((left, top), (right, bottom)), outline="red", width=5)
            # A very thin rectangle would yield size 0, which truetype rejects.
            font_size = max(1, int((bottom - top) * 0.75))
            try:
                font = ImageFont.truetype("arial.ttf", font_size)
            except OSError:
                # arial.ttf is not installed on every system; still label
                # the crop rather than abort.
                font = ImageFont.load_default()
            draw.text((left + 5, top + 5), str(i), font=font, fill="red")

        source.crop(box).save(f"./temp/{i}.jpg")
|
||
|
||
def in_box(box, target):
    """Return True only if ``box`` lies strictly inside ``target``.

    Both are ``[x0, y0, x1, y1]``; a box touching an edge of ``target``
    does not count as inside.
    """
    # The first two coordinates must be strictly greater than target's.
    for idx in (0, 1):
        if not (box[idx] > target[idx]):
            return False
    # The last two coordinates must be strictly smaller than target's.
    for idx in (2, 3):
        if not (box[idx] < target[idx]):
            return False
    return True
|
||
|
||
def crop_for_clip(image, box, i, position):
    """Save the crop ``box`` as ``./temp/{i}.jpg`` if it lies in a screen region.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``.
        box: ``[x0, y0, x1, y1]`` rectangle to crop.
        i: Index used for the output file name.
        position: Region name: ``"left"``, ``"right"``, ``"top"``,
            ``"bottom"``, or a ``"top left"``-style corner. Any other value
            selects the whole image.

    Returns:
        True when ``box`` is strictly inside the region (per ``in_box``)
        and the crop was saved, otherwise False.
    """
    # Context manager releases the file handle on every return path.
    with Image.open(image) as source:
        w, h = source.size
        half_w, half_h = w / 2, h / 2
        regions = {
            "left": [0, 0, half_w, h],
            "right": [half_w, 0, w, h],
            "top": [0, 0, w, half_h],
            "bottom": [0, half_h, w, h],
            "top left": [0, 0, half_w, half_h],
            "top right": [half_w, 0, w, half_h],
            "bottom left": [0, half_h, half_w, h],
            "bottom right": [half_w, half_h, w, h],
        }
        bound = regions.get(position, [0, 0, w, h])

        if not in_box(box, bound):
            return False

        source.crop(box).save(f"./temp/{i}.jpg")
        return True
|
||
|
||
def clip_for_icon(clip_model, clip_preprocess, images, prompt):
    """Return the index of the image in ``images`` that best matches ``prompt``.

    Args:
        clip_model: Model exposing ``encode_image`` and ``encode_text``.
        clip_preprocess: Callable turning a PIL image into a tensor.
        images: Non-empty sequence of image paths.
        prompt: Text description tokenized with ``clip.tokenize``.

    Returns:
        Integer position in ``images`` with the highest score, where the
        score is a softmax over images of the scaled dot product of the
        normalized image and text features.

    NOTE(review): inputs are not moved to any device here, so the model is
    presumably expected to live on the default device - confirm with caller.
    """
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        image_features = []
        for image_file in images:
            # Close each file once its pixels have been turned into a tensor.
            with Image.open(image_file) as icon:
                image = clip_preprocess(icon).unsqueeze(0)
            image_features.append(clip_model.encode_image(image))
        image_features = torch.cat(image_features)

        text = clip.tokenize([prompt])
        text_features = clip_model.encode_text(text)

        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=0).squeeze(0)
        _, max_pos = torch.max(similarity, dim=0)

    return max_pos.item()
Oops, something went wrong.