Java 人工智能 - Apache Mahout - 推荐算法
数据集
BX-Users 包含用户信息
BX-Books 包含图书信息
BX-Book-Ratings 包含图书评分信息
加载数据
从文件加载数据
使用 FileDatamodel类从文件加载数据
文件中的数据默认以逗号进行分隔(也可以像后面的示例那样 通过构造函数传入其他分隔符 例如 ;)
每一行的顺序包含
userId,itemId[,preference[,timestamp]]
[]是可选的
preference 是可选的偏好值 省略时表示二元偏好 即对于某本书 用户要么表达了偏好 要么没有
以 # 号开头的行以及空行都会被忽略
数据行也可以包含其他字段 但这些字段会被忽略
Datamodel 类 可以接受如下类型
userID、itemID 是 long 类型
preference 是 double 类型
timestamp 是 long 类型
如果能提供这些格式的数据集 就可以简单使用如下代码加载数据
Datamodel model = new FileDatamodel(new File(path));
不适合加载大数据量 比如几千万行数据
需要加载大量数据时 使用带有 JDBC 支持的 Datamodel 配合数据库更加合适
实际的项目中 我们无法保证提供给我们的输入数据中 userID 与 itemID 一定是整型值
因此需要将字符串转换为整型


对 FileDatamodel 做扩展
package com.osrcd.mahout;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.model.file.FileDatamodel;
import java.io.File;
import java.io.IOException;
/**
 * File-backed data model that accepts arbitrary strings (for example ISBNs) as item IDs.
 *
 * <p>Each item string is converted to a {@code long} through an {@link ItemMemIDMigrator},
 * and the reverse mapping is remembered so that numeric IDs can later be turned back into
 * the original strings via {@link #getItemIDAsString(long)}.
 *
 * <p>NOTE(review): the Mahout class this extends is published as {@code FileDataModel};
 * confirm the spelling of the import and of the {@code extends} clause against the
 * Mahout version on the classpath.
 */
public class StringItemIdFileDatamodel extends FileDatamodel {

    /**
     * Converter between string item IDs and their long form.
     *
     * <p>Created lazily inside {@link #readItemIDFromString(String)} rather than in a field
     * initializer; presumably the superclass constructor can call that method before
     * subclass initializers have run -- TODO confirm.
     */
    public ItemMemIDMigrator memIdMigtr;

    /**
     * @param dataFile file containing the preference data
     * @param regex    delimiter pattern forwarded unchanged to the superclass constructor
     * @throws IOException propagated from the superclass constructor
     */
    public StringItemIdFileDatamodel(File dataFile, String regex) throws IOException {
        super(dataFile, regex);
    }

    /**
     * Converts a string item ID to its long form and records the reverse mapping the
     * first time that ID is seen.
     *
     * @param value item ID as it appears in the data file
     * @return the long ID produced by the migrator
     * @throws IllegalStateException if the migrator fails while looking up the mapping;
     *         previously the failure was only printed and then ignored, which left the
     *         reverse mapping silently incomplete
     */
    @Override
    protected long readItemIDFromString(String value) {
        if (memIdMigtr == null) {
            memIdMigtr = new ItemMemIDMigrator();
        }
        long retValue = memIdMigtr.toLongID(value);
        try {
            if (memIdMigtr.toStringID(retValue) == null) {
                // Reuse the ID computed above instead of hashing the string a second time.
                memIdMigtr.storeMApping(retValue, value);
            }
        } catch (TasteException e) {
            throw new IllegalStateException(
                    "Unable to record item ID mapping for '" + value + "'", e);
        }
        return retValue;
    }

    /**
     * Converts a long item ID back to the string it was created from.
     *
     * @param itemId long item ID
     * @return the original string, or {@code null} when no mapping is known (including
     *         the case where no item ID has been read yet)
     * @throws TasteException if the migrator lookup fails
     */
    String getItemIDAsString(long itemId) throws TasteException {
        if (memIdMigtr == null) {
            return null;
        }
        return memIdMigtr.toStringID(itemId);
    }
}
对 AbstractIDMigrator 类做扩展
package com.osrcd.mahout;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.model.AbstractIDMigrator;
/**
 * ID migrator that keeps the long-to-string mapping in an in-memory {@link FastByIDMap}.
 *
 * <p>The string-to-long direction is inherited from {@link AbstractIDMigrator}; this class
 * only adds storage for the reverse lookup.
 */
public class ItemMemIDMigrator extends AbstractIDMigrator {

    // Reverse lookup table: long ID -> original string ID (constructed with size argument 10000).
    private FastByIDMap<String> idToName = new FastByIDMap<String>(10000);

    /** Creates a migrator with an empty mapping table. */
    public ItemMemIDMigrator() {
    }

    /**
     * Records that {@code longID} corresponds to {@code stringID}.
     *
     * @param longID   numeric form of the ID
     * @param stringID original string form of the ID
     */
    public void storeMApping(long longID, String stringID) {
        this.idToName.put(longID, stringID);
    }

    /**
     * Computes the long form of {@code stringID} and stores the pair.
     *
     * @param stringID original string form of the ID
     */
    public void singleInit(String stringID) {
        long numericId = toLongID(stringID);
        storeMApping(numericId, stringID);
    }

    /**
     * Looks up the string stored for {@code longID}.
     *
     * @param longID numeric form of the ID
     * @return whatever the underlying map returns for that key
     * @throws TasteException declared by the overridden method; not thrown by this body
     */
    @Override
    public String toStringID(long longID) throws TasteException {
        String original = idToName.get(longID);
        return original;
    }
}

对其他有用的方法进行重写(override)
如果用户ID不是数字
readUserIDFromString(String value)
更改解析时间戳的方式
readTimestampFromString(String value)
开始加载数据集
package com.osrcd.mahout;
import org.apache.mahout.cf.taste.common.TasteException;
import java.io.File;
import java.io.IOException;
/**
 * Loads the ratings file from the classpath into a {@link StringItemIdFileDatamodel}
 * and prints how many items and users it contains.
 */
public class LoadDataFromFile {

    /**
     * @param args unused
     * @throws IOException    if the ratings resource is missing, its URL cannot be turned
     *                        into a file, or the model fails to read it
     * @throws TasteException if the model cannot report its item or user counts
     */
    public static void main(String[] args) throws IOException, TasteException {
        String resourceName = "BX-Book-Ratings.csv";
        java.net.URL resource = LoadDataFromFile.class.getClassLoader().getResource(resourceName);
        if (resource == null) {
            // getResource returns null for a missing resource; fail with a clear message
            // instead of a bare NullPointerException.
            throw new java.io.FileNotFoundException("Classpath resource not found: " + resourceName);
        }
        File ratingsFile;
        try {
            // URL.getPath() keeps percent-escapes (e.g. %20), so a directory containing spaces
            // or non-ASCII characters would not open; going through the URI decodes them.
            ratingsFile = new File(resource.toURI());
        } catch (java.net.URISyntaxException e) {
            throw new IOException("Invalid resource URL: " + resource, e);
        }
        StringItemIdFileDatamodel model = new StringItemIdFileDatamodel(ratingsFile, ";");
        System.out.println("Total items:" + model.getNumItems() + "\nTotal users:" + model.getNumUsers());
    }
}
输出的结果
Total items:340556
Total users:105283
从数据库中加载数据






内存数据库
package com.osrcd.mahout;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.model.GenericDatamodel;
import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
import org.apache.mahout.cf.taste.model.Datamodel;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.model.PreferenceArray;
import java.io.File;
import java.io.IOException;
/**
 * Demonstrates two ways of obtaining a data model: reading the ratings file from the
 * classpath, and building a small model entirely in memory from a hash table of
 * per-user preference arrays.
 */
public class LoadDataFromFile {

    /**
     * @param args unused
     * @throws IOException    if the ratings resource is missing, its URL cannot be turned
     *                        into a file, or the model fails to read it
     * @throws TasteException if the file-backed model cannot report its counts
     */
    public static void main(String[] args) throws IOException, TasteException {
        String resourceName = "BX-Book-Ratings.csv";
        java.net.URL resource = LoadDataFromFile.class.getClassLoader().getResource(resourceName);
        if (resource == null) {
            // Fail with a clear message instead of a bare NullPointerException.
            throw new java.io.FileNotFoundException("Classpath resource not found: " + resourceName);
        }
        File ratingsFile;
        try {
            // URL.getPath() keeps percent-escapes (e.g. %20); converting via the URI decodes them.
            ratingsFile = new File(resource.toURI());
        } catch (java.net.URISyntaxException e) {
            throw new IOException("Invalid resource URL: " + resource, e);
        }
        StringItemIdFileDatamodel model = new StringItemIdFileDatamodel(ratingsFile, ";");
        System.out.println("Total items:" + model.getNumItems() + "\nTotal users:" + model.getNumUsers());

        // In-memory store: hash table from user ID to that user's PreferenceArray.
        FastByIDMap<PreferenceArray> preferences = new FastByIDMap<>();

        // The user ID written into the array and the key used in the table must agree;
        // the earlier version wrote 0L into the array but stored it under key 1L.
        long userId = 1L;

        // Sized to the two ratings filled in below. The earlier size of 10 left eight slots
        // unset; presumably those would surface as extra preferences with default values --
        // confirm against GenericUserPreferenceArray.
        PreferenceArray prefsForUser1 = new GenericUserPreferenceArray(2);
        prefsForUser1.setUserID(0, userId);

        // First rating: item 101 with value 3.0.
        prefsForUser1.setItemID(0, 101L);
        prefsForUser1.setValue(0, 3.0f);

        // Second rating: item 102 with value 4.5.
        prefsForUser1.setItemID(1, 102L);
        prefsForUser1.setValue(1, 4.5F);

        // Register the array under its user ID.
        preferences.put(userId, prefsForUser1);

        // Build the data model from the hash table.
        Datamodel datamodel = new GenericDatamodel(preferences);
    }
}
协同过滤
通过用户对项目的偏好 用户项目的资料进行推荐


基于用户的过滤
加载数据模型
package com.osrcd.mahout;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood;
import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
import org.apache.mahout.cf.taste.model.Datamodel;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.cf.taste.recommender.Recommender;
import org.apache.mahout.cf.taste.recommender.UserBasedRecommender;
import org.apache.mahout.cf.taste.similarity.UserSimilarity;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
/**
 * User-based recommender demo: loads book titles keyed by ISBN, builds a user-based
 * recommender over the ratings file and prints one user's rated and recommended books.
 */
public class BookRecommender {

    /** ISBN -> title lookup filled by {@link #main(String[])}. */
    static HashMap<String, String> books;

    /**
     * @param args unused
     * @throws Exception if either data file cannot be located or read, or recommending fails
     */
    public static void main(String[] args) throws Exception {
        books = loadBooks(resourceFile("BX-Books.csv").getPath());
        userBased();
    }

    /**
     * Resolves a classpath resource to a local file.
     *
     * <p>Goes through {@code URL.toURI()} rather than {@code URL.getPath()} because the
     * latter keeps percent-escapes (e.g. {@code %20}) and so breaks on directories that
     * contain spaces or non-ASCII characters.
     *
     * @param name resource name relative to the classpath root
     * @return the file the resource URL points at
     * @throws IOException if the resource is missing or its URL is not a valid URI
     */
    private static File resourceFile(String name) throws IOException {
        java.net.URL resource = BookRecommender.class.getClassLoader().getResource(name);
        if (resource == null) {
            throw new java.io.FileNotFoundException("Classpath resource not found: " + name);
        }
        try {
            return new File(resource.toURI());
        } catch (java.net.URISyntaxException e) {
            throw new IOException("Invalid resource URL: " + resource, e);
        }
    }

    /**
     * Reads a semicolon-separated file and maps the first field of each line to the second,
     * after stripping every double-quote character from the line.
     *
     * <p>Lines with fewer than two fields are skipped instead of aborting the whole load.
     * The file is read with the platform default charset, as before.
     *
     * @param filename path of the file to read
     * @return map from first field to second field
     * @throws Exception if the file cannot be opened or read
     */
    public static HashMap<String,String> loadBooks(String filename) throws Exception{
        HashMap<String, String> map = new HashMap<>();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader in = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = in.readLine()) != null) {
                String[] parts = line.replace("\"", "").split(";");
                if (parts.length < 2) {
                    // Blank or malformed line: nothing usable to store.
                    continue;
                }
                map.put(parts[0], parts[1]);
            }
        }
        System.out.println("Total items: " + map.size());
        return map;
    }

    /**
     * Builds a user-based recommender and prints the items user 80683 has rated followed by
     * up to 10 recommended items.
     *
     * @throws IOException    if the ratings file cannot be located or read
     * @throws TasteException if the recommender or the model lookup fails
     */
    public static void userBased() throws IOException, TasteException {
        // Load the data model from the ratings file.
        StringItemIdFileDatamodel model = new StringItemIdFileDatamodel(
                resourceFile("BX-Book-Ratings.csv"), ";");
        // User-to-user similarity based on the Pearson correlation coefficient.
        UserSimilarity similarity = new PearsonCorrelationSimilarity(model);
        // Decides which users count as neighbours; 0.1 is passed as the threshold argument.
        UserNeighborhood neighborhood = new ThresholdUserNeighborhood(0.1, similarity, model);
        // Combine model, neighbourhood and similarity into a generic user-based recommender.
        UserBasedRecommender recommender = new GenericUserBasedRecommender(model, neighborhood, similarity);
        long userID = 80683;
        int noItems = 10;
        List<RecommendedItem> recommendations = recommender.recommend(userID, noItems);
        System.out.println("Rated items by user:");
        for (Preference preference : model.getPreferencesFromUser(userID)) {
            // Convert the long item ID back to its ISBN.
            String itemISBN = model.getItemIDAsString(preference.getItemID());
            System.out.println("Item:" + books.get(itemISBN) + " | Item id:" + itemISBN + " | Value:" + preference.getValue());
        }
        System.out.println("\nRecommended items:");
        for (RecommendedItem item : recommendations) {
            String itemISBN = model.getItemIDAsString(item.getItemID());
            System.out.println("Item:" + books.get(itemISBN) + " | Item id:" + itemISBN + " | Value:" + item.getValue());
        }
    }
}
基于项目的过滤
GenericItemSimilarity 预先计算项目的相似度
PearsonCorrelationSimilarity 实时计算相似度 大量数据时 计算速度慢得难以忍受
/**
 * Entry point for the item-based demo: fills the ISBN-to-title lookup from the books file,
 * then runs {@code itemBased()}.
 *
 * @param args unused
 * @throws Exception if the books file cannot be read or the recommender fails
 */
public static void main(String[] args) throws Exception {
    String booksPath = BookRecommender.class.getClassLoader().getResource("BX-Books.csv").getPath();
    books = loadBooks(booksPath);
    // The user-based demo (userBased()) is intentionally not run in this variant.
    itemBased();
}
/**
 * Builds an item-based recommender over the ratings file and prints up to 10 items most
 * similar to the book with ISBN 0395272238.
 *
 * @throws IOException    if the ratings file cannot be read
 * @throws TasteException if the recommender or the ID lookup fails
 */
public static void itemBased() throws IOException, TasteException {
    final String seedIsbn = "0395272238";
    final int howMany = 10;

    // Ratings model whose item IDs are ISBN strings.
    String ratingsPath = BookRecommender.class.getClassLoader().getResource("BX-Book-Ratings.csv").getPath();
    StringItemIdFileDatamodel ratings = new StringItemIdFileDatamodel(new File(ratingsPath), ";");

    // Item-to-item similarity using Pearson correlation.
    ItemSimilarity pearson = new PearsonCorrelationSimilarity(ratings);
    ItemBasedRecommender recommender = new GenericItemBasedRecommender(ratings, pearson);

    // Translate the seed ISBN to the long ID the model uses internally.
    long seedId = ratings.readItemIDFromString(seedIsbn);
    List<RecommendedItem> similarItems = recommender.mostSimilarItems(seedId, howMany);

    System.out.println("Recommendations for item:"+books.get(seedIsbn));
    System.out.println("\nMost similar items:");
    for (RecommendedItem similar : similarItems) {
        // Map each long ID back to its ISBN for display and title lookup.
        String isbn = ratings.getItemIDAsString(similar.getItemID());
        System.out.println("Item:" + books.get(isbn) + " | Item id:" + isbn + " | Value:" + similar.getValue());
    }
}
"C:\Program Files\Java\jdk1.8.0_121\bin\java.exe" -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:1975,suspend=y,server=n -javaagent:C:\Users\OSrcD\AppData\Local\JetBrains\Toolbox\Apps\IDEA-U\ch-0\212.4746.92\plugins\java\lib\rt\debugger-agent.jar -Dfile.encoding=GBK -classpath "C:\Program Files\Java\jdk1.8.0_121\jre\lib\charsets.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\deploy.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\access-bridge-64.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\cldrdata.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\dnsns.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\jaccess.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\jfxrt.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\localedata.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\nashorn.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\sunec.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\sunjce_provider.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\sunmscapi.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\sunpkcs11.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\ext\zipfs.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\javaws.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\jce.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\jfr.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\jfxswt.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\jsse.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\management-agent.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\plugin.jar;C:\Program Files\Java\jdk1.8.0_121\jre\lib\resources.jar;C:\Program 
Files\Java\jdk1.8.0_121\jre\lib\rt.jar;C:\Users\OSrcD\Downloads\java-for-linux\apache-mahout\target\classes;C:\Users\OSrcD\.m2\repository\org\apache\mahout\mahout-mr\0.10.0\mahout-mr-0.10.0.jar;C:\Users\OSrcD\.m2\repository\org\apache\mahout\mahout-math\0.10.0\mahout-math-0.10.0.jar;C:\Users\OSrcD\.m2\repository\com\tdunning\t-digest\3.1\t-digest-3.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\mahout\mahout-hdfs\0.10.0\mahout-hdfs-0.10.0.jar;C:\Users\OSrcD\.m2\repository\com\google\guava\guava\11.0.2\guava-11.0.2.jar;C:\Users\OSrcD\.m2\repository\com\google\code\findbugs\jsr305\1.3.9\jsr305-1.3.9.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-client\2.4.1\hadoop-client-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-common\2.4.1\hadoop-common-2.4.1.jar;C:\Users\OSrcD\.m2\repository\commons-httpclient\commons-httpclient\3.1\commons-httpclient-3.1.jar;C:\Users\OSrcD\.m2\repository\commons-codec\commons-codec\1.4\commons-codec-1.4.jar;C:\Users\OSrcD\.m2\repository\commons-io\commons-io\2.4\commons-io-2.4.jar;C:\Users\OSrcD\.m2\repository\commons-net\commons-net\3.1\commons-net-3.1.jar;C:\Users\OSrcD\.m2\repository\commons-collections\commons-collections\3.2.1\commons-collections-3.2.1.jar;C:\Users\OSrcD\.m2\repository\commons-logging\commons-logging\1.1.3\commons-logging-1.1.3.jar;C:\Users\OSrcD\.m2\repository\log4j\log4j\1.2.17\log4j-1.2.17.jar;C:\Users\OSrcD\.m2\repository\commons-lang\commons-lang\2.6\commons-lang-2.6.jar;C:\Users\OSrcD\.m2\repository\commons-configuration\commons-configuration\1.6\commons-configuration-1.6.jar;C:\Users\OSrcD\.m2\repository\commons-digester\commons-digester\1.8\commons-digester-1.8.jar;C:\Users\OSrcD\.m2\repository\commons-beanutils\commons-beanutils\1.7.0\commons-beanutils-1.7.0.jar;C:\Users\OSrcD\.m2\repository\commons-beanutils\commons-beanutils-core\1.8.0\commons-beanutils-core-1.8.0.jar;C:\Users\OSrcD\.m2\repository\org\slf4j\slf4j-log4j12\1.7.5\slf4j-log4j12-1.7.5.jar;C:\Users\OSrcD\.m2\repository
\org\apache\avro\avro\1.7.4\avro-1.7.4.jar;C:\Users\OSrcD\.m2\repository\com\thoughtworks\paranamer\paranamer\2.3\paranamer-2.3.jar;C:\Users\OSrcD\.m2\repository\org\xerial\snAppy\snAppy-java\1.0.4.1\snAppy-java-1.0.4.1.jar;C:\Users\OSrcD\.m2\repository\com\google\protobuf\protobuf-java\2.5.0\protobuf-java-2.5.0.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-auth\2.4.1\hadoop-auth-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\httpcomponents\httpclient\4.2.5\httpclient-4.2.5.jar;C:\Users\OSrcD\.m2\repository\org\apache\httpcomponents\httpcore\4.2.4\httpcore-4.2.4.jar;C:\Users\OSrcD\.m2\repository\org\apache\zookeeper\zookeeper\3.4.5\zookeeper-3.4.5.jar;C:\Users\OSrcD\.m2\repository\org\apache\commons\commons-compress\1.4.1\commons-compress-1.4.1.jar;C:\Users\OSrcD\.m2\repository\org\tukaani\xz\1.0\xz-1.0.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-hdfs\2.4.1\hadoop-hdfs-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-mapreduce-client-App\2.4.1\hadoop-mapreduce-client-App-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-mapreduce-client-common\2.4.1\hadoop-mapreduce-client-common-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-yarn-client\2.4.1\hadoop-yarn-client-2.4.1.jar;C:\Users\OSrcD\.m2\repository\com\sun\jersey\jersey-client\1.9\jersey-client-1.9.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-yarn-server-common\2.4.1\hadoop-yarn-server-common-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-mapreduce-client-shuffle\2.4.1\hadoop-mapreduce-client-shuffle-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-yarn-api\2.4.1\hadoop-yarn-api-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-mapreduce-client-core\2.4.1\hadoop-mapreduce-client-core-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-yarn-common\2.4.1\hadoop-yarn-common-2.4.1.jar;C:\Users\OSrcD\.m2\repository\javax\xml\bind\jaxb-api\2.2.2\jaxb-api-2.2.2.jar;
C:\Users\OSrcD\.m2\repository\javax\xml\stream\stax-api\1.0-2\stax-api-1.0-2.jar;C:\Users\OSrcD\.m2\repository\javax\activation\activation\1.1\activation-1.1.jar;C:\Users\OSrcD\.m2\repository\javax\servlet\servlet-api\2.5\servlet-api-2.5.jar;C:\Users\OSrcD\.m2\repository\com\sun\jersey\jersey-core\1.9\jersey-core-1.9.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-mapreduce-client-jobclient\2.4.1\hadoop-mapreduce-client-jobclient-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\hadoop\hadoop-annotations\2.4.1\hadoop-annotations-2.4.1.jar;C:\Users\OSrcD\.m2\repository\org\codehaus\jackson\jackson-core-asl\1.9.12\jackson-core-asl-1.9.12.jar;C:\Users\OSrcD\.m2\repository\org\codehaus\jackson\jackson-mApper-asl\1.9.12\jackson-mApper-asl-1.9.12.jar;C:\Users\OSrcD\.m2\repository\org\slf4j\slf4j-api\1.7.10\slf4j-api-1.7.10.jar;C:\Users\OSrcD\.m2\repository\org\apache\commons\commons-lang3\3.1\commons-lang3-3.1.jar;C:\Users\OSrcD\.m2\repository\commons-cli\commons-cli\1.2\commons-cli-1.2.jar;C:\Users\OSrcD\.m2\repository\com\thoughtworks\xstream\xstream\1.4.4\xstream-1.4.4.jar;C:\Users\OSrcD\.m2\repository\xmlpull\xmlpull\1.1.3.1\xmlpull-1.1.3.1.jar;C:\Users\OSrcD\.m2\repository\xpp3\xpp3_min\1.1.4c\xpp3_min-1.1.4c.jar;C:\Users\OSrcD\.m2\repository\org\apache\lucene\lucene-core\4.6.1\lucene-core-4.6.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\lucene\lucene-analyzers-common\4.6.1\lucene-analyzers-common-4.6.1.jar;C:\Users\OSrcD\.m2\repository\org\apache\mahout\commons\commons-cli\2.0-mahout\commons-cli-2.0-mahout.jar;C:\Users\OSrcD\.m2\repository\org\apache\commons\commons-math3\3.2\commons-math3-3.2.jar;C:\Users\OSrcD\.m2\repository\org\apache\solr\solr-commons-csv\3.5.0\solr-commons-csv-3.5.0.jar;C:\Users\OSrcD\AppData\Local\JetBrains\Toolbox\Apps\IDEA-U\ch-0\212.4746.92\lib\idea_rt.jar" com.osrcd.mahout.BookRecommender
Connected to the target VM, address: '127.0.0.1:1975', transport: 'socket'
Total items: 271379
log4j:WARN No Appenders could be found for logger (org.apache.mahout.cf.taste.impl.model.file.FileDatamodel).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
Recommendations for item:The Fellowship of the Ring (Lord of the Rings (Paperback))
Most similar items:
Item:The Hundred Secret Senses | Item id:080411109X | Value:1.0
Item:The HOLLAND SUGGESTIONS : A Novel | Item id:0671003534 | Value:1.0
Item:Girl in Hyacinth Blue | Item id:014029628X | Value:1.0
Item:The Education of Little Tree (A Zia Book) | Item id:0826308791 | Value:1.0
Item:A Fever in the Heart : Ann Rule's Crime Files, Volume III | Item id:0671793551 | Value:1.0
Item:One Hundred Years of Solitude | Item id:0060929790 | Value:1.0
Item:Hellsing, Vol. 1 | Item id:159307056X | Value:1.0
Item:The Phantom Tollbooth | Item id:0394820371 | Value:1.0
Item:Snow Crash (Bantam Spectra Book) | Item id:0553380958 | Value:1.0
Item:The Samurai's Wife (A Sano Ichiro Mystery) | Item id:0312974485 | Value:1.0
Disconnected from the target VM, address: '127.0.0.1:1975', transport: 'socket'
Process finished with exit code 0